In [1]:
import re, pickle, os, string
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
import numpy as np 
import pandas as pd 
import nltk
import string
import spacy
from nltk.corpus import stopwords
from spacy.util import compile_prefix_regex, compile_infix_regex, compile_suffix_regex
from spacy.tokenizer import Tokenizer
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style('whitegrid')
from gensim.models.doc2vec import Doc2Vec
from tqdm import tqdm
tqdm.pandas(desc="progress-bar")

In [2]:
def load_pickle(filepath):
    documents_f = open(filepath, 'rb')
    file = pickle.load(documents_f)
    documents_f.close()
    
    return file

def save_pickle(data, filepath):
    save_documents = open(filepath, 'wb')
    pickle.dump(data, save_documents)
    save_documents.close()

In [3]:
train_data = load_pickle("train_data.p")

In [4]:
from sklearn.model_selection import train_test_split
features_train, features_test, labels_train, labels_test = train_test_split(train_data['Tweet'], train_data['Author'], test_size=0.1, random_state=10)

In [5]:
train, test = train_test_split(train_data, test_size=0.3, random_state=42)

In [6]:
from gensim.models.doc2vec import TaggedDocument
def split(text):
    return text.split()

train_tagged = train.apply(lambda r: TaggedDocument(words=split(r['transform']), tags=[str(r.Author)]), axis=1)
test_tagged = test.apply(lambda r: TaggedDocument(words=split(r['transform']), tags=[str(r.Author)]), axis=1)

In [7]:
import multiprocessing
cores = multiprocessing.cpu_count()
model_dbow = Doc2Vec(dm=0, vector_size=300, negative=5, hs=0, min_count=2, sample = 0, workers=cores)
model_dbow.build_vocab([x for x in train_tagged.values])

In [8]:
from sklearn import utils
for epoch in range(30):
    model_dbow.train(utils.shuffle([x for x in tqdm(train_tagged.values)]), total_examples=len(train_tagged.values), epochs=1)
    model_dbow.alpha -= 0.002
    model_dbow.min_alpha = model_dbow.alpha

100%|██████████| 229736/229736 [00:00<00:00, 3720539.88it/s]
100%|██████████| 229736/229736 [00:00<00:00, 3372412.72it/s]
100%|██████████| 229736/229736 [00:00<00:00, 3069780.54it/s]
100%|██████████| 229736/229736 [00:00<00:00, 3092827.00it/s]
100%|██████████| 229736/229736 [00:00<00:00, 2650000.62it/s]
100%|██████████| 229736/229736 [00:00<00:00, 3270628.05it/s]
100%|██████████| 229736/229736 [00:00<00:00, 2984632.47it/s]
100%|██████████| 229736/229736 [00:00<00:00, 3431048.86it/s]
100%|██████████| 229736/229736 [00:00<00:00, 3580387.86it/s]
100%|██████████| 229736/229736 [00:00<00:00, 3477319.51it/s]
100%|██████████| 229736/229736 [00:00<00:00, 3420380.04it/s]
100%|██████████| 229736/229736 [00:00<00:00, 3526751.69it/s]
100%|██████████| 229736/229736 [00:00<00:00, 3401928.44it/s]
100%|██████████| 229736/229736 [00:00<00:00, 3466835.37it/s]
100%|██████████| 229736/229736 [00:00<00:00, 2834641.81it/s]
100%|██████████| 229736/229736 [00:00<00:00, 3316911.26it/s]
100%|██████████| 229736/

In [9]:
def vec_for_learning(model, tagged_docs):
    sents = tagged_docs.values
    targets, regressors = zip(*[(doc.tags[0], model.infer_vector(doc.words, steps=20)) for doc in sents])
    return targets, regressors

In [10]:
%%time
from sklearn.linear_model import LogisticRegression
y_train, X_train = vec_for_learning(model_dbow, train_tagged)
y_test, X_test = vec_for_learning(model_dbow, test_tagged)

CPU times: user 2min 5s, sys: 787 ms, total: 2min 6s
Wall time: 2min 7s


In [None]:
logreg = LogisticRegression(n_jobs=-1, C=1e5, verbose=10)
logreg.fit(X_train, y_train)
y_pred = logreg.predict(X_test)
from sklearn.metrics import accuracy_score, f1_score
print('Testing accuracy %s' % accuracy_score(y_test, y_pred))
print('Testing F1 score: {}'.format(f1_score(y_test, y_pred, average='weighted')))

  " = {}.".format(effective_n_jobs(self.n_jobs)))


[LibLinear]

In [None]:
save_pickle(logreg, os.path.join('1st model doc2vec.p'))

## Different model

In [None]:
model_dmm = Doc2Vec(dm=1, dm_mean=1, vector_size=300, window=10, negative=5, min_count=1, workers=5, alpha=0.065, min_alpha=0.065)
model_dmm.build_vocab([x for x in tqdm(train_tagged.values)])

In [None]:
%%time
for epoch in range(30):
    model_dmm.train(utils.shuffle([x for x in tqdm(train_tagged.values)]), total_examples=len(train_tagged.values), epochs=1)
    model_dmm.alpha -= 0.002
    model_dmm.min_alpha = model_dmm.alpha


In [None]:
%%time
y_train, X_train = vec_for_learning(model_dmm, train_tagged)
y_test, X_test = vec_for_learning(model_dmm, test_tagged)
logreg.fit(X_train, y_train)
y_pred = logreg.predict(X_test)
print('Testing accuracy %s' % accuracy_score(y_test, y_pred))
print('Testing F1 score: {}'.format(f1_score(y_test, y_pred, average='weighted')))

In [None]:
save_pickle(logreg, os.path.join('2nd model doc2vec.p'))

## Combine two models

In [None]:
model_dbow.delete_temporary_training_data(keep_doctags_vectors=True, keep_inference=True)
model_dmm.delete_temporary_training_data(keep_doctags_vectors=True, keep_inference=True)

In [None]:
from gensim.test.test_doc2vec import ConcatenatedDoc2Vec
new_model = ConcatenatedDoc2Vec([model_dbow, model_dmm])

In [None]:
def get_vectors(model, tagged_docs):
    sents = tagged_docs.values
    targets, regressors = zip(*[(doc.tags[0], model.infer_vector(doc.words, steps=20)) for doc in sents])
    return targets, regressors

In [None]:
y_train, X_train = get_vectors(new_model, train_tagged)
y_test, X_test = get_vectors(new_model, test_tagged)
logreg.fit(X_train, y_train)
y_pred = logreg.predict(X_test)
print('Testing accuracy %s' % accuracy_score(y_test, y_pred))
print('Testing F1 score: {}'.format(f1_score(y_test, y_pred, average='weighted')))

In [None]:
save_pickle(logreg, os.path.join('3rd model doc2vec.p'))