In [None]:
# Imports
import os
os.environ['KERAS_BACKEND'] = 'tensorflow'
import multiprocessing
import csv
import gensim
import nltk
import pandas as pd
import numpy as np
import re
import tensorflow as tf
import keras
from keras.models import Sequential
from keras.layers import Dense
from keras.models import load_model
from keras.callbacks import ModelCheckpoint, EarlyStopping
import optuna
from bs4 import BeautifulSoup
from nltk.tokenize import WordPunctTokenizer
from nltk.corpus import stopwords
from gensim.models import Doc2Vec
from gensim.models.doc2vec import TaggedDocument
from sklearn import utils
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.metrics import accuracy_score
from sklearn.svm import SVC
from tqdm import tqdm

In [None]:
# Shared notebook state: everything defined in this cell is read by name from later cells.
# Register tqdm with pandas, labelling its progress bar "progress-bar".
tqdm.pandas(desc="progress-bar")
# Column names used for both the raw and the cleaned tweet frames.
cats = ['tag', 'tweet']
# Raw tweets: only columns 0 and 5 of the header-less CSV are kept, named 'tag' and 'tweet'.
df = pd.read_csv("tweets.csv", header=None, usecols=[0, 5], names=cats, encoding='latin-1')
# Previously cleaned tweets, read from disk with the same column names.
# NOTE(review): clean_all_tweets() writes 'cleaned_tweets_wsw_v2.csv', not this file — confirm which is intended.
clean_df = pd.read_csv("cleaned_tweets.csv", encoding='latin-1', names=cats)
# Recode the raw labels 0 -> 0 and 4 -> 1; any other raw value becomes NaN.
df['tag'] = df['tag'].map({0: 0, 4: 1})
# Tokenizer used by clean_tweet().
wpt = WordPunctTokenizer()
# Filled by make_tagged_docs() and consumed by the Doc2Vec training functions.
train_corpus = list()
# Worker count handed to Doc2Vec.
cores = multiprocessing.cpu_count()

In [None]:
model_name_dbow = 'doc2vecmodel_v2.model'
model_name_dmm = 'doc2vecmodel_v2_dmm.model'

In [None]:
# Contractions are expanded before punctuation is stripped so that the
# negation survives cleaning ("isn't" would otherwise become "isn").
_NEGATION_EXPANSIONS = {
    "isn't": "is not", "aren't": "are not", "wasn't": "was not", "weren't": "were not",
    "haven't": "have not", "hasn't": "has not", "hadn't": "had not", "won't": "will not",
    "wouldn't": "would not", "don't": "do not", "doesn't": "does not", "didn't": "did not",
    "can't": "can not", "couldn't": "could not", "shouldn't": "should not", "mightn't": "might not",
    "mustn't": "must not",
}

# Compiled once instead of on every call; clean_tweet runs once per tweet.
_NEGATION_PATTERN = re.compile(r'\b(' + '|'.join(_NEGATION_EXPANSIONS) + r')\b')
_MENTION_OR_LINK_PATTERN = re.compile(r'@[A-Za-z0-9_]+|https?://[^ ]+')
# The dot is escaped so only real "www." prefixes match; the unescaped form
# also swallowed ordinary words such as "awwwww".
_WWW_PATTERN = re.compile(r'www\.[^ ]+')
_NON_LETTER_PATTERN = re.compile(r'[^a-zA-Z]')


def clean_tweet(tweet):
    """Normalise one raw tweet into a lower-case, letters-only string.

    Steps, in order: strip HTML markup, remove @mentions and links
    (``http(s)://...`` and ``www. ...``), lower-case, expand negative
    contractions, replace every non-letter with a space, and drop tokens of
    length 1.

    Parameters
    ----------
    tweet : str
        Raw tweet text, possibly containing HTML entities or tags.

    Returns
    -------
    str
        The remaining tokens joined by single spaces (may be empty).
    """
    # BeautifulSoup decodes entities such as "&amp;" and removes tags.
    cleaned = BeautifulSoup(tweet, 'lxml').get_text()

    # The former ``.decode("utf-8-sig")`` step was dropped: get_text() returns
    # str, which has no decode(), so the bare ``except`` always discarded it.
    # BOM / U+FFFD characters are non-letters and become spaces below anyway.
    cleaned = _MENTION_OR_LINK_PATTERN.sub('', cleaned)
    cleaned = _WWW_PATTERN.sub('', cleaned)
    cleaned = cleaned.lower()
    cleaned = _NEGATION_PATTERN.sub(lambda match: _NEGATION_EXPANSIONS[match.group()], cleaned)
    cleaned = _NON_LETTER_PATTERN.sub(' ', cleaned)

    # Single-character tokens are discarded.
    words = [token for token in wpt.tokenize(cleaned) if len(token) > 1]

    return " ".join(words).strip()

In [None]:
def clean_all_tweets():
    """Clean every tweet in the global ``df`` and write the result to CSV.

    Each ``df.tweet`` value is passed through ``clean_tweet``; the cleaned
    text is paired with ``df.tag`` and written, without a header row and with
    every field quoted, to 'cleaned_tweets_wsw_v2.csv'. A progress line is
    printed every 100000 tweets.
    """
    print("Cleaning tweets...\n")
    total = len(df)
    cleaned_texts = []
    for position in range(total):
        processed = position + 1
        if processed % 100000 == 0:
            print("Tweets %d of %d have been cleaned" % (processed, total))
        # Label-based lookup: relies on df having the default 0..n-1 index.
        cleaned_texts.append(clean_tweet(df.tweet[position]))

    output = pd.DataFrame(cleaned_texts, columns=['tweet'])
    output['tag'] = df.tag
    # Write 'tag' first so the file has the same column order as the raw data.
    output = output[['tag', 'tweet']]
    output.to_csv(
        'cleaned_tweets_wsw_v2.csv',
        encoding='latin-1',
        index=False,
        quoting=csv.QUOTE_ALL,
        header=False,
    )

In [None]:
def load_tweets(tweets):
    """Drop incomplete rows from ``tweets`` in place and renumber its index.

    Previously the ``tweets`` argument was ignored and the global
    ``clean_df`` was modified instead; the function now operates on the frame
    it is given (the notebook calls it with ``clean_df``, so that call behaves
    as before).

    Parameters
    ----------
    tweets : pandas.DataFrame
        Frame to clean. Modified in place: rows containing any missing value
        are removed and the index is reset to 0..n-1.

    Returns
    -------
    None
    """
    tweets.dropna(inplace=True)
    # A contiguous index is required later: make_tagged_docs looks rows up by
    # the integers 0..n-1.
    tweets.reset_index(drop=True, inplace=True)

In [None]:
def create_tweet_samples(tweets, size, random_state=None):
    """Write a class-balanced random sample of ``tweets`` to a CSV file.

    ``size // 2`` rows are drawn from the rows with ``tag == 0`` and the same
    number from the rows with ``tag == 1``; the tag-0 rows come first in the
    output. The file is named ``'tweets_sample_<size>.csv'`` and is written
    without a header, with every field quoted. For an odd ``size`` the file
    therefore holds ``size - 1`` rows.

    Parameters
    ----------
    tweets : pandas.DataFrame
        Frame with a ``tag`` column holding 0/1 labels.
    size : int
        Requested total number of rows.
    random_state : int or None, optional
        Forwarded to ``DataFrame.sample`` so a sample can be reproduced.
        ``None`` (the default) keeps the previous unseeded behaviour.

    Raises
    ------
    ValueError
        Raised by pandas if a class has fewer than ``size // 2`` rows.
    """
    set_size = size // 2
    print(set_size)
    negatives = tweets[tweets.tag == 0].sample(n=set_size, random_state=random_state)
    positives = tweets[tweets.tag == 1].sample(n=set_size, random_state=random_state)

    # DataFrame.append was removed in pandas 2.0; concat with ignore_index
    # also replaces the separate reset_index call.
    all_set = pd.concat([negatives, positives], ignore_index=True)

    sample_title = 'tweets_sample_' + str(size) + '.csv'

    all_set.to_csv(sample_title, encoding='latin-1', index=False, quoting=csv.QUOTE_ALL, header=False)

In [None]:
def tokenize_tweet(tweet):
    """Split ``tweet`` into lower-cased word tokens.

    The text is split into sentences with ``nltk.sent_tokenize`` and each
    sentence into words with ``nltk.word_tokenize``; zero-length tokens are
    skipped and every remaining token is lower-cased.

    Returns a flat list of tokens in reading order.
    """
    return [
        word.lower()
        for sentence in nltk.sent_tokenize(tweet)
        for word in nltk.word_tokenize(sentence)
        if len(word) >= 1
    ]

In [None]:
def make_tagged_docs(tweets):
    """Rebuild the global ``train_corpus`` from ``tweets``.

    The corpus is emptied, then one ``TaggedDocument`` is appended per row:
    its words are ``tokenize_tweet(tweets.tweet[i])`` and its single tag is
    the integer ``i``. Earlier versions read the global ``clean_df`` and
    reported ``len(df)`` as the total regardless of the argument; both now
    come from ``tweets``.

    Parameters
    ----------
    tweets : pandas.DataFrame
        Frame with a ``tweet`` column. Rows are looked up by the labels
        0..len(tweets)-1, so the index must be the default contiguous one
        (``load_tweets`` resets it); otherwise a ``KeyError`` is raised.
    """
    train_corpus.clear()
    total = len(tweets)
    for i in range(total):
        if (i + 1) % 10000 == 0:
            print("Tweets %d of %d have been tagged" % (i + 1, total))
        # The tag i is the same integer model_to_vec / model_combine later
        # use to look the document vector up.
        tagged_doc = TaggedDocument(tokenize_tweet(tweets.tweet[i]), [i])
        train_corpus.append(tagged_doc)
    print('DOCS ALL TAGGED')
    return

In [None]:
def model_to_vec(docvecmodel, tweets, dim):
    """Collect the document vectors for ``tweets`` into one matrix.

    Row ``k`` of the result is ``docvecmodel.docvecs[label]`` for the ``k``-th
    label of ``tweets.index``.

    Returns a float array of shape ``(len(tweets), dim)``.
    """
    matrix = np.zeros((len(tweets), dim))
    for row, doc_tag in enumerate(tweets.index):
        matrix[row] = docvecmodel.docvecs[doc_tag]
    return matrix

In [None]:
def model_combine(model1, model2, tweets, dim):
    """Concatenate the document vectors of two models for every tweet.

    Row ``k`` of the result is the flattened ``model1.docvecs[label]``
    followed by the flattened ``model2.docvecs[label]`` for the ``k``-th label
    of ``tweets.index``; ``dim`` is the combined width.

    Returns a float array of shape ``(len(tweets), dim)``.
    """
    combined = np.zeros((len(tweets), dim))
    for row, doc_tag in enumerate(tweets.index):
        first_part = np.ravel(model1.docvecs[doc_tag])
        second_part = np.ravel(model2.docvecs[doc_tag])
        combined[row] = np.concatenate((first_part, second_part))
    return combined

In [None]:
def average_vectors(x_vectors, y_values, array_size):
    """Return the mean vector of the positive and of the negative samples.

    The per-row Python loop was replaced by boolean-mask selection, which
    gives the same means without iterating over every sample and no longer
    requires ``y_values`` to expose ``.values``.

    Parameters
    ----------
    x_vectors : array-like
        One feature vector per sample, ``array_size`` values each.
    y_values : array-like
        Labels aligned positionally with ``x_vectors``; 1 marks a positive
        sample and 0 a negative one. A pandas Series, list or ndarray works.
    array_size : int
        Width of each feature vector.

    Returns
    -------
    (pos_avg, neg_avg) : tuple of numpy.ndarray
        Each of shape ``(array_size,)``. A class with no samples yields NaNs
        (NumPy emits a RuntimeWarning), as before.

    Raises
    ------
    ValueError
        If the data cannot be viewed as rows of ``array_size`` values, or the
        number of rows differs from the number of labels.
    """
    vectors = np.asarray(x_vectors, dtype=float).reshape(-1, array_size)
    labels = np.asarray(y_values)
    if len(vectors) != len(labels):
        raise ValueError(
            "x_vectors has %d rows but y_values has %d labels" % (len(vectors), len(labels))
        )

    pos_avg = np.average(vectors[labels == 1], axis=0)
    neg_avg = np.average(vectors[labels == 0], axis=0)

    return pos_avg, neg_avg

In [None]:
def cosine_sim(pos, neg, tweetvec, tags, array_size):
    """Accuracy of a nearest-centroid classifier based on cosine similarity.

    A vector is predicted positive (1) when it is strictly more similar to
    ``pos`` than to ``neg`` and negative (0) otherwise, ties included. All
    similarities are computed in one ``cosine_similarity`` call instead of
    two calls per row.

    Parameters
    ----------
    pos, neg : numpy.ndarray
        Class centroids holding ``array_size`` values each.
    tweetvec : array-like
        One vector per sample, ``array_size`` values each.
    tags : array-like
        True 0/1 labels aligned positionally with ``tweetvec``.
    array_size : int
        Width of every vector.

    Returns
    -------
    float
        Fraction of samples whose predicted label equals the true label.

    Raises
    ------
    ValueError
        If ``tags`` is empty (this used to surface as ZeroDivisionError) or
        its length differs from the number of vectors.
    """
    labels = np.asarray(tags)
    if labels.size == 0:
        raise ValueError("cosine_sim needs at least one labelled vector")

    vectors = np.asarray(tweetvec).reshape(-1, array_size)
    if len(vectors) != len(labels):
        raise ValueError(
            "tweetvec has %d rows but tags has %d labels" % (len(vectors), len(labels))
        )

    centroids = np.vstack([np.reshape(pos, (1, array_size)), np.reshape(neg, (1, array_size))])
    # Column 0 holds the similarity to the positive centroid, column 1 to the negative one.
    similarities = cosine_similarity(vectors, centroids)
    predicted = (similarities[:, 0] > similarities[:, 1]).astype(int)

    return float(np.mean(predicted == labels))

In [None]:
def cosine_sim_analyse(tweet, pos, neg, array_size):
    """Classify one vector by which class centroid it is more similar to.

    Returns 1 when the cosine similarity between ``tweet`` and ``pos`` is
    strictly greater than the one between ``tweet`` and ``neg``; returns 0
    otherwise (ties included).
    """
    row_shape = (1, array_size)
    pos_row = pos.reshape(row_shape)
    neg_row = neg.reshape(row_shape)
    candidate = tweet.reshape(row_shape)

    similarity_to_pos = cosine_similarity(pos_row, candidate)[0][0]
    similarity_to_neg = cosine_similarity(neg_row, candidate)[0][0]

    return 1 if similarity_to_pos > similarity_to_neg else 0

In [None]:
def train_dbow_d2v(model_name, dim):
    """Train a DBOW Doc2Vec model on the global ``train_corpus`` and save it.

    The model is built with ``dm=0`` and ``vector_size=dim``. Training runs as
    30 separate one-epoch passes over a freshly shuffled corpus; after each
    pass the learning rate is lowered by 0.002 from its start value of 0.065
    and ``min_alpha`` is pinned to the same value. The result is saved to
    ``model_name``.
    """
    starting_rate = 0.065
    dbow = Doc2Vec(
        dm=0,
        vector_size=dim,
        negative=5,
        min_count=2,
        workers=cores,
        alpha=starting_rate,
        min_alpha=starting_rate,
    )
    dbow.build_vocab(list(tqdm(train_corpus)))
    print('DONE BUILDING DBOW VOCAB')

    for _ in range(30):
        shuffled_docs = utils.shuffle(list(tqdm(train_corpus)))
        dbow.train(shuffled_docs, total_examples=len(train_corpus), epochs=1)
        # Manual learning-rate decay between passes.
        dbow.alpha -= 0.002
        dbow.min_alpha = dbow.alpha

    print('TRAINED DBOW DOC2VEC MODEL SUCCESSFULLY')

    dbow.save(model_name)
    print('SAVED DBOW MODEL\n')

In [None]:
def train_dmm_d2v(model_name, dim):
    """Train a DM (mean) Doc2Vec model on the global ``train_corpus`` and save it.

    The model is built with ``dm=1``, ``dm_mean=1`` and ``vector_size=dim``.
    Training runs as 30 separate one-epoch passes over a freshly shuffled
    corpus; after each pass the learning rate is lowered by 0.002 from its
    start value of 0.065 and ``min_alpha`` is pinned to the same value. The
    result is saved to ``model_name``.
    """
    starting_rate = 0.065
    dmm = Doc2Vec(
        dm=1,
        dm_mean=1,
        vector_size=dim,
        negative=5,
        min_count=2,
        workers=cores,
        alpha=starting_rate,
        min_alpha=starting_rate,
    )

    dmm.build_vocab(list(tqdm(train_corpus)))
    print('DONE BUILDING DMM VOCAB')

    for _ in range(30):
        shuffled_docs = utils.shuffle(list(tqdm(train_corpus)))
        dmm.train(shuffled_docs, total_examples=len(train_corpus), epochs=1)
        # Manual learning-rate decay between passes.
        dmm.alpha -= 0.002
        dmm.min_alpha = dmm.alpha

    print('TRAINED DMM DOC2VEC MODEL SUCCESSFULLY')

    dmm.save(model_name)
    print('SAVED DMM MODEL\n')

In [None]:
def get_wordvectors(model, tweet, dim):
    """Average the word vectors of the whitespace-separated words in ``tweet``.

    Each word is looked up as ``model[word]``; words that raise ``KeyError``
    are skipped. The result has shape ``(1, dim)`` and is all zeros when no
    word was found.
    """
    running_total = np.zeros((1, dim))
    words_found = 0.0

    for word in tweet.split():
        try:
            embedding = model[word]
        except KeyError:
            # Word unknown to the model: leave it out of the average.
            continue
        running_total += embedding.reshape(1, dim)
        words_found += 1.0

    if words_found == 0:
        return running_total
    return running_total / words_found

In [None]:
def extract_wordvec(model, dim):
    """Build averaged word-vector matrices for the train/validation/test splits.

    Every tweet in the global ``x_train``, ``x_validation`` and ``x_test`` is
    converted with ``get_wordvectors`` and the resulting ``(1, dim)`` rows are
    stacked per split.

    Returns
    -------
    (train_vec, validation_vec, test_vec) : tuple of numpy.ndarray
        One matrix per split with ``dim`` columns. The third element used to
        be ``train_vec`` again by mistake; it is now the test matrix.
    """
    def _stack(tweets):
        # One (1, dim) row per tweet, concatenated along axis 0.
        return np.concatenate([get_wordvectors(model, text, dim) for text in tweets])

    train_vec = _stack(x_train)
    validation_vec = _stack(x_validation)
    test_vec = _stack(x_test)

    return train_vec, validation_vec, test_vec

In [None]:
def model_eval(model_1, dim, model_2=None):
    """Fit and score three classifiers on Doc2Vec document vectors.

    Document vectors for the global ``x_train`` / ``x_validation`` /
    ``x_test`` splits come from ``model_1`` alone or, when ``model_2`` is
    given, from both models concatenated. Logistic regression, the
    cosine-similarity centroid classifier and LDA are fitted on the training
    vectors and their validation accuracy is printed.

    Parameters
    ----------
    model_1 : Doc2Vec
        Primary model.
    dim : int
        Width of the vectors handed to the classifiers; for two models this
        is the combined width.
    model_2 : Doc2Vec, optional
        Second model whose vectors are appended to those of ``model_1``.

    Returns
    -------
    (lr, posavg, negavg, lda)
        The fitted logistic regression, the positive and negative class
        centroids, and the fitted LDA model.
    """
    # Identity test: ``!= None`` would go through the model's own comparison.
    if model_2 is not None:
        def vectorize(tweets):
            return model_combine(model_1, model_2, tweets, dim)
    else:
        def vectorize(tweets):
            return model_to_vec(model_1, tweets, dim)

    train_vec = vectorize(x_train)
    val_vec = vectorize(x_validation)
    # Built for parity with the other splits; not scored here.
    test_vec = vectorize(x_test)

    print('MODEL EVALUATION REPORT:')
    print('-----------------------------------')

    lr = LogisticRegression()
    lr.fit(train_vec, y_train)
    lr_acc_score = str(lr.score(val_vec, y_validation))
    print('Logistic Regression Score: ' + lr_acc_score)
    print('     ----------     ')

    posavg, negavg = average_vectors(train_vec, y_train, dim)
    cs_acc_score = str(cosine_sim(posavg, negavg, val_vec, y_validation, dim))
    print('Cosine Similarity Score: ' + cs_acc_score)
    print('     ----------     ')

    lda = LinearDiscriminantAnalysis(n_components=1)
    lda.fit(train_vec, y_train)
    lda_acc_score = str(lda.score(val_vec, y_validation))
    print('Linear Discriminant Analysis Score: ' + lda_acc_score)
    print('     ----------     ')

    return lr, posavg, negavg, lda

In [None]:
def get_sentiment(tweet, model_1, dim, lr, posavg, negavg, lda, model_2=None):
    """Print the sentiment predicted for one raw tweet by each classifier.

    The tweet is cleaned and tokenised, a document vector is inferred from
    ``model_1`` (with the vector inferred from ``model_2`` appended when that
    model is given), and the predictions of the logistic regression, the
    cosine-similarity centroid classifier and LDA are printed.

    Parameters
    ----------
    tweet : str
        Raw tweet text.
    model_1 : Doc2Vec
        Primary model.
    dim : int
        Width of the final vector; for two models this is the combined width.
    lr, posavg, negavg, lda
        The objects returned by ``model_eval`` for the same model(s).
    model_2 : Doc2Vec, optional
        Second model whose inferred vector is appended.

    Returns
    -------
    None
    """
    cleaned = clean_tweet(tweet)
    tokenized = tokenize_tweet(cleaned)
    array_size = dim

    vector = model_1.infer_vector(doc_words=tokenized, epochs=30, alpha=0.065)
    # Identity test: ``!= None`` would go through the model's own comparison.
    if model_2 is not None:
        vector2 = model_2.infer_vector(doc_words=tokenized, epochs=30, alpha=0.065)
        vector = np.append(vector, vector2)

    print('SENTIMENT ANALYSIS REPORT:')
    print('-----------------------------------')

    # Read the predicted label from the result array directly rather than
    # regex-parsing its printed form.
    lr_score = str(int(lr.predict([vector])[0]))
    print('Logistic Regression estimate: ' + lr_score)
    print('     ----------     ')

    cs_score = str(cosine_sim_analyse(vector, posavg, negavg, array_size))
    print('Cosine Similarity estimate: ' + cs_score)
    print('     ----------     ')

    lda_score = str(int(lda.predict([vector])[0]))
    print('Linear Discriminant Analysis estimate: ' + lda_score)
    print('     ----------     ')

    return

In [None]:
clean_all_tweets()

In [None]:
load_tweets(clean_df)
x = clean_df.tweet
y = clean_df.tag
x_train, x_remain, y_train, y_remain = train_test_split(x, y, test_size=.02, random_state=2000) # (98:2 TRAIN:REMAIN SPLIT) 
x_validation, x_test, y_validation, y_test = train_test_split(x_remain, y_remain, test_size=.5, random_state=2000) # (50:50 VAL:TEST SPLIT)

In [None]:
make_tagged_docs(clean_df)

In [None]:
# Train and save the two baseline models. The bracketed notes used to follow
# the calls as bare text, which made this cell a syntax error.
train_dbow_d2v(model_name_dbow, 100)  # [1.6 Million tweets, 100 Dimensions]
train_dmm_d2v(model_name_dmm, 100)  # [1.6 Million tweets, 100 Dimensions]

In [None]:
model_dbow = Doc2Vec.load(model_name_dbow) # This is my base-line DBOW model
print('LOADED DBOW MODEL [1.6 Million tweets, 100 Dimensions]') 

In [None]:
model_dmm = Doc2Vec.load(model_name_dmm) # This is my base-line DMM Model
print('LOADED DMM MODEL [1.6 Million tweets, 100 Dimensions]')

In [None]:
lr_dbow_100, pos_avg_dbow_100, neg_avg_dbow_100, lda_dbow_100 = model_eval(model_dbow, 100)
# LR Accuracy: 0.7413533834586467 for DBOW
# CS Accuracy: 0.7291979949874686 for DBOW
# LDA Accuracy: 0.7411654135338346 for DBOW

In [None]:
lr_dmm_100, pos_avg_dmm_100, neg_avg_dmm_100, lda_dmm_100 = model_eval(model_dmm, 100)
# LR Accuracy: 0.7288220551378446 for DMM
# CS Accuracy: 0.7229949874686716 for DMM
# LDA Accuracy: 0.7280075187969924 for DMM

In [None]:
lr_combo_200, pos_avg_combo_200, neg_avg_combo_200, lda_combo_200 = model_eval(model_dbow, 200, model_dmm)
# LR Accuracy: 0.755764411027569 for DBOW + DMM Combined
# CS Accuracy: 0.7382832080200501 for DBOW + DMM Combined
# LDA Accuracy: 0.7541353383458647 for DBOW + DMM Combined

In [None]:
tweet = 'what a beautiful day'
get_sentiment(tweet, model_dbow, 200, lr_combo_200, pos_avg_combo_200, neg_avg_combo_200, lda_combo_200, model_dmm)

## Varying the sizes of the dimensions for Doc2Vec Models

In [None]:
model_name_dbow = 'doc2vecmodel_dbow_200dim.model'
model_name_dmm = 'doc2vecmodel_dmm_200dim.model'
# train_dbow_d2v(model_name_dbow, 200)
# train_dmm_d2v(model_name_dmm, 200)

In [None]:
model_name_dbow = 'doc2vecmodel_dbow_75dim.model'
model_name_dmm = 'doc2vecmodel_dmm_75dim.model'
# train_dbow_d2v(model_name_dbow, 75)
# train_dmm_d2v(model_name_dmm, 75)

In [None]:
model_name_dbow = 'doc2vecmodel_dbow_50dim.model'
model_name_dmm = 'doc2vecmodel_dmm_50dim.model'
# train_dbow_d2v(model_name_dbow, 50)
# train_dmm_d2v(model_name_dmm, 50)

In [None]:
model_name_dbow = 'doc2vecmodel_dbow_25dim.model'
model_name_dmm = 'doc2vecmodel_dmm_25dim.model'
# train_dbow_d2v(model_name_dbow, 25)
# train_dmm_d2v(model_name_dmm, 25)

In [None]:
model_name_dbow = 'doc2vecmodel_dbow_10dim.model'
model_name_dmm = 'doc2vecmodel_dmm_10dim.model'
# train_dbow_d2v(model_name_dbow, 10)
# train_dmm_d2v(model_name_dmm, 10)

In [None]:
model_dbow = Doc2Vec.load('doc2vecmodel_dbow_200dim.model')
print('LOADED DBOW (200 DIM) MODEL\n')
model_dmm = Doc2Vec.load('doc2vecmodel_dmm_200dim.model')
print('LOADED DMM (200 DIM) MODEL\n')

In [None]:
lr_dbow_200, pos_avg_dbow_200, neg_avg_dbow_200, lda_dbow_200 = model_eval(model_dbow, 200)
# LR Accuracy: 0.7471804511278195 for DBOW
# CS Accuracy: 0.7302631578947368 for DBOW
# LDA Accuracy: 0.7478696741854637 for DBOW

In [None]:
lr_dmm_200, pos_avg_dmm_200, neg_avg_dmm_200, lda_dmm_200 = model_eval(model_dmm, 200)
# LR Accuracy: 0.7309523809523809 for DMM
# CS Accuracy: 0.7226817042606516 for DMM
# LDA Accuracy: 0.7293859649122807 for DMM

In [None]:
lr_combo_400, pos_avg_combo_400, neg_avg_combo_400, lda_combo_400 = model_eval(model_dbow, 400, model_dmm)
# LR Accuracy: 0.755764411027569 for DBOW + DMM Combined
# CS Accuracy: 0.7382832080200501 for DBOW + DMM Combined
# LDA Accuracy: [MEM ERROR] for DBOW + DMM Combined

In [None]:
model_dbow = Doc2Vec.load('doc2vecmodel_dbow_75dim.model')
print('LOADED DBOW (75 DIM) MODEL\n')
model_dmm = Doc2Vec.load('doc2vecmodel_dmm_75dim.model')
print('LOADED DMM (75 DIM) MODEL\n')

In [None]:
lr_dbow_75, pos_avg_dbow_75, neg_avg_dbow_75, lda_dbow_75 = model_eval(model_dbow, 75)
# LR Accuracy: 0.7390977443609023 for DBOW
# CS Accuracy: 0.7293233082706767 for DBOW
# LDA Accuracy: 0.7391604010025062 for DBOW

In [None]:
lr_dmm_75, pos_avg_dmm_75, neg_avg_dmm_75, lda_dmm_75 = model_eval(model_dmm, 75)
# LR Accuracy: 0.7294486215538847 for DMM
# CS Accuracy: 0.7206140350877193 for DMM
# LDA Accuracy: 0.7285714285714285 for DMM

In [None]:
lr_combo_150, pos_avg_combo_150, neg_avg_combo_150, lda_combo_150 = model_eval(model_dbow, 150, model_dmm)
# LR Accuracy: 0.750438596491228 for DBOW + DMM Combined
# CS Accuracy: 0.7380952380952381 for DBOW + DMM Combined
# LDA Accuracy: 0.750062656641604 for DBOW + DMM Combined

In [None]:
model_dbow = Doc2Vec.load('doc2vecmodel_dbow_50dim.model')
print('LOADED DBOW (50 DIM) MODEL\n')
model_dmm = Doc2Vec.load('doc2vecmodel_dmm_50dim.model')
print('LOADED DMM (50 DIM) MODEL\n')

In [None]:
lr_dbow_50, pos_avg_dbow_50, neg_avg_dbow_50, lda_dbow_50 = model_eval(model_dbow, 50)
# LR Accuracy: 0.7328947368421053 for DBOW
# CS Accuracy: 0.7271929824561404 for DBOW
# LDA Accuracy: 0.7326441102756892 for DBOW

In [None]:
lr_dmm_50, pos_avg_dmm_50, neg_avg_dmm_50, lda_dmm_50 = model_eval(model_dmm, 50)
# LR Accuracy: 0.7240601503759398 for DMM
# CS Accuracy: 0.718859649122807 for DMM
# LDA Accuracy: 0.7228696741854637 for DMM

In [None]:
lr_combo_100, pos_avg_combo_100, neg_avg_combo_100, lda_combo_100 = model_eval(model_dbow, 100, model_dmm)
# LR Accuracy: 0.7432957393483709 for DBOW + DMM Combined
# CS Accuracy: 0.7293859649122807 for DBOW + DMM Combined
# LDA Accuracy: 0.7422932330827068 for DBOW + DMM Combined

In [None]:
model_dbow = Doc2Vec.load('doc2vecmodel_dbow_25dim.model')
print('LOADED DBOW (25 DIM) MODEL\n')
model_dmm = Doc2Vec.load('doc2vecmodel_dmm_25dim.model')
print('LOADED DMM (25 DIM) MODEL\n')

In [None]:
lr_dbow_25, pos_avg_dbow_25, neg_avg_dbow_25, lda_dbow_25 = model_eval(model_dbow, 25)
# LR Accuracy: 0.7305764411027569 for DBOW
# CS Accuracy: 0.7268170426065163 for DBOW
# LDA Accuracy: 0.7308270676691729 for DBOW

In [None]:
lr_dmm_25, pos_avg_dmm_25, neg_avg_dmm_25, lda_dmm_25 = model_eval(model_dmm, 25)
# LR Accuracy: 0.712593984962406 for DMM
# CS Accuracy: 0.7100250626566416 for DMM
# LDA Accuracy: 0.712218045112782 for DMM

In [None]:
lr_combo_50, pos_avg_combo_50, neg_avg_combo_50, lda_combo_50 = model_eval(model_dbow, 50, model_dmm)
# LR Accuracy: 0.7328947368421053 for DBOW + DMM Combined
# CS Accuracy: 0.7236842105263158 for DBOW + DMM Combined
# LDA Accuracy: 0.7334586466165414 for DBOW + DMM Combined

In [None]:
model_dbow = Doc2Vec.load('doc2vecmodel_dbow_10dim.model')
print('LOADED DBOW (10 DIM) MODEL\n')
model_dmm = Doc2Vec.load('doc2vecmodel_dmm_10dim.model')
print('LOADED DMM (10 DIM) MODEL\n')

In [None]:
lr_dbow_10, pos_avg_dbow_10, neg_avg_dbow_10, lda_dbow_10 = model_eval(model_dbow, 10)
# LR Accuracy: 0.7095864661654135 for DBOW
# CS Accuracy: 0.706390977443609 for DBOW
# LDA Accuracy: 0.7093358395989975 for DBOW

In [None]:
lr_dmm_10, pos_avg_dmm_10, neg_avg_dmm_10, lda_dmm_10 = model_eval(model_dmm, 10)
# LR Accuracy: 0.6843358395989975 for DMM
# CS Accuracy: 0.680576441102757 for DMM
# LDA Accuracy: 0.6849624060150376 for DMM

In [None]:
lr_combo_20, pos_avg_combo_20, neg_avg_combo_20, lda_combo_20 = model_eval(model_dbow, 20, model_dmm)
# LR Accuracy: 0.712155388471178 for DBOW + DMM Combined
# CS Accuracy: 0.6941729323308271 for DBOW + DMM Combined
# LDA Accuracy: 0.7112781954887218 for DBOW + DMM Combined

## Extracting Word Vectors from Doc2Vec Model and using Average Word Vector (100-DIM DBOW)

In [None]:
# The dimension sweep above leaves the 10-dimensional models bound to
# model_dbow / model_dmm. Reload the 100-dimensional baselines so this
# section, and the neural-network cells below that ask for 100 / 200
# dimensions, receive vectors of the expected width.
model_dbow = Doc2Vec.load('doc2vecmodel_v2.model')
model_dmm = Doc2Vec.load('doc2vecmodel_v2_dmm.model')
train_vec, val_vec, test_vec = extract_wordvec(model_dbow, 100)

In [None]:
lr = LogisticRegression()
lr.fit(train_vec, y_train)
print('Logistic Regression Score: ')
print(lr.score(val_vec, y_validation))
print('-----------------------------------')
# Accuracy: 0.618859649122807 for DBOW
# Accuracy: 0.7165413533834587 for DMM

In [None]:
posavg, negavg = average_vectors(train_vec, y_train, 100)
cos_similarity = cosine_sim(posavg, negavg, val_vec, y_validation, 100)
print('Cosine Similarity Score: ')
print(cos_similarity)
print('-----------------------------------')
# Accuracy: 0.6110275689223058 for DBOW
# Accuracy: 0.6630325814536341 for DMM

In [None]:
lda = LinearDiscriminantAnalysis(n_components=1)
lda.fit(train_vec, y_train)
print('Linear Discriminant Analysis Score: ')
print(lda.score(val_vec, y_validation))
print('-----------------------------------')
# Accuracy: 0.6298872180451128 for DBOW
# Accuracy: 0.7156641604010026 for DMM

## Neural Network Experiments

In [None]:
keras.backend.backend()

#### Multi-layer Perceptron (Combo Doc Vectors, 256 nodes, 3 layers)

In [None]:
train_vec_combo = model_combine(model_dbow, model_dmm, x_train, 200)
val_vec_combo = model_combine(model_dbow, model_dmm, x_validation, 200)
test_vec_combo = model_combine(model_dbow, model_dmm, x_test, 200)

In [None]:
model_file_name = "combo_model_multi_layer_perceptron_256_3.hd5"
checkpoint = ModelCheckpoint(model_file_name, monitor='val_accuracy', verbose=1, save_best_only=True, mode='max')
early_stop = EarlyStopping(monitor='val_accuracy', patience=5, mode='max') 
callback_arr = [checkpoint, early_stop]

In [None]:
np.random.seed(2000)
nn_model_combo_256_3 = Sequential()
nn_model_combo_256_3.add(Dense(256, activation='relu', input_dim=200))
nn_model_combo_256_3.add(Dense(256, activation='relu'))
nn_model_combo_256_3.add(Dense(256, activation='relu'))
nn_model_combo_256_3.add(Dense(1, activation='sigmoid'))
nn_model_combo_256_3.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
nn_model_combo_256_3.fit(train_vec_combo, y_train, validation_data=(val_vec_combo, y_validation), epochs=50, batch_size=32, verbose=1, callbacks=callback_arr)

In [None]:
nn_model_combo_256_3 = load_model('combo_model_multi_layer_perceptron_256_3.hd5')
nn_model_combo_256_3.evaluate(x=val_vec_combo, y=y_validation)
# VALIDATION ACCURACY: 0.79605 with es 5 and 50 epochs (val acc = 0.7960526347160339, and val loss = 0.4394365343085507) at epoch 7


#### Multi-layer Perceptron (DBOW Doc Vectors, 256 nodes, 3 layers)

In [None]:
train_vec = model_to_vec(model_dbow, x_train, 100)
val_vec = model_to_vec(model_dbow, x_validation, 100)
test_vec = model_to_vec(model_dbow, x_test, 100)

In [None]:
model_file_name = "dbow_model_multi_layer_perceptron_256_3.hd5"
checkpoint = ModelCheckpoint(model_file_name, monitor='val_accuracy', verbose=1, save_best_only=True, mode='max')
early_stop = EarlyStopping(monitor='val_accuracy', patience=5, mode='max') 
callback_arr = [checkpoint, early_stop]

In [None]:
np.random.seed(2000)
nn_model_dbow_256_3 = Sequential()
nn_model_dbow_256_3.add(Dense(256, activation='relu', input_dim=100))
nn_model_dbow_256_3.add(Dense(256, activation='relu'))
nn_model_dbow_256_3.add(Dense(256, activation='relu'))
nn_model_dbow_256_3.add(Dense(1, activation='sigmoid'))
nn_model_dbow_256_3.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
nn_model_dbow_256_3.fit(train_vec, y_train, validation_data=(val_vec, y_validation), epochs=50, batch_size=32, verbose=1, callbacks=callback_arr)

In [None]:
nn_model_dbow_256_3 = load_model('dbow_model_multi_layer_perceptron_256_3.hd5')
nn_model_dbow_256_3.evaluate(x=val_vec, y=y_validation)
# VALIDATION ACCURACY: 0.78659 with es 5 and 50 epochs (val acc = 0.7865914702415466, and val loss = 0.45259528155613665) at epoch 7


#### Multi-layer Perceptron (DMM Doc Vectors, 256 nodes, 3 layers)

In [None]:
train_vec = model_to_vec(model_dmm, x_train, 100)
val_vec = model_to_vec(model_dmm, x_validation, 100)
test_vec = model_to_vec(model_dmm, x_test, 100)

In [None]:
model_file_name = "dmm_model_multi_layer_perceptron_256_3.hd5"
checkpoint = ModelCheckpoint(model_file_name, monitor='val_accuracy', verbose=1, save_best_only=True, mode='max')
early_stop = EarlyStopping(monitor='val_accuracy', patience=5, mode='max') 
callback_arr = [checkpoint, early_stop]

In [None]:
np.random.seed(2000)
nn_model_dmm_256_3 = Sequential()
nn_model_dmm_256_3.add(Dense(256, activation='relu', input_dim=100))
nn_model_dmm_256_3.add(Dense(256, activation='relu'))
nn_model_dmm_256_3.add(Dense(256, activation='relu'))
nn_model_dmm_256_3.add(Dense(1, activation='sigmoid'))
nn_model_dmm_256_3.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
nn_model_dmm_256_3.fit(train_vec, y_train, validation_data=(val_vec, y_validation), epochs=50, batch_size=32, verbose=1, callbacks=callback_arr)

In [None]:
nn_model_dmm_256_3 = load_model('dmm_model_multi_layer_perceptron_256_3.hd5')
nn_model_dmm_256_3.evaluate(x=val_vec, y=y_validation)
# VALIDATION ACCURACY: 0.76510 with es 5 and 50 epochs (val acc = 0.7651002407073975, and val loss = 0.4874301516770719) at epoch 9


#### Multi-layer Perceptron (Combo Doc Vectors, 256 nodes, 1 layer)

In [None]:
train_vec_combo = model_combine(model_dbow, model_dmm, x_train, 200)
val_vec_combo = model_combine(model_dbow, model_dmm, x_validation, 200)
test_vec_combo = model_combine(model_dbow, model_dmm, x_test, 200)

In [None]:
model_file_name = "combo_model_multi_layer_perceptron_256_1.hd5"
checkpoint = ModelCheckpoint(model_file_name, monitor='val_accuracy', verbose=1, save_best_only=True, mode='max')
early_stop = EarlyStopping(monitor='val_accuracy', patience=5, mode='max') 
callback_arr = [checkpoint, early_stop]

In [None]:
np.random.seed(2000)
nn_model_combo_256_1 = Sequential()
nn_model_combo_256_1.add(Dense(256, activation='relu', input_dim=200))
nn_model_combo_256_1.add(Dense(1, activation='sigmoid'))
nn_model_combo_256_1.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
nn_model_combo_256_1.fit(train_vec_combo, y_train, validation_data=(val_vec_combo, y_validation), epochs=50, batch_size=32, verbose=1, callbacks=callback_arr)

In [None]:
nn_model_combo_256_1 = load_model('combo_model_multi_layer_perceptron_256_1.hd5')
nn_model_combo_256_1.evaluate(x=val_vec_combo, y=y_validation)
# VALIDATION ACCURACY: 0.78465 with es 5 and 50 epochs (val acc = 0.784649133682251, and val loss = 0.45730126285015193) at epoch 6


#### Multi-layer Perceptron (DBOW Doc Vectors, 256 nodes, 1 layer)

In [None]:
train_vec = model_to_vec(model_dbow, x_train, 100)
val_vec = model_to_vec(model_dbow, x_validation, 100)
test_vec = model_to_vec(model_dbow, x_test, 100)

In [None]:
model_file_name = "dbow_model_multi_layer_perceptron_256_1.hd5"
checkpoint = ModelCheckpoint(model_file_name, monitor='val_accuracy', verbose=1, save_best_only=True, mode='max')
early_stop = EarlyStopping(monitor='val_accuracy', patience=5, mode='max') 
callback_arr = [checkpoint, early_stop]

In [None]:
np.random.seed(2000)
nn_model_dbow_256_1 = Sequential()
nn_model_dbow_256_1.add(Dense(256, activation='relu', input_dim=100))
nn_model_dbow_256_1.add(Dense(1, activation='sigmoid'))
nn_model_dbow_256_1.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
nn_model_dbow_256_1.fit(train_vec, y_train, validation_data=(val_vec, y_validation), epochs=50, batch_size=32, verbose=1, callbacks=callback_arr)

In [None]:
nn_model_dbow_256_1 = load_model('dbow_model_multi_layer_perceptron_256_1.hd5')
nn_model_dbow_256_1.evaluate(x=val_vec, y=y_validation)
# VALIDATION ACCURACY: 0.77406 with es 5 and 50 epochs (val acc = 0.7740601301193237, and val loss = 0.4699582203289022) at epoch 4


#### Multi-layer Perceptron (DMM Doc Vectors, 256 nodes, 1 layer)

In [None]:
train_vec = model_to_vec(model_dmm, x_train, 100)
val_vec = model_to_vec(model_dmm, x_validation, 100)
test_vec = model_to_vec(model_dmm, x_test, 100)

In [None]:
model_file_name = "dmm_model_multi_layer_perceptron_256_1.hd5"
checkpoint = ModelCheckpoint(model_file_name, monitor='val_accuracy', verbose=1, save_best_only=True, mode='max')
early_stop = EarlyStopping(monitor='val_accuracy', patience=5, mode='max') 
callback_arr = [checkpoint, early_stop]

In [None]:
np.random.seed(2000)
nn_model_dmm_256_1 = Sequential()
nn_model_dmm_256_1.add(Dense(256, activation='relu', input_dim=100))
nn_model_dmm_256_1.add(Dense(1, activation='sigmoid'))
nn_model_dmm_256_1.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
nn_model_dmm_256_1.fit(train_vec, y_train, validation_data=(val_vec, y_validation), epochs=50, batch_size=32, verbose=1, callbacks=callback_arr)

In [None]:
nn_model_dmm_256_1 = load_model('dmm_model_multi_layer_perceptron_256_1.hd5')
nn_model_dmm_256_1.evaluate(x=val_vec, y=y_validation)
# VALIDATION ACCURACY: 0.75150 with es 5 and 50 epochs (val acc = 0.7515037655830383, and val loss = 0.5119869738593137) at epoch 12


#### Multi-layer Perceptron (Combo Doc Vectors, 128 nodes, 3 layers)

In [None]:
train_vec_combo = model_combine(model_dbow, model_dmm, x_train, 200)
val_vec_combo = model_combine(model_dbow, model_dmm, x_validation, 200)
test_vec_combo = model_combine(model_dbow, model_dmm, x_test, 200)

In [None]:
model_file_name = "combo_model_multi_layer_perceptron_128_3.hd5"
checkpoint = ModelCheckpoint(model_file_name, monitor='val_accuracy', verbose=1, save_best_only=True, mode='max')
early_stop = EarlyStopping(monitor='val_accuracy', patience=5, mode='max') 
callback_arr = [checkpoint, early_stop]

In [None]:
np.random.seed(2000)
nn_model_combo_128_3 = Sequential()
nn_model_combo_128_3.add(Dense(128, activation='relu', input_dim=200))
nn_model_combo_128_3.add(Dense(128, activation='relu'))
nn_model_combo_128_3.add(Dense(128, activation='relu'))
nn_model_combo_128_3.add(Dense(1, activation='sigmoid'))
nn_model_combo_128_3.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
nn_model_combo_128_3.fit(train_vec_combo, y_train, validation_data=(val_vec_combo, y_validation), epochs=50, batch_size=32, verbose=1, callbacks=callback_arr)

In [None]:
nn_model_combo_128_3 = load_model('combo_model_multi_layer_perceptron_128_3.hd5')
nn_model_combo_128_3.evaluate(x=val_vec_combo, y=y_validation)
# VALIDATION ACCURACY: 0.79192 with es 5 and 50 epochs (val acc = 0.7919172644615173, and val loss = 0.44302879301527687) at epoch 8


#### Multi-layer Perceptron (DBOW Doc Vectors, 128 nodes, 3 layers)

In [None]:
train_vec = model_to_vec(model_dbow, x_train, 100)
val_vec = model_to_vec(model_dbow, x_validation, 100)
test_vec = model_to_vec(model_dbow, x_test, 100)

In [None]:
model_file_name = "dbow_model_multi_layer_perceptron_128_3.hd5"
checkpoint = ModelCheckpoint(model_file_name, monitor='val_accuracy', verbose=1, save_best_only=True, mode='max')
early_stop = EarlyStopping(monitor='val_accuracy', patience=5, mode='max') 
callback_arr = [checkpoint, early_stop]

In [None]:
np.random.seed(2000)
nn_model_dbow_128_3 = Sequential()
nn_model_dbow_128_3.add(Dense(128, activation='relu', input_dim=100))
nn_model_dbow_128_3.add(Dense(128, activation='relu'))
nn_model_dbow_128_3.add(Dense(128, activation='relu'))
nn_model_dbow_128_3.add(Dense(1, activation='sigmoid'))
nn_model_dbow_128_3.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
nn_model_dbow_128_3.fit(train_vec, y_train, validation_data=(val_vec, y_validation), epochs=50, batch_size=32, verbose=1, callbacks=callback_arr)

In [None]:
nn_model_dbow_128_3 = load_model('dbow_model_multi_layer_perceptron_128_3.hd5')
nn_model_dbow_128_3.evaluate(x=val_vec, y=y_validation)
# VALIDATION ACCURACY: 0.78289 with es 5 and 50 epochs (val acc = 0.7828947305679321, and val loss = 0.45956595229325736) at epoch 18


#### Multi-layer Perceptron (DMM Doc Vectors, 128 nodes, 3 layers)

In [None]:
train_vec = model_to_vec(model_dmm, x_train, 100)
val_vec = model_to_vec(model_dmm, x_validation, 100)
test_vec = model_to_vec(model_dmm, x_test, 100)

In [None]:
model_file_name = "dmm_model_multi_layer_perceptron_128_3.hd5"
checkpoint = ModelCheckpoint(model_file_name, monitor='val_accuracy', verbose=1, save_best_only=True, mode='max')
early_stop = EarlyStopping(monitor='val_accuracy', patience=5, mode='max') 
callback_arr = [checkpoint, early_stop]

In [None]:
np.random.seed(2000)
nn_model_dmm_128_3 = Sequential()
nn_model_dmm_128_3.add(Dense(128, activation='relu', input_dim=100))
nn_model_dmm_128_3.add(Dense(128, activation='relu'))
nn_model_dmm_128_3.add(Dense(128, activation='relu'))
nn_model_dmm_128_3.add(Dense(1, activation='sigmoid'))
nn_model_dmm_128_3.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
nn_model_dmm_128_3.fit(train_vec, y_train, validation_data=(val_vec, y_validation), epochs=50, batch_size=32, verbose=1, callbacks=callback_arr)

In [None]:
nn_model_dmm_128_3 = load_model('dmm_model_multi_layer_perceptron_128_3.hd5')
nn_model_dmm_128_3.evaluate(x=val_vec, y=y_validation)
# VALIDATION ACCURACY: 0.76190 with es 5 and 50 epochs (val acc = 0.761904776096344, and val loss = 0.49163153473297155) at epoch 14


#### Multi-layer Perceptron (Combo Doc Vectors, 128 nodes, 1 layer)

In [None]:
# Combined DBOW + DMM feature matrices for the train / validation / test splits.
# (200 is presumably the combined vector size expected by model_combine -- confirm.)
train_vec_combo, val_vec_combo, test_vec_combo = (
    model_combine(model_dbow, model_dmm, tweets, 200)
    for tweets in (x_train, x_validation, x_test)
)

In [None]:
model_file_name = "combo_model_multi_layer_perceptron_128_1.hd5"
# Save the model only when validation accuracy improves on the best seen so far.
checkpoint = ModelCheckpoint(
    model_file_name,
    monitor='val_accuracy',
    mode='max',
    save_best_only=True,
    verbose=1,
)
# Stop training after 5 epochs without an improvement in validation accuracy.
early_stop = EarlyStopping(monitor='val_accuracy', mode='max', patience=5)
callback_arr = [checkpoint, early_stop]

In [None]:
np.random.seed(2000)
# One 128-unit ReLU hidden layer on 200-d combined inputs, sigmoid output for the binary tag.
nn_model_combo_128_1 = Sequential([
    Dense(128, activation='relu', input_dim=200),
    Dense(1, activation='sigmoid'),
])
nn_model_combo_128_1.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
# Up to 50 epochs; the callbacks checkpoint the best epoch and stop early.
nn_model_combo_128_1.fit(
    train_vec_combo, y_train,
    validation_data=(val_vec_combo, y_validation),
    batch_size=32, epochs=50, verbose=1,
    callbacks=callback_arr,
)

In [None]:
# Reload the saved combined-vector 1-layer model file and score it on the validation vectors.
nn_model_combo_128_1 = load_model('combo_model_multi_layer_perceptron_128_1.hd5')
nn_model_combo_128_1.evaluate(val_vec_combo, y_validation)
# Recorded result: validation accuracy 0.78296 (val acc = 0.7829573750495911, val loss = 0.4597198429711182),
# best at epoch 4, with early-stopping patience 5 and a 50-epoch cap.


#### Multi-layer Perceptron (DBOW Doc Vectors, 128 nodes, 1 layer)

In [None]:
# DBOW feature matrices for the train / validation / test splits.
# (100 is presumably the vector size expected by model_to_vec -- confirm against its definition.)
train_vec, val_vec, test_vec = (
    model_to_vec(model_dbow, tweets, 100)
    for tweets in (x_train, x_validation, x_test)
)

In [None]:
model_file_name = "dbow_model_multi_layer_perceptron_128_1.hd5"
# Save the model only when validation accuracy improves on the best seen so far.
checkpoint = ModelCheckpoint(
    model_file_name,
    monitor='val_accuracy',
    mode='max',
    save_best_only=True,
    verbose=1,
)
# Stop training after 5 epochs without an improvement in validation accuracy.
early_stop = EarlyStopping(monitor='val_accuracy', mode='max', patience=5)
callback_arr = [checkpoint, early_stop]

In [None]:
np.random.seed(2000)
# One 128-unit ReLU hidden layer on 100-d inputs, sigmoid output for the binary tag.
nn_model_dbow_128_1 = Sequential([
    Dense(128, activation='relu', input_dim=100),
    Dense(1, activation='sigmoid'),
])
nn_model_dbow_128_1.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
# Up to 50 epochs; the callbacks checkpoint the best epoch and stop early.
nn_model_dbow_128_1.fit(
    train_vec, y_train,
    validation_data=(val_vec, y_validation),
    batch_size=32, epochs=50, verbose=1,
    callbacks=callback_arr,
)

In [None]:
# Reload the saved DBOW 1-layer model file and score it on the validation vectors.
nn_model_dbow_128_1 = load_model('dbow_model_multi_layer_perceptron_128_1.hd5')
nn_model_dbow_128_1.evaluate(val_vec, y_validation)
# Recorded result: validation accuracy 0.77368 (val acc = 0.7736842036247253, val loss = 0.47306825482755677),
# best at epoch 6, with early-stopping patience 5 and a 50-epoch cap.


#### Multi-layer Perceptron (DMM Doc Vectors, 128 nodes, 1 layer)

In [None]:
# DMM feature matrices for the train / validation / test splits.
# (100 is presumably the vector size expected by model_to_vec -- confirm against its definition.)
train_vec, val_vec, test_vec = (
    model_to_vec(model_dmm, tweets, 100)
    for tweets in (x_train, x_validation, x_test)
)

In [None]:
model_file_name = "dmm_model_multi_layer_perceptron_128_1.hd5"
# Save the model only when validation accuracy improves on the best seen so far.
checkpoint = ModelCheckpoint(
    model_file_name,
    monitor='val_accuracy',
    mode='max',
    save_best_only=True,
    verbose=1,
)
# Stop training after 5 epochs without an improvement in validation accuracy.
early_stop = EarlyStopping(monitor='val_accuracy', mode='max', patience=5)
callback_arr = [checkpoint, early_stop]

In [None]:
np.random.seed(2000)
# One 128-unit ReLU hidden layer on 100-d inputs, sigmoid output for the binary tag.
nn_model_dmm_128_1 = Sequential([
    Dense(128, activation='relu', input_dim=100),
    Dense(1, activation='sigmoid'),
])
nn_model_dmm_128_1.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
# Up to 50 epochs; the callbacks checkpoint the best epoch and stop early.
nn_model_dmm_128_1.fit(
    train_vec, y_train,
    validation_data=(val_vec, y_validation),
    batch_size=32, epochs=50, verbose=1,
    callbacks=callback_arr,
)

In [None]:
# Reload the saved DMM 1-layer model file and score it on the validation vectors.
nn_model_dmm_128_1 = load_model('dmm_model_multi_layer_perceptron_128_1.hd5')
nn_model_dmm_128_1.evaluate(val_vec, y_validation)
# Recorded result: validation accuracy 0.75144 (val acc = 0.7514411211013794, val loss = 0.5119231587812715),
# best at epoch 7, with early-stopping patience 5 and a 50-epoch cap.


In [None]:
# create_tweet_samples(clean_df, 1000)
# create_tweet_samples(clean_df, 10000)
# create_tweet_samples(clean_df, 100000)
# create_tweet_samples(clean_df, 250000)
# create_tweet_samples(clean_df, 500000)
# create_tweet_samples(clean_df, 1000000)

## Experiment 1 (1000 Tweets)

In [None]:
# Experiment 1 artifacts: saved Doc2Vec model files and the 1000-tweet sample CSV.
model_name_dbow, model_name_dmm = 'doc2vecmodel_dbow_1000.model', 'doc2vecmodel_dmm_1000.model'
df_1000 = pd.read_csv("tweets_sample_1000.csv", names=cats, encoding='latin-1')

In [None]:
# load_tweets is a notebook helper defined earlier; its effect is not visible from this cell.
load_tweets(df_1000)
x, y = df_1000.tweet, df_1000.tag
# 98% train; the remaining 2% is halved into validation and test (1% each).
# NOTE(review): if the sample really holds 1000 rows, validation has only ~10 tweets,
# so the accuracies recorded below can only move in steps of about 0.1.
x_train, x_remain, y_train, y_remain = train_test_split(
    x, y, test_size=.02, random_state=2000
)
x_validation, x_test, y_validation, y_test = train_test_split(
    x_remain, y_remain, test_size=.5, random_state=2000
)

In [None]:
# NOTE(review): make_tagged_docs is defined earlier in the notebook; presumably it builds the
# tagged-document corpus used for Doc2Vec training -- confirm. This is the only experiment that
# leaves the call active; experiments 2-6 have the equivalent call commented out.
make_tagged_docs(df_1000)

In [None]:
# train_dbow_d2v(model_name_dbow, 100)

In [None]:
# train_dmm_d2v(model_name_dmm, 100) --REMEMBER TO CHANGE FOR ALL!

In [None]:
# Load the saved DBOW and DMM Doc2Vec models for the 1000-tweet experiment from disk
# (the training calls above are commented out, so both files must already exist).
model_dbow = Doc2Vec.load(model_name_dbow)
print('LOADED DBOW (1000) MODEL\n')
model_dmm = Doc2Vec.load(model_name_dmm)
print('LOADED DMM (1000) MODEL\n')

In [None]:
# DBOW feature matrices for each split. The next cell reassigns the same three names with
# DMM vectors, so run only one of the two cells before the classifier cells to get that model's score.
train_vec, val_vec, test_vec = (
    model_to_vec(model_dbow, tweets) for tweets in (x_train, x_validation, x_test)
)

In [None]:
# DMM feature matrices for each split. This reassigns train_vec / val_vec / test_vec set by the
# DBOW cell above, so the classifier cells below score whichever of the two cells ran last.
train_vec, val_vec, test_vec = (
    model_to_vec(model_dmm, tweets) for tweets in (x_train, x_validation, x_test)
)

In [None]:
# Default logistic regression on the currently loaded train_vec; score() is mean validation accuracy.
lr = LogisticRegression().fit(train_vec, y_train)
print(
    'Logistic Regression Score: ',
    lr.score(val_vec, y_validation),
    '-----------------------------------',
    sep='\n',
)
# Recorded validation accuracy -- DBOW: 0.6, DMM: 0.5

In [None]:
# average_vectors / cosine_sim are notebook helpers defined earlier; presumably they build
# per-class mean vectors and score validation tweets by cosine similarity -- confirm.
posavg, negavg = average_vectors(train_vec, y_train, 100)
cos_similarity = cosine_sim(posavg, negavg, val_vec, y_validation, 100)
print('Cosine Similarity Score: ', cos_similarity, '-----------------------------------', sep='\n')
# Recorded validation accuracy -- DBOW: 0.4, DMM: 0.6

In [None]:
# Linear discriminant analysis on the currently loaded train_vec; score() is mean validation accuracy.
lda = LinearDiscriminantAnalysis(n_components=1).fit(train_vec, y_train)
print(
    'Linear Discriminant Analysis Score: ',
    lda.score(val_vec, y_validation),
    '-----------------------------------',
    sep='\n',
)
# Recorded validation accuracy -- DBOW: 0.7, DMM: 0.4

In [None]:
# RBF-kernel SVM on the currently loaded train_vec; score() is mean validation accuracy.
svm = SVC(kernel='rbf').fit(train_vec, y_train)
print(
    'SVM Score: ',
    svm.score(val_vec, y_validation),
    '-----------------------------------',
    sep='\n',
)
# Recorded validation accuracy -- DBOW: 0.6, DMM: 0.4

In [None]:
# Combined DBOW + DMM feature matrices for the train / validation / test splits.
train_vec_combo, val_vec_combo, test_vec_combo = (
    model_combine(model_dbow, model_dmm, tweets)
    for tweets in (x_train, x_validation, x_test)
)

In [None]:
# Default logistic regression on the combined vectors; score() is mean validation accuracy.
lr = LogisticRegression().fit(train_vec_combo, y_train)
print(
    'Logistic Regression Score: ',
    lr.score(val_vec_combo, y_validation),
    '-----------------------------------',
    sep='\n',
)
# Recorded validation accuracy -- DBOW + DMM combined: 0.5

In [None]:
# Same cosine-similarity scoring as for the single models, on the combined vectors
# (200 is presumably the combined vector size -- confirm against the helpers).
posavg, negavg = average_vectors(train_vec_combo, y_train, 200)
cos_similarity = cosine_sim(posavg, negavg, val_vec_combo, y_validation, 200)
print('Cosine Similarity Score: ', cos_similarity, '-----------------------------------', sep='\n')
# Recorded validation accuracy -- DBOW + DMM combined: 0.5

In [None]:
# Linear discriminant analysis on the combined vectors; score() is mean validation accuracy.
lda = LinearDiscriminantAnalysis(n_components=1).fit(train_vec_combo, y_train)
print(
    'Linear Discriminant Analysis Score: ',
    lda.score(val_vec_combo, y_validation),
    '-----------------------------------',
    sep='\n',
)
# Recorded validation accuracy -- DBOW + DMM combined: 0.4

In [None]:
# Linear-kernel SVM on the combined vectors; score() is mean validation accuracy.
svm = SVC(kernel='linear').fit(train_vec_combo, y_train)
print(
    'SVM Score: ',
    svm.score(val_vec_combo, y_validation),
    '-----------------------------------',
    sep='\n',
)
# Recorded validation accuracy -- DBOW + DMM combined: 0.6

In [None]:
def obj(trial):
    """Optuna objective: 3-fold cross-validated error of an SVC on the combined vectors.

    Reads the notebook globals ``train_vec_combo`` and ``y_train``.

    Args:
        trial: Optuna trial used to sample ``C`` (log-uniform in [1, 100]) and
            ``kernel`` (one of 'linear', 'poly', 'rbf').

    Returns:
        1.0 minus the mean cross-validation accuracy, so that a minimising
        study maximises accuracy.
    """
    # NOTE(review): suggest_loguniform is deprecated in recent Optuna releases in favour of
    # suggest_float(..., log=True); kept here to match the Optuna version this notebook targets.
    svc_c = trial.suggest_loguniform('C', 1e0, 1e2)
    kernel = trial.suggest_categorical('kernel', ['linear', 'poly', 'rbf'])
    svm = SVC(C=svc_c, kernel=kernel)
    # No explicit svm.fit() here: cross_val_score clones and fits the estimator on every
    # fold, so a separate full-data fit only added one wasted SVC training per trial.
    score = cross_val_score(svm, train_vec_combo, y_train, n_jobs=-1, cv=3)
    accuracy = score.mean()
    return 1.0 - accuracy

In [None]:
# New Optuna study with default settings; the default direction is "minimize", which matches
# obj returning 1 - accuracy.
study = optuna.create_study()

In [None]:
# Run 20 trials of the SVC objective; each trial performs a 3-fold cross-validation.
study.optimize(obj, n_trials=20)

In [None]:
# SVC with hard-coded hyper-parameters (presumably copied from an earlier Optuna run -- confirm),
# trained on the combined vectors and scored on validation.
svm = SVC(kernel='poly', C=6.336527825194087).fit(train_vec_combo, y_train)
print(
    'SVM Score: ',
    svm.score(val_vec_combo, y_validation),
    '-----------------------------------',
    sep='\n',
)
# Recorded validation accuracy -- DBOW + DMM combined: 0.7

## Experiment 2 (10000 Tweets)

In [None]:
# Experiment 2 artifacts: saved Doc2Vec model files and the 10000-tweet sample CSV.
model_name_dbow, model_name_dmm = 'doc2vecmodel_dbow_10000.model', 'doc2vecmodel_dmm_10000.model'
df_10000 = pd.read_csv("tweets_sample_10000.csv", names=cats, encoding='latin-1')

In [None]:
# load_tweets is a notebook helper defined earlier; its effect is not visible from this cell.
load_tweets(df_10000)
x, y = df_10000.tweet, df_10000.tag
# 98% train; the remaining 2% is halved into validation and test (1% each).
x_train, x_remain, y_train, y_remain = train_test_split(
    x, y, test_size=.02, random_state=2000
)
x_validation, x_test, y_validation, y_test = train_test_split(
    x_remain, y_remain, test_size=.5, random_state=2000
)

In [None]:
# make_tagged_docs(df_10000)

In [None]:
# train_dbow_d2v(model_name_dbow, 100)

In [None]:
# train_dmm_d2v(model_name_dmm, 100)

In [None]:
# Load the saved DBOW and DMM Doc2Vec models for the 10000-tweet experiment from disk
# (the training calls above are commented out, so both files must already exist).
model_dbow = Doc2Vec.load(model_name_dbow)
print('LOADED DBOW (10000) MODEL\n')
model_dmm = Doc2Vec.load(model_name_dmm)
print('LOADED DMM (10000) MODEL\n')

In [None]:
# DBOW feature matrices for each split. The next cell reassigns the same three names with
# DMM vectors, so run only one of the two cells before the classifier cells to get that model's score.
train_vec, val_vec, test_vec = (
    model_to_vec(model_dbow, tweets) for tweets in (x_train, x_validation, x_test)
)

In [None]:
# DMM feature matrices for each split. This reassigns train_vec / val_vec / test_vec set by the
# DBOW cell above, so the classifier cells below score whichever of the two cells ran last.
train_vec, val_vec, test_vec = (
    model_to_vec(model_dmm, tweets) for tweets in (x_train, x_validation, x_test)
)

In [None]:
# Default logistic regression on the currently loaded train_vec; score() is mean validation accuracy.
lr = LogisticRegression().fit(train_vec, y_train)
print(
    'Logistic Regression Score: ',
    lr.score(val_vec, y_validation),
    '-----------------------------------',
    sep='\n',
)
# Recorded validation accuracy -- DBOW: 0.51, DMM: 0.53

In [None]:
# average_vectors / cosine_sim are notebook helpers defined earlier; presumably they build
# per-class mean vectors and score validation tweets by cosine similarity -- confirm.
posavg, negavg = average_vectors(train_vec, y_train, 100)
cos_similarity = cosine_sim(posavg, negavg, val_vec, y_validation, 100)
print('Cosine Similarity Score: ', cos_similarity, '-----------------------------------', sep='\n')
# Recorded validation accuracy -- DBOW: 0.52, DMM: 0.49

In [None]:
# Linear discriminant analysis on the currently loaded train_vec; score() is mean validation accuracy.
lda = LinearDiscriminantAnalysis(n_components=1).fit(train_vec, y_train)
print(
    'Linear Discriminant Analysis Score: ',
    lda.score(val_vec, y_validation),
    '-----------------------------------',
    sep='\n',
)
# Recorded validation accuracy -- DBOW: 0.5, DMM: 0.54

In [None]:
# RBF-kernel SVM on the currently loaded train_vec; score() is mean validation accuracy.
svm = SVC(kernel='rbf').fit(train_vec, y_train)
print(
    'SVM Score: ',
    svm.score(val_vec, y_validation),
    '-----------------------------------',
    sep='\n',
)
# Recorded validation accuracy -- DBOW: 0.58, DMM: 0.55

In [None]:
# Combined DBOW + DMM feature matrices for the train / validation / test splits.
train_vec_combo, val_vec_combo, test_vec_combo = (
    model_combine(model_dbow, model_dmm, tweets)
    for tweets in (x_train, x_validation, x_test)
)

In [None]:
# Default logistic regression on the combined vectors; score() is mean validation accuracy.
lr = LogisticRegression().fit(train_vec_combo, y_train)
print(
    'Logistic Regression Score: ',
    lr.score(val_vec_combo, y_validation),
    '-----------------------------------',
    sep='\n',
)
# Recorded validation accuracy -- DBOW + DMM combined: 0.46

In [None]:
# Same cosine-similarity scoring as for the single models, on the combined vectors
# (200 is presumably the combined vector size -- confirm against the helpers).
posavg, negavg = average_vectors(train_vec_combo, y_train, 200)
cos_similarity = cosine_sim(posavg, negavg, val_vec_combo, y_validation, 200)
print('Cosine Similarity Score: ', cos_similarity, '-----------------------------------', sep='\n')
# Recorded validation accuracy -- DBOW + DMM combined: 0.5

In [None]:
# Linear discriminant analysis on the combined vectors; score() is mean validation accuracy.
lda = LinearDiscriminantAnalysis(n_components=1).fit(train_vec_combo, y_train)
print(
    'Linear Discriminant Analysis Score: ',
    lda.score(val_vec_combo, y_validation),
    '-----------------------------------',
    sep='\n',
)
# Recorded validation accuracy -- DBOW + DMM combined: 0.45

In [None]:
# Linear-kernel SVM on the combined vectors; score() is mean validation accuracy.
svm = SVC(kernel='linear').fit(train_vec_combo, y_train)
print(
    'SVM Score: ',
    svm.score(val_vec_combo, y_validation),
    '-----------------------------------',
    sep='\n',
)
# Recorded validation accuracy -- DBOW + DMM combined: 0.5

In [None]:
def obj(trial):
    """Optuna objective: 3-fold cross-validated error of an SVC on the combined vectors.

    Reads the notebook globals ``train_vec_combo`` and ``y_train`` at call time, so it
    scores whichever experiment's vectors are currently loaded.

    Args:
        trial: Optuna trial used to sample ``C`` (log-uniform in [1, 100]) and
            ``kernel`` (one of 'linear', 'poly', 'rbf').

    Returns:
        1.0 minus the mean cross-validation accuracy, so that a minimising
        study maximises accuracy.
    """
    # NOTE(review): suggest_loguniform is deprecated in recent Optuna releases in favour of
    # suggest_float(..., log=True); kept here to match the Optuna version this notebook targets.
    svc_c = trial.suggest_loguniform('C', 1e0, 1e2)
    kernel = trial.suggest_categorical('kernel', ['linear', 'poly', 'rbf'])
    svm = SVC(C=svc_c, kernel=kernel)
    # No explicit svm.fit() here: cross_val_score clones and fits the estimator on every
    # fold, so a separate full-data fit only added one wasted SVC training per trial.
    score = cross_val_score(svm, train_vec_combo, y_train, n_jobs=-1, cv=3)
    accuracy = score.mean()
    return 1.0 - accuracy

In [None]:
# Fresh Optuna study for the 10000-tweet experiment; the default direction is "minimize",
# which matches obj returning 1 - accuracy.
study = optuna.create_study()

In [None]:
# Run 20 trials of the SVC objective; each trial performs a 3-fold cross-validation.
study.optimize(obj, n_trials=20)

In [None]:
# Use the hyper-parameters found by this experiment's Optuna study (keys 'C' and 'kernel')
# instead of the constants that were identical to experiment 1's tuned values
# (C=6.336527825194087, kernel='poly').
svm = SVC(**study.best_params)
svm.fit(train_vec_combo, y_train)
print('SVM Score: ')
print(svm.score(val_vec_combo, y_validation))
print('-----------------------------------')
# Accuracy: 0.57 for DBOW + DMM Combined (recorded with C=6.336527825194087, kernel='poly')

## Experiment 3 (100000 Tweets)

In [None]:
# Experiment 3 artifacts: saved Doc2Vec model files and the 100000-tweet sample CSV.
model_name_dbow, model_name_dmm = 'doc2vecmodel_dbow_100000.model', 'doc2vecmodel_dmm_100000.model'
df_100000 = pd.read_csv("tweets_sample_100000.csv", names=cats, encoding='latin-1')

In [None]:
# load_tweets is a notebook helper defined earlier; its effect is not visible from this cell.
load_tweets(df_100000)
x, y = df_100000.tweet, df_100000.tag
# 98% train; the remaining 2% is halved into validation and test (1% each).
x_train, x_remain, y_train, y_remain = train_test_split(
    x, y, test_size=.02, random_state=2000
)
x_validation, x_test, y_validation, y_test = train_test_split(
    x_remain, y_remain, test_size=.5, random_state=2000
)

In [None]:
# make_tagged_docs(df_100000)

In [None]:
# train_dbow_d2v(model_name_dbow, 100)

In [None]:
# train_dmm_d2v(model_name_dmm, 100)

In [None]:
# Load the saved DBOW and DMM Doc2Vec models for the 100000-tweet experiment from disk
# (the training calls above are commented out, so both files must already exist).
model_dbow = Doc2Vec.load(model_name_dbow)
print('LOADED DBOW (100000) MODEL\n')
model_dmm = Doc2Vec.load(model_name_dmm)
print('LOADED DMM (100000) MODEL\n')

In [None]:
# DBOW feature matrices for each split. The next cell reassigns the same three names with
# DMM vectors, so run only one of the two cells before the classifier cells to get that model's score.
train_vec, val_vec, test_vec = (
    model_to_vec(model_dbow, tweets) for tweets in (x_train, x_validation, x_test)
)

In [None]:
# DMM feature matrices for each split. This reassigns train_vec / val_vec / test_vec set by the
# DBOW cell above, so the classifier cells below score whichever of the two cells ran last.
train_vec, val_vec, test_vec = (
    model_to_vec(model_dmm, tweets) for tweets in (x_train, x_validation, x_test)
)

In [None]:
# Default logistic regression on the currently loaded train_vec; score() is mean validation accuracy.
lr = LogisticRegression().fit(train_vec, y_train)
print(
    'Logistic Regression Score: ',
    lr.score(val_vec, y_validation),
    '-----------------------------------',
    sep='\n',
)
# Recorded validation accuracy -- DBOW: 0.521, DMM: 0.516

In [None]:
# average_vectors / cosine_sim are notebook helpers defined earlier; presumably they build
# per-class mean vectors and score validation tweets by cosine similarity -- confirm.
posavg, negavg = average_vectors(train_vec, y_train, 100)
cos_similarity = cosine_sim(posavg, negavg, val_vec, y_validation, 100)
print('Cosine Similarity Score: ', cos_similarity, '-----------------------------------', sep='\n')
# Recorded validation accuracy -- DBOW: 0.522, DMM: 0.512

In [None]:
# Linear discriminant analysis on the currently loaded train_vec; score() is mean validation accuracy.
lda = LinearDiscriminantAnalysis(n_components=1).fit(train_vec, y_train)
print(
    'Linear Discriminant Analysis Score: ',
    lda.score(val_vec, y_validation),
    '-----------------------------------',
    sep='\n',
)
# Recorded validation accuracy -- DBOW: 0.52, DMM: 0.517

In [None]:
# Combined DBOW + DMM feature matrices for the train / validation / test splits.
train_vec_combo, val_vec_combo, test_vec_combo = (
    model_combine(model_dbow, model_dmm, tweets)
    for tweets in (x_train, x_validation, x_test)
)

In [None]:
# Default logistic regression on the combined vectors; score() is mean validation accuracy.
lr = LogisticRegression().fit(train_vec_combo, y_train)
print(
    'Logistic Regression Score: ',
    lr.score(val_vec_combo, y_validation),
    '-----------------------------------',
    sep='\n',
)
# Recorded validation accuracy -- DBOW + DMM combined: 0.519

In [None]:
# Same cosine-similarity scoring as for the single models, on the combined vectors
# (200 is presumably the combined vector size -- confirm against the helpers).
posavg, negavg = average_vectors(train_vec_combo, y_train, 200)
cos_similarity = cosine_sim(posavg, negavg, val_vec_combo, y_validation, 200)
print('Cosine Similarity Score: ', cos_similarity, '-----------------------------------', sep='\n')
# Recorded validation accuracy -- DBOW + DMM combined: 0.513

In [None]:
# Linear discriminant analysis on the combined vectors; score() is mean validation accuracy.
lda = LinearDiscriminantAnalysis(n_components=1).fit(train_vec_combo, y_train)
print(
    'Linear Discriminant Analysis Score: ',
    lda.score(val_vec_combo, y_validation),
    '-----------------------------------',
    sep='\n',
)
# Recorded validation accuracy -- DBOW + DMM combined: 0.521

## Experiment 4 (250000 Tweets)

In [None]:
# Experiment 4 artifacts: saved Doc2Vec model files and the 250000-tweet sample CSV.
model_name_dbow, model_name_dmm = 'doc2vecmodel_dbow_250000.model', 'doc2vecmodel_dmm_250000.model'
df_250000 = pd.read_csv("tweets_sample_250000.csv", names=cats, encoding='latin-1')

In [None]:
# load_tweets is a notebook helper defined earlier; its effect is not visible from this cell.
load_tweets(df_250000)
x, y = df_250000.tweet, df_250000.tag
# 98% train; the remaining 2% is halved into validation and test (1% each).
x_train, x_remain, y_train, y_remain = train_test_split(
    x, y, test_size=.02, random_state=2000
)
x_validation, x_test, y_validation, y_test = train_test_split(
    x_remain, y_remain, test_size=.5, random_state=2000
)

In [None]:
# make_tagged_docs(df_250000)

In [None]:
# train_dbow_d2v(model_name_dbow, 100)

In [None]:
# train_dmm_d2v(model_name_dmm, 100)

In [None]:
# Load the saved DBOW and DMM Doc2Vec models for the 250000-tweet experiment from disk
# (the training calls above are commented out, so both files must already exist).
model_dbow = Doc2Vec.load(model_name_dbow)
print('LOADED DBOW (250000) MODEL\n')
model_dmm = Doc2Vec.load(model_name_dmm)
print('LOADED DMM (250000) MODEL\n')

In [None]:
# DBOW feature matrices for each split. The next cell reassigns the same three names with
# DMM vectors, so run only one of the two cells before the classifier cells to get that model's score.
train_vec, val_vec, test_vec = (
    model_to_vec(model_dbow, tweets) for tweets in (x_train, x_validation, x_test)
)

In [None]:
# DMM feature matrices for each split. This reassigns train_vec / val_vec / test_vec set by the
# DBOW cell above, so the classifier cells below score whichever of the two cells ran last.
train_vec, val_vec, test_vec = (
    model_to_vec(model_dmm, tweets) for tweets in (x_train, x_validation, x_test)
)

In [None]:
# Default logistic regression on the currently loaded train_vec; score() is mean validation accuracy.
lr = LogisticRegression().fit(train_vec, y_train)
print(
    'Logistic Regression Score: ',
    lr.score(val_vec, y_validation),
    '-----------------------------------',
    sep='\n',
)
# Recorded validation accuracy -- DBOW: 0.5408, DMM: 0.5216

In [None]:
# average_vectors / cosine_sim are notebook helpers defined earlier; presumably they build
# per-class mean vectors and score validation tweets by cosine similarity -- confirm.
posavg, negavg = average_vectors(train_vec, y_train, 100)
cos_similarity = cosine_sim(posavg, negavg, val_vec, y_validation, 100)
print('Cosine Similarity Score: ', cos_similarity, '-----------------------------------', sep='\n')
# Recorded validation accuracy -- DBOW: 0.5396, DMM: 0.522

In [None]:
# Linear discriminant analysis on the currently loaded train_vec; score() is mean validation accuracy.
lda = LinearDiscriminantAnalysis(n_components=1).fit(train_vec, y_train)
print(
    'Linear Discriminant Analysis Score: ',
    lda.score(val_vec, y_validation),
    '-----------------------------------',
    sep='\n',
)
# Recorded validation accuracy -- DBOW: 0.5412, DMM: 0.5212

In [None]:
# Combined DBOW + DMM feature matrices for the train / validation / test splits.
train_vec_combo, val_vec_combo, test_vec_combo = (
    model_combine(model_dbow, model_dmm, tweets)
    for tweets in (x_train, x_validation, x_test)
)

In [None]:
# Default logistic regression on the combined vectors; score() is mean validation accuracy.
lr = LogisticRegression().fit(train_vec_combo, y_train)
print(
    'Logistic Regression Score: ',
    lr.score(val_vec_combo, y_validation),
    '-----------------------------------',
    sep='\n',
)
# Recorded validation accuracy -- DBOW + DMM combined: 0.5432

In [None]:
# Same cosine-similarity scoring as for the single models, on the combined vectors
# (200 is presumably the combined vector size -- confirm against the helpers).
posavg, negavg = average_vectors(train_vec_combo, y_train, 200)
cos_similarity = cosine_sim(posavg, negavg, val_vec_combo, y_validation, 200)
print('Cosine Similarity Score: ', cos_similarity, '-----------------------------------', sep='\n')
# Recorded validation accuracy -- DBOW + DMM combined: 0.5276

In [None]:
# Linear discriminant analysis on the combined vectors; score() is mean validation accuracy.
lda = LinearDiscriminantAnalysis(n_components=1).fit(train_vec_combo, y_train)
print(
    'Linear Discriminant Analysis Score: ',
    lda.score(val_vec_combo, y_validation),
    '-----------------------------------',
    sep='\n',
)
# Recorded validation accuracy -- DBOW + DMM combined: 0.5432

## Experiment 5 (500000 Tweets)

In [None]:
# Experiment 5 artifacts: saved Doc2Vec model files and the 500000-tweet sample CSV.
model_name_dbow, model_name_dmm = 'doc2vecmodel_dbow_500000.model', 'doc2vecmodel_dmm_500000.model'
df_500000 = pd.read_csv("tweets_sample_500000.csv", names=cats, encoding='latin-1')

In [None]:
# load_tweets is a notebook helper defined earlier; its effect is not visible from this cell.
load_tweets(df_500000)
x, y = df_500000.tweet, df_500000.tag
# 98% train; the remaining 2% is halved into validation and test (1% each).
x_train, x_remain, y_train, y_remain = train_test_split(
    x, y, test_size=.02, random_state=2000
)
x_validation, x_test, y_validation, y_test = train_test_split(
    x_remain, y_remain, test_size=.5, random_state=2000
)

In [None]:
# make_tagged_docs(df_500000)

In [None]:
# train_dbow_d2v(model_name_dbow, 100)

In [None]:
# train_dmm_d2v(model_name_dmm, 100)

In [None]:
# Load the saved DBOW and DMM Doc2Vec models for the 500000-tweet experiment from disk
# (the training calls above are commented out, so both files must already exist).
model_dbow = Doc2Vec.load(model_name_dbow)
print('LOADED DBOW (500000) MODEL\n')
model_dmm = Doc2Vec.load(model_name_dmm)
print('LOADED DMM (500000) MODEL\n')

In [None]:
# DBOW feature matrices for each split. The next cell reassigns the same three names with
# DMM vectors, so run only one of the two cells before the classifier cells to get that model's score.
train_vec, val_vec, test_vec = (
    model_to_vec(model_dbow, tweets) for tweets in (x_train, x_validation, x_test)
)

In [None]:
# DMM feature matrices for each split. This reassigns train_vec / val_vec / test_vec set by the
# DBOW cell above, so the classifier cells below score whichever of the two cells ran last.
train_vec, val_vec, test_vec = (
    model_to_vec(model_dmm, tweets) for tweets in (x_train, x_validation, x_test)
)

In [None]:
# Default logistic regression on the currently loaded train_vec; score() is mean validation accuracy.
lr = LogisticRegression().fit(train_vec, y_train)
print(
    'Logistic Regression Score: ',
    lr.score(val_vec, y_validation),
    '-----------------------------------',
    sep='\n',
)
# Recorded validation accuracy -- DBOW: 0.5184, DMM: 0.5088

In [None]:
# average_vectors / cosine_sim are notebook helpers defined earlier; presumably they build
# per-class mean vectors and score validation tweets by cosine similarity -- confirm.
posavg, negavg = average_vectors(train_vec, y_train, 100)
cos_similarity = cosine_sim(posavg, negavg, val_vec, y_validation, 100)
print('Cosine Similarity Score: ', cos_similarity, '-----------------------------------', sep='\n')
# Recorded validation accuracy -- DBOW: 0.5164, DMM: 0.514

In [None]:
# Linear discriminant analysis on the currently loaded train_vec; score() is mean validation accuracy.
lda = LinearDiscriminantAnalysis(n_components=1).fit(train_vec, y_train)
print(
    'Linear Discriminant Analysis Score: ',
    lda.score(val_vec, y_validation),
    '-----------------------------------',
    sep='\n',
)
# Recorded validation accuracy -- DBOW: 0.5182, DMM: 0.5088

In [None]:
# Combined DBOW + DMM feature matrices for the train / validation / test splits.
train_vec_combo, val_vec_combo, test_vec_combo = (
    model_combine(model_dbow, model_dmm, tweets)
    for tweets in (x_train, x_validation, x_test)
)

In [None]:
# Default logistic regression on the combined vectors; score() is mean validation accuracy.
lr = LogisticRegression().fit(train_vec_combo, y_train)
print(
    'Logistic Regression Score: ',
    lr.score(val_vec_combo, y_validation),
    '-----------------------------------',
    sep='\n',
)
# Recorded validation accuracy -- DBOW + DMM combined: 0.516

In [None]:
# Same cosine-similarity scoring as for the single models, on the combined vectors
# (200 is presumably the combined vector size -- confirm against the helpers).
posavg, negavg = average_vectors(train_vec_combo, y_train, 200)
cos_similarity = cosine_sim(posavg, negavg, val_vec_combo, y_validation, 200)
print('Cosine Similarity Score: ', cos_similarity, '-----------------------------------', sep='\n')
# Recorded validation accuracy -- DBOW + DMM combined: 0.5116

In [None]:
# Linear discriminant analysis on the combined vectors; score() is mean validation accuracy.
lda = LinearDiscriminantAnalysis(n_components=1).fit(train_vec_combo, y_train)
print(
    'Linear Discriminant Analysis Score: ',
    lda.score(val_vec_combo, y_validation),
    '-----------------------------------',
    sep='\n',
)
# Recorded validation accuracy -- DBOW + DMM combined: 0.5156

## Experiment 6 (1000000 Tweets)

In [None]:
# Experiment 6 artifacts: saved Doc2Vec model files and the 1000000-tweet sample CSV.
model_name_dbow, model_name_dmm = 'doc2vecmodel_dbow_1000000.model', 'doc2vecmodel_dmm_1000000.model'
df_1000000 = pd.read_csv("tweets_sample_1000000.csv", names=cats, encoding='latin-1')

In [None]:
# load_tweets is a notebook helper defined earlier; its effect is not visible from this cell.
load_tweets(df_1000000)
x, y = df_1000000.tweet, df_1000000.tag
# 98% train; the remaining 2% is halved into validation and test (1% each).
x_train, x_remain, y_train, y_remain = train_test_split(
    x, y, test_size=.02, random_state=2000
)
x_validation, x_test, y_validation, y_test = train_test_split(
    x_remain, y_remain, test_size=.5, random_state=2000
)

In [None]:
# make_tagged_docs(df_1000000)

In [None]:
# train_dbow_d2v(model_name_dbow, 100)

In [None]:
# train_dmm_d2v(model_name_dmm, 100)

In [None]:
# Load the saved DBOW and DMM Doc2Vec models for the 1000000-tweet experiment from disk
# (the training calls above are commented out, so both files must already exist).
model_dbow = Doc2Vec.load(model_name_dbow)
print('LOADED DBOW (1000000) MODEL\n')
model_dmm = Doc2Vec.load(model_name_dmm)
print('LOADED DMM (1000000) MODEL\n')

In [None]:
# DBOW feature matrices for each split. The next cell reassigns the same three names with
# DMM vectors, so run only one of the two cells before the classifier cells to get that model's score.
train_vec, val_vec, test_vec = (
    model_to_vec(model_dbow, tweets) for tweets in (x_train, x_validation, x_test)
)

In [None]:
# DMM feature matrices for each split. This reassigns train_vec / val_vec / test_vec set by the
# DBOW cell above, so the classifier cells below score whichever of the two cells ran last.
train_vec, val_vec, test_vec = (
    model_to_vec(model_dmm, tweets) for tweets in (x_train, x_validation, x_test)
)

In [None]:
# Default logistic regression on the currently loaded train_vec; score() is mean validation accuracy.
lr = LogisticRegression().fit(train_vec, y_train)
print(
    'Logistic Regression Score: ',
    lr.score(val_vec, y_validation),
    '-----------------------------------',
    sep='\n',
)
# Recorded validation accuracy -- DBOW: 0.5796, DMM: 0.5779

In [None]:
# average_vectors / cosine_sim are notebook helpers defined earlier; presumably they build
# per-class mean vectors and score validation tweets by cosine similarity -- confirm.
posavg, negavg = average_vectors(train_vec, y_train, 100)
cos_similarity = cosine_sim(posavg, negavg, val_vec, y_validation, 100)
print('Cosine Similarity Score: ', cos_similarity, '-----------------------------------', sep='\n')
# Recorded validation accuracy -- DBOW: 0.5757, DMM: 0.5706

In [None]:
# Linear discriminant analysis on the currently loaded train_vec; score() is mean validation accuracy.
lda = LinearDiscriminantAnalysis(n_components=1).fit(train_vec, y_train)
print(
    'Linear Discriminant Analysis Score: ',
    lda.score(val_vec, y_validation),
    '-----------------------------------',
    sep='\n',
)
# Recorded validation accuracy -- DBOW: 0.5799, DMM: 0.5777

In [None]:
# Combined DBOW + DMM feature matrices for the train / validation / test splits.
train_vec_combo, val_vec_combo, test_vec_combo = (
    model_combine(model_dbow, model_dmm, tweets)
    for tweets in (x_train, x_validation, x_test)
)

In [None]:
# Default logistic regression on the combined vectors; score() is mean validation accuracy.
lr = LogisticRegression().fit(train_vec_combo, y_train)
print(
    'Logistic Regression Score: ',
    lr.score(val_vec_combo, y_validation),
    '-----------------------------------',
    sep='\n',
)
# Recorded validation accuracy -- DBOW + DMM combined: 0.5897

In [None]:
# Same cosine-similarity scoring as for the single models, on the combined vectors
# (200 is presumably the combined vector size -- confirm against the helpers).
posavg, negavg = average_vectors(train_vec_combo, y_train, 200)
cos_similarity = cosine_sim(posavg, negavg, val_vec_combo, y_validation, 200)
print('Cosine Similarity Score: ', cos_similarity, '-----------------------------------', sep='\n')
# Recorded validation accuracy -- DBOW + DMM combined: 0.5809

In [None]:
# Linear discriminant analysis on the combined vectors; score() is mean validation accuracy.
lda = LinearDiscriminantAnalysis(n_components=1).fit(train_vec_combo, y_train)
print(
    'Linear Discriminant Analysis Score: ',
    lda.score(val_vec_combo, y_validation),
    '-----------------------------------',
    sep='\n',
)
# Recorded validation accuracy -- DBOW + DMM combined: 0.591