In [16]:
from gensim.utils import simple_preprocess
from gensim.parsing.porter import PorterStemmer
from gensim.models import Word2Vec
import numpy as np
import pandas as pd

def word_embedding(
        df: pd.DataFrame, 
        vector_size=1000, 
        w2v_epochs = 30,
        aggregate = "mean",
        colname = "text"
    ):
    word_vector = []
    text_col = df[colname]
    tokenized_text = [simple_preprocess(line, deacc=True) for line in text_col]
    porter_stemmer = PorterStemmer()
    stemmed_tokens = [[porter_stemmer.stem(word) for word in tokens] for tokens in tokenized_text]
    
    w2v_model = Word2Vec(sentences=stemmed_tokens, vector_size=vector_size, window=5, min_count=1, workers=4, sg=1)
    # below is added to Mike's version

    w2v_model.build_vocab(stemmed_tokens)
    w2v_model.train(
        stemmed_tokens, 
        total_examples=len(stemmed_tokens), 
        epochs=w2v_epochs
    )

    # above is added to Mike's version
    
    for index, row in enumerate(stemmed_tokens):
        model_vector = np.zeros((vector_size, len(row)))
        for tok_id, token in enumerate(row):
            if token in w2v_model.wv:
                model_vector[:, tok_id] = w2v_model.wv[token]
        
        if len(stemmed_tokens[index]) == 0:
            word_vector.append([0]*vector_size)
        else:
            mu = np.mean(model_vector, axis = 1)
            M = np.max(model_vector, axis = 1)
            if aggregate == "mean":
                word_vector.append(mu)
            elif aggregate == "max":
                word_vector.append(M)
            elif aggregate == "mean_max":
                word_vector.append(M + mu)
    
    return pd.DataFrame(word_vector, index=None)

In [17]:
from sklearn.naive_bayes import BernoulliNB, GaussianNB
import numpy as np
import pandas as pd
import my_globals
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA
from utils import get_sub_featured_datasets, get_sub_dataset, get_entire_dataset
from preprocessing import cleaning, preprocess_pipeline

data = get_sub_dataset(size = 30000, random_seed=4)
# data = get_entire_dataset()
data = cleaning(data)
data["processed_text"] = data["text"].apply(
    lambda s:
    preprocess_pipeline(
        s,
        pipeline = "w2v"
    )
)
data.head(5)



Unnamed: 0,index,target,ids,date,user,text,weekday_Mon,weekday_Tue,weekday_Wed,weekday_Thu,weekday_Fri,weekday_Sat,weekday_Sun,datetime,processed_text
0,1000570,4,1880006855,Thu May 21 23:48:34 PDT 2009,JohnnyEugenio2,Omgosh I put my phone back on the hook so the ...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,2009-05-21 23:48:34,omgosh i put phone back hook battery die
1,991406,4,1835115440,Mon May 18 05:10:48 PDT 2009,BalaSN,leavin ma office,1.0,0.0,0.0,0.0,0.0,0.0,0.0,2009-05-18 05:10:48,leavin office
2,1534391,4,2178760886,Mon Jun 15 08:10:05 PDT 2009,eltorgie,thunder! ... 399/1000 words,1.0,0.0,0.0,0.0,0.0,0.0,0.0,2009-06-15 08:10:05,thunder word
3,1426117,4,2059166942,Sat Jun 06 16:22:57 PDT 2009,naughtymeg,@chasesterling guess its just me and you!!,0.0,0.0,0.0,0.0,0.0,1.0,0.0,2009-06-06 16:22:57,guess
4,120705,0,1833295442,Sun May 17 22:51:09 PDT 2009,Rachel_Butts,@zeneth7 Keen-o! I'm gonna miss you too I'm g...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,2009-05-17 22:51:09,keeno i going miss i glad hear got home okay i...


In [18]:
w2v_df = word_embedding(data, vector_size=1000, w2v_epochs = 30, aggregate="mean", colname = "processed_text")
w2v_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,990,991,992,993,994,995,996,997,998,999
0,0.226939,-0.077200,0.046836,0.027473,-0.174015,-0.039666,-0.022925,-0.000811,-0.063925,-0.015890,...,-0.031321,0.003946,0.112615,0.034528,0.100905,0.115833,-0.158789,-0.056987,-0.090856,-0.008623
1,0.150168,0.035708,0.033201,0.116680,0.019111,0.012736,-0.060394,0.130485,-0.045609,0.044399,...,0.115715,0.030940,0.126036,-0.031151,0.171666,0.156100,-0.292001,-0.069712,0.036156,-0.168498
2,0.321827,0.137617,0.054271,-0.053929,-0.148799,0.123119,-0.022908,-0.106843,-0.114478,0.139248,...,0.045055,0.070155,0.196001,0.019915,-0.017394,0.214227,-0.096710,-0.002220,0.189984,-0.122219
3,0.198427,0.075223,-0.147880,0.157327,-0.137368,0.004270,0.334117,0.037157,0.169680,0.279199,...,0.223727,-0.025341,0.391438,-0.002685,0.422625,0.203378,-0.300448,-0.249923,0.150071,0.120149
4,0.244343,0.148029,0.054895,0.098016,0.034683,0.021334,0.052525,0.080901,-0.104851,0.115776,...,-0.111992,0.046862,0.147491,-0.071254,0.135095,0.007895,-0.038765,-0.084445,0.079592,-0.040066
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
29995,0.138388,0.165650,0.018332,-0.017240,-0.037595,-0.093409,-0.026791,0.096395,-0.073940,0.025992,...,-0.085586,-0.070778,0.156444,-0.049311,0.114706,-0.055475,-0.116208,-0.088952,-0.091209,0.102135
29996,0.142648,0.114404,0.005378,0.223302,-0.078325,0.079454,-0.090101,0.113498,-0.236704,0.137507,...,-0.027308,0.024873,0.150598,-0.169766,0.058294,-0.060453,-0.122161,0.069476,0.011877,-0.305304
29997,0.110325,0.104602,0.068552,0.099350,0.014362,0.009816,0.210176,0.018738,-0.025489,0.065739,...,0.001471,-0.213453,0.229559,-0.069741,0.148155,0.106943,-0.094603,-0.114717,0.147705,-0.137738
29998,0.074069,0.008730,0.068054,-0.090557,0.033178,-0.020772,-0.149306,0.080679,-0.082706,0.002611,...,-0.061993,-0.174772,0.144231,-0.062404,0.017939,0.081912,-0.094581,-0.036566,0.052642,-0.048333


## Testing whether W2V can be used with classical models

In [19]:
import feature_engineering as fe

w2v_df["exclaim_freq"] = data["text"].apply(fe.exclaim_freq)
w2v_df["mention_count"] = data["text"].apply(fe.mention_count)
w2v_df["cap_freq"] = data["text"].apply(fe.cap_freq)

In [20]:
XX = w2v_df
yy = data[["target"]]
X_train, X_test, y_train, y_test = train_test_split(XX, yy, test_size=0.2)

In [21]:
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

bnb = BernoulliNB()
X_input = X_train
bnb.fit(X_input, y_train)
y_pred = bnb.predict(
    X_test
)

def assess(y_true, y_pred):
    print("confusion matrix:")
    print(confusion_matrix(y_true, y_pred))
    print()
    print("accuracy_score:")
    print(accuracy_score(y_true, y_pred))
    print()
    print("classification report:")
    print(classification_report(y_true,y_pred))
    print()

assess(y_test, y_pred)

  y = column_or_1d(y, warn=True)


confusion matrix:
[[2142  796]
 [1022 2040]]

accuracy_score:
0.697

classification report:
              precision    recall  f1-score   support

           0       0.68      0.73      0.70      2938
           4       0.72      0.67      0.69      3062

    accuracy                           0.70      6000
   macro avg       0.70      0.70      0.70      6000
weighted avg       0.70      0.70      0.70      6000




