In [28]:
from sklearn.naive_bayes import BernoulliNB, GaussianNB
import numpy as np
import pandas as pd
import my_globals
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA
from utils import get_sub_featured_datasets, get_sub_dataset, get_entire_dataset
from preprocessing import cleaning, preprocess_pipeline
from w2v import word_embedding

data = get_sub_dataset(size = 30000, random_seed=42)
# data = get_entire_dataset()
data = cleaning(data)
data["processed_text"] = data["text"].apply(
    lambda s:
    preprocess_pipeline(
        s,
        pipeline = "w2v"
    )
)
data.head(5)



Unnamed: 0,index,target,ids,date,user,text,weekday_Mon,weekday_Tue,weekday_Wed,weekday_Thu,weekday_Fri,weekday_Sat,weekday_Sun,datetime,processed_text
0,121958,0,1833617173,Sun May 17 23:52:31 PDT 2009,dindahh,@ home studying for maths wooot ! im so going ...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,2009-05-17 23:52:31,home studying for math wooot ! i am so going t...
1,671155,0,2246780174,Fri Jun 19 18:06:46 PDT 2009,MizSadittyFancy,Pickin up @misstinayao waitin on @sadittysash ...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,2009-06-19 18:06:46,pickin up waitin on 2 hurry odeeee missed dem ...
2,131932,0,1835639354,Mon May 18 06:26:21 PDT 2009,lordmuttley,@ProudGamerTweet I rather average 32370,1.0,0.0,0.0,0.0,0.0,0.0,0.0,2009-05-18 06:26:21,i rather average 32370
3,1414414,4,2057029784,Sat Jun 06 12:14:24 PDT 2009,beeluz,@officialnjonas Good luck with that,0.0,0.0,0.0,0.0,0.0,1.0,0.0,2009-06-06 12:14:24,good luck with that
4,259178,0,1985361990,Sun May 31 16:57:39 PDT 2009,lutheasalom,this song's middle change just doesn't want to...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,2009-05-31 16:57:39,this song 's middle change just doe not want t...


In [30]:
w2v_df = word_embedding(
    data, 
    vector_size=5000, 
    w2v_epochs = 30, 
    aggregate="l3", 
    colname = "processed_text"
)
w2v_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,4990,4991,4992,4993,4994,4995,4996,4997,4998,4999
0,0.000438,0.000637,-0.000348,0.000308,-0.004065,0.000010,-9.817567e-05,-1.821332e-04,-0.000635,0.001278,...,0.000239,-0.001441,0.000158,0.000009,1.153380e-04,-0.000091,-0.001350,-0.001103,-0.000291,-0.000679
1,0.000593,0.000392,-0.000066,0.000837,-0.000160,-0.000205,1.794127e-03,2.467373e-04,-0.000017,0.000160,...,0.000015,-0.001462,-0.000536,0.000465,5.846696e-05,0.000202,-0.001143,-0.000520,-0.000304,-0.000015
2,0.000105,0.000668,0.000120,0.000024,-0.001872,0.000022,4.061656e-07,-2.774326e-07,-0.000010,0.000529,...,0.000819,-0.003617,0.000115,-0.000702,3.856506e-04,-0.003878,-0.000358,-0.000185,-0.000254,0.000736
3,-0.000097,-0.000945,0.001211,0.000051,-0.000161,-0.000088,-1.121601e-03,-1.124408e-03,0.000102,0.001526,...,0.000593,-0.000391,-0.001428,0.000440,2.661788e-04,-0.000415,-0.000557,-0.002171,0.000432,-0.000826
4,0.000869,0.000622,0.000166,0.000730,-0.002046,-0.000097,1.466231e-03,5.525903e-04,-0.000146,0.000725,...,0.000029,-0.000696,-0.000584,-0.000031,6.752872e-04,-0.000862,-0.000359,-0.000437,-0.000130,-0.000635
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
29995,-0.000263,0.001827,-0.000432,-0.000046,-0.000563,-0.002653,1.430265e-04,2.249070e-05,0.000103,0.000239,...,0.001687,-0.000527,-0.001882,0.001672,2.703756e-07,0.000348,0.000096,-0.003047,-0.000902,0.000624
29996,0.001612,-0.000167,0.000134,-0.000176,-0.000235,-0.000027,-1.718638e-04,-1.533485e-04,-0.000369,0.000105,...,0.000222,-0.000967,-0.000410,-0.000047,1.037867e-04,0.000065,-0.000389,-0.000701,-0.000294,-0.000518
29997,0.000677,0.002032,-0.000053,0.000399,-0.000672,-0.000187,1.360294e-03,8.800548e-04,0.000301,0.000397,...,0.000220,-0.001804,-0.000480,-0.000052,-5.072658e-04,0.000200,-0.001025,-0.000451,-0.000086,0.000062
29998,0.000231,0.000447,0.000026,-0.001370,-0.000357,-0.001189,4.607260e-03,2.404717e-04,0.000007,0.003395,...,-0.000195,-0.000111,-0.000135,0.000373,-3.283995e-05,0.000091,-0.000930,-0.000048,0.000492,0.000100


## Testing whether W2V can be used with classical models

In [31]:
import feature_engineering as fe

w2v_df["exclaim_freq"] = data["text"].apply(fe.exclaim_freq)
w2v_df["mention_count"] = data["text"].apply(fe.mention_count)
w2v_df["cap_freq"] = data["text"].apply(fe.cap_freq)

In [32]:
XX = w2v_df
yy = data[["target"]]
X_train, X_test, y_train, y_test = train_test_split(XX, yy, test_size=0.2)

In [33]:
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

bnb = BernoulliNB()
X_input = X_train
bnb.fit(X_input, y_train)
y_pred = bnb.predict(
    X_test
)

def assess(y_true, y_pred):
    print("confusion matrix:")
    print(confusion_matrix(y_true, y_pred))
    print()
    print("accuracy_score:")
    print(accuracy_score(y_true, y_pred))
    print()
    print("classification report:")
    print(classification_report(y_true,y_pred))
    print()

assess(y_test, y_pred)

  y = column_or_1d(y, warn=True)


confusion matrix:
[[2387  659]
 [1072 1882]]

accuracy_score:
0.7115

classification report:
              precision    recall  f1-score   support

           0       0.69      0.78      0.73      3046
           4       0.74      0.64      0.68      2954

    accuracy                           0.71      6000
   macro avg       0.72      0.71      0.71      6000
weighted avg       0.71      0.71      0.71      6000


