In [1]:
import scipy
import pickle
import pandas as pd
from scipy import sparse

## Load Vectorizer

In [2]:
# Vectorizer used for training data
# NOTE: unpickling can execute arbitrary code; only load pickle files that
# come from a trusted source.
# The context manager closes the file handle as soon as loading finishes
# instead of leaving it open until garbage collection.
with open("tfidf_vect.pickle", "rb") as vect_file:
    vect = pickle.load(vect_file)



In [3]:
# Inspect the loaded vectorizer's vocabulary: a dict mapping each term
# (single words and two-word phrases, per the output) to an integer index.
# Note: this displays the entire mapping, which is large.
vect.vocabulary_

{'white': 47284,
 'is': 19802,
 'right': 33453,
 'white is': 47295,
 'is right': 20210,
 'sounds': 36534,
 'like': 22822,
 'job': 21151,
 'for': 13530,
 'the': 39265,
 'sounds like': 36548,
 'job for': 21158,
 'for the': 13800,
 'price': 31347,
 'to': 42998,
 'performance': 30232,
 'wise': 47805,
 'they': 41830,
 'blow': 5447,
 'alienware': 789,
 'out': 29390,
 'of': 27553,
 'water': 46312,
 'price to': 31354,
 'out of': 29427,
 'of the': 28023,
 'the water': 41267,
 'eww': 12321,
 'cartoons': 7010,
 'are': 2709,
 'little': 23119,
 'kids': 21841,
 'are for': 2808,
 'for little': 13677,
 'little kids': 23128,
 'besides': 5035,
 'we': 46391,
 'gotta': 15639,
 'save': 34137,
 'all': 797,
 'that': 38658,
 'money': 25192,
 'more': 25293,
 'productive': 31557,
 'uses': 45258,
 'bombing': 5520,
 'brown': 5899,
 'people': 30033,
 'in': 18849,
 'other': 29259,
 'countries': 8855,
 'we gotta': 46443,
 'all that': 912,
 'that money': 38974,
 'money for': 25202,
 'for more': 13702,
 'more producti

## Load Files

In [4]:
# Sparse TF-IDF matrices stored in .npz format (train first, then test)
train_tf_idf, test_tf_idf = (
    sparse.load_npz(npz_path)
    for npz_path in ("train_tf_idf.npz", "test_tf_idf.npz")
)

# Remaining (non TF-IDF) feature columns, kept as DataFrames
rem_train_feats, rem_test_feats = (
    pd.read_csv(csv_path)
    for csv_path in ('rem_train_feats.csv', 'rem_test_feats.csv')
)

# Targets: the `label` column of each CSV, as an array
y_train, y_test = (
    pd.read_csv(csv_path)["label"].values
    for csv_path in ('y_train.csv', 'y_test.csv')
)

In [5]:
# Preview the first two rows of the extra training feature columns read from CSV.
rem_train_feats.head(2)

Unnamed: 0,quoted_word_count,stopword_count,word_count,weird_caps
0,0,1,3,0
1,0,3,7,0


In [6]:
# Build the model inputs: TF-IDF columns followed by the extra feature columns
from scipy.sparse import hstack

X_train = hstack((train_tf_idf, rem_train_feats))
X_test = hstack((test_tf_idf, rem_test_feats))

In [7]:
# Show (rows, columns) for the train matrix, then the test matrix
for design_mtx in (X_train, X_test):
    print(design_mtx.shape)

(698414, 50004)
(299321, 50004)


## Insert Models Here

In [8]:
from sklearn.linear_model import LogisticRegression

# Logistic regression classifier; verbose=1 makes the solver log its progress
lc = LogisticRegression(
    solver='lbfgs',
    n_jobs=2,
    verbose=1,
)

In [9]:
# Train on the training matrix, then predict labels for the test matrix
# (fit() returns the estimator itself, so the calls can be chained)
y_hat = lc.fit(X_train, y_train).predict(X_test)

[Parallel(n_jobs=2)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done   1 out of   1 | elapsed:   33.1s finished


In [10]:
# Using Trees for prediction
from sklearn.ensemble import ExtraTreesClassifier

# Extra-trees draw their split thresholds at random, so pin random_state to
# make the fitted model (and its predictions) reproducible across re-runs.
clf = ExtraTreesClassifier(n_estimators=1, n_jobs=4, verbose=1, random_state=42)

In [None]:
import time

# Fit the tree model and predict on the test data, timing both steps together.
# perf_counter() is a monotonic, high-resolution clock, so the measured
# interval cannot be skewed by system clock adjustments as time.time() can.
start = time.perf_counter()
print("Model fitting start time...")

clf.fit(X_train, y_train)
y_hat = clf.predict(X_test)  # replaces any y_hat left by an earlier model cell

end = time.perf_counter()
print(f'Model fitting time taken: {end - start}s')

Model fitting start time...


[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.


In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.metrics import (confusion_matrix, mean_squared_error, 
                             classification_report, accuracy_score, 
                             precision_score, recall_score, f1_score, 
                             roc_auc_score)

def evaluation(y, y_hat, title = 'Confusion Matrix'):
    """Print classification scores and draw the confusion matrix as a heatmap.

    Parameters
    ----------
    y : array-like
        True labels.
    y_hat : array-like
        Predicted labels.
    title : str, optional
        Title placed above the heatmap.

    Prints recall, accuracy, precision and F1 (in that order), then shows the
    confusion matrix with actual labels on the y-axis and predicted labels on
    the x-axis. Returns nothing.
    """
    cm = confusion_matrix(y, y_hat)

    # Scores are computed in the same order as before; only printing is looped.
    scores = {
        'Precision: ': precision_score(y, y_hat),
        'Recall: ': recall_score(y, y_hat),
        'Accuracy: ': accuracy_score(y, y_hat),
        'F1: ': f1_score(y, y_hat),
    }
    for caption in ('Recall: ', 'Accuracy: ', 'Precision: ', 'F1: '):
        print(caption, scores[caption])

    # heatmap() draws on the current axes and returns them, so the labels and
    # title can be set on that Axes object directly.
    ax = sns.heatmap(
        cm,
        cmap='PuBu',
        annot=True,
        fmt='g',
        annot_kws={'size': 20},
    )
    ax.set_xlabel('predicted', fontsize=18)
    ax.set_ylabel('actual', fontsize=18)
    ax.set_title(title, fontsize=18)
    plt.show()

# Score the current predictions against the test labels.
# NOTE: y_hat is assigned by both the logistic-regression cell and the
# extra-trees cell, so this evaluates whichever of them ran most recently.
evaluation(y_test, y_hat)