In [1]:
import warnings
import pickle
import gensim
import numpy as np
from sklearn.linear_model import LogisticRegression



In [2]:
warnings.filterwarnings('ignore')

### Loading data:
***

In [3]:
# Load the pickled dataset; the stored object unpacks into two items, X and Y.
# The `with` block closes the file on exit, so no explicit close() is needed.
# NOTE(review): pickle.load can execute arbitrary code -- only open pickle
# files that come from a trusted source.
with open('../data/clean-data-nostem.pkl','rb') as fp:
    X,Y = pickle.load(fp)

### Tokenizing:
***

In [4]:
# Whitespace-tokenize every document: each entry of X becomes a list of tokens.
# NOTE(review): this rebinds X, so re-running the cell on already-tokenized
# data fails (lists have no .split()).
X = [document.split() for document in X]

### Loading pre-trained model:
***

In [5]:
# Load the pre-trained word vectors stored in binary word2vec format.
# Presumably 300-dimensional (per the file name) -- the cells below rely on that.
model = gensim.models.KeyedVectors.load_word2vec_format(
    '../data/GoogleNews-vectors-negative300.bin',
    binary=True,
)

### Creating mean sentence vectors (average of the word vectors):
***

In [7]:
# Build one 300-dimensional vector per document: the arithmetic mean of the
# pre-trained vectors of its in-vocabulary words.
#
# Documents with no in-vocabulary word (including empty documents) keep an
# all-zero vector. The previous code divided by a zero count there, which
# produced NaN rows that later reach the classifier.
sen_vector = np.zeros((len(X), 300))
for index, tokens in enumerate(X):
    counter = 0
    vector = np.zeros(300)
    for word in tokens:
        try:
            vector += model[word]
        except KeyError:
            # Word is missing from the pre-trained vocabulary: skip it.
            continue
        counter += 1
    if counter:
        sen_vector[index] = vector / counter

In [8]:
# Index separating the first 80% of the documents (used for training below)
# from the remaining 20% (used for evaluation). The split is positional.
l = int(0.8 * len(X))

In [9]:
# Maps a human-readable classifier name to its predictions on the held-out
# part of the data; consumed by the ROC plotting cell.
pred = dict()

### Logistic Regression (L1):
***

In [10]:
# L1-regularised logistic regression with balanced class weights.
# The solver is set explicitly: the L1 penalty is not supported by 'lbfgs'
# (the default solver since scikit-learn 0.22), so relying on the default
# raises at fit time. 'liblinear' supports L1 and was the earlier default.
# n_jobs is omitted because it has no effect with the 'liblinear' solver.
lr = LogisticRegression(
    penalty='l1',
    C=1,
    solver='liblinear',
    class_weight='balanced',
)

In [None]:
# Train on the first `l` sentence vectors and their labels.
lr.fit(X=sen_vector[:l], y=Y[:l])

In [None]:
# Held-out sentence vectors (everything from index `l` onward).
# Basic slicing returns a view, so in-place edits to `flag` also change
# `sen_vector`.
flag = sen_vector[l:len(sen_vector)]

### Removing NaN values before predicting:
***

In [None]:
# Replace every NaN entry of the held-out matrix with the sentinel 1e5, in
# place. A boolean mask does this in one vectorized step instead of a Python
# loop over every element, and it no longer hard-codes the column count.
# NOTE(review): 1e5 is far outside the usual range of embedding values --
# confirm this fill value is intended.
flag[np.isnan(flag)] = 1e5

In [None]:
# Predict labels for the held-out vectors and register them under the
# classifier's display name for the ROC comparison.
test_predictions = lr.predict(flag)
pred['Logistic L1 with pre-trained w2v'] = test_predictions

In [None]:
from sklearn.metrics import roc_curve,auc
import matplotlib.pyplot as plt

# Draw one ROC curve per entry of `pred`, evaluated against the held-out
# labels Y[l:], together with the chance diagonal.
# NOTE(review): `pred` holds the output of predict(); if those are hard class
# labels, each curve has a single operating point -- scores from
# predict_proba / decision_function would give a full curve.
plt.figure(figsize=(12,6))

colors = ['b', 'g', 'y', 'm', 'k']

# The loop variable is deliberately not called `model`, so the word2vec
# `model` loaded earlier is not overwritten. Colours wrap around so that more
# than len(colors) classifiers do not raise an IndexError.
for position, (classifier_name, predicted) in enumerate(pred.items()):
    false_positive_rate, true_positive_rate, thresholds = roc_curve(Y[l:], predicted)
    roc_auc = auc(false_positive_rate, true_positive_rate)
    plt.plot(
        false_positive_rate,
        true_positive_rate,
        colors[position % len(colors)],
        label='%s: AUC %0.2f'% (classifier_name,roc_auc),
    )

plt.title('Classifiers comparaison with ROC using W2V-non stemmed')
plt.legend(loc='lower right')
plt.plot([0,1],[0,1],'r--')  # chance-level reference line
plt.xlim([-0.1,1.2])
plt.ylim([-0.1,1.2])
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
plt.show()