## Building a few machine learning models

We will use the extracted features from the othernote book to build a few machine learning models and evaluate their performance.

In [2]:
import pickle
import numpy as np
import pandas as pd
from IPython.display import display

In [3]:
with open('pickles/extracted_features', 'rb') as f:
    extracted_features = pickle.load(f)

In [27]:
display(extracted_features.head())

extracted_features.fillna(0, inplace=True)

extracted_features.isnull().any(axis=1)

Unnamed: 0,char_length,label,tf-idf_wordshare,word_length,wordshare
0,9,0,0.946807,2,7.466667
1,37,0,0.475718,5,6.285714
2,14,0,0.322098,4,3.1875
3,15,0,0.0,6,1.05
4,37,0,0.233401,8,4.2


0         False
1         False
2         False
3         False
4         False
5         False
6         False
7         False
8         False
9         False
10        False
11        False
12        False
13        False
14        False
15        False
16        False
17        False
18        False
19        False
20        False
21        False
22        False
23        False
24        False
25        False
26        False
27        False
28        False
29        False
          ...  
404260    False
404261    False
404262    False
404263    False
404264    False
404265    False
404266    False
404267    False
404268    False
404269    False
404270    False
404271    False
404272    False
404273    False
404274    False
404275    False
404276    False
404277    False
404278    False
404279    False
404280    False
404281    False
404282    False
404283    False
404284    False
404285    False
404286    False
404287    False
404288    False
404289    False
Length: 404290, dtype: b

In [28]:
from sklearn.model_selection import train_test_split

full_labels = np.array(extracted_features['label'].values)

print(full_labels[:5])

[0 0 0 0 0]


In [29]:
full_features = extracted_features[['char_length', 'tf-idf_wordshare', 'word_length', 'wordshare']].values

print(full_features[:20])

[[  9.           0.94680699   2.           7.46666667]
 [ 37.           0.4757185    5.           6.28571429]
 [ 14.           0.32209849   4.           3.1875    ]
 [ 15.           0.           6.           1.05      ]
 [ 37.           0.23340095   8.           4.2       ]
 [  4.           0.57153494   1.           5.2       ]
 [ 43.           0.           7.           1.07142857]
 [ 11.           0.69316964   2.           3.3       ]
 [  6.           1.           4.           4.30769231]
 [ 11.           0.58657528   3.           5.25      ]
 [ 59.           0.          10.           1.05882353]
 [  5.           0.62918465   1.           4.30769231]
 [  3.           1.           1.           4.33333333]
 [  5.           0.90029772   1.           5.45454545]
 [  1.           0.90504189   0.          12.42857143]
 [  2.           0.33907381   2.           4.18181818]
 [  1.           0.62074718   0.           3.375     ]
 [ 17.           0.           4.           1.07692308]
 [  9.    

In [30]:
X_train, X_test, y_train, y_test = train_test_split(full_features, full_labels, test_size=0.2, random_state=42)

print(y_test[:10])
print(X_train.shape)

[0 0 0 1 1 1 0 0 1 1]
(323432, 4)


### Logistic Regression

In [31]:
from sklearn.linear_model import LogisticRegression

# C is known as the regularization strength
logistic = LogisticRegression(C=1e5)
logistic.fit(X_train, y_train)

LogisticRegression(C=100000.0, class_weight=None, dual=False,
          fit_intercept=True, intercept_scaling=1, max_iter=100,
          multi_class='ovr', n_jobs=1, penalty='l2', random_state=None,
          solver='liblinear', tol=0.0001, verbose=0, warm_start=False)

In [32]:
predictions = logistic.predict(X_test)

print(predictions[:8])

[1 0 0 1 0 1 1 0]


In [34]:
### Evaluate accruacy 
from sklearn.metrics import f1_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import fbeta_score

model_recall_score = recall_score(y_test, predictions, average=None)
model_precision_score = precision_score(y_test, predictions, average=None)
model_fbeta_score = fbeta_score(y_test, predictions, average='weighted', beta=1)

print('Recall score:', model_recall_score)
print('Precision score:', model_precision_score)
print('F-beta score:', model_fbeta_score)

Recall score: [ 0.78292621  0.42984528]
Precision score: [ 0.69889828  0.53948303]
F-beta score: 0.641862997896


### Support Vector Machine