# Binary Classification 

Now that we all the features for all the data/lists extracted from libre help and ubuntu help, let us classify them into procedures and non-procedures.

In [1]:
import pandas as pd
import numpy as np
import pickle
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score, cross_val_predict, train_test_split, StratifiedShuffleSplit
from sklearn.preprocessing import normalize
from sklearn.metrics import accuracy_score, precision_score, recall_score, precision_recall_curve, f1_score

DATA_PATH = 'data'

In [2]:
df = pd.read_csv(os.path.join(DATA_PATH, 'dense_features_train_procedures.csv'), encoding='utf-8')

In [3]:
feat = df[['Sents-No Subject', 'Sents-Starts with Verb', 'Avg Length', 'Gerunds', 'Infinitives']].values
labels = df['Labels']

In [4]:
clf = RandomForestClassifier(n_estimators=100)
N = 50

We will evaluate our model with cross-validation. Since our dataset is class-imbalanced we will use StratifiedKFold. The folds are made by preserving the percentage of samples for each class.

In [5]:
skf = StratifiedShuffleSplit(n_splits=5, random_state=42)
scores = cross_val_score(clf, feat, labels, cv=skf)
scores

array([0.89705882, 0.83823529, 0.80882353, 0.86764706, 0.85294118])

In [8]:
test_df = pd.read_csv(os.path.join(DATA_PATH, 'dense_features_test_procedures.csv'), encoding='utf-8')
X_test = df[['Sents-No Subject', 'Sents-Starts with Verb', 'Avg Length', 'Gerunds', 'Infinitives']].values
y_test = test_df['Labels']

In [9]:
clf.fit(feat, labels)
pickle.dump(clf, open('classify_proc.pkl','wb'), protocol=2)
clf = pickle.load(open('classify_proc.pkl', 'rb'))
y_predict = clf.predict(X_test)
print (y_predict)
test_acc = accuracy_score(y_test, y_predict)
print ('Test accuracy:', test_acc, 'F1 Score:', f1_score(y_test, y_predict))

[1 0 1 0 1 0 0 1 0 1 1 0 1 1 0 1 0 1 0 0 1 1 0 1 1 1 0 1 1 1 1 0 0 0 1 1 1
 0 0 1 0 1 1 0 0 1 1 1 1 0 0 0 1 0 0 1 0 0 1 0 0 1 0 1 1 1 0 0 1 0 1 0 1 1
 1 0 1 1 1 1 0 0 1 1 0 1 0 1 1 1 1 1 1 1 0 1 0 1 1 1 0 0 1 1 0 0 1 1 1 0 0
 1 1 1 1 1 0 1 0 1 1 1 0 1 0 1 0 0 1 0 0 1 0 1 0 1 1 0 0 0 0 1 1 0 0 1 0 0
 1 1 0 1 1 1 0 0 1 0 1 0 1 0 1 1 0 0 0 1 1 0 1 1 1 1 1 1 1 1 1 0 0 1 0 0 1
 0 1 0 1 1 1 1 1 1 1 0 0 0 1 1 1 1 1 1 1 1 1 0 1 1 1 1 0 0 1 1 1 1 1 1 1 0
 0 1 1 1 0 1 1 1 1 1 0 1 0 1 0 0 0 0 1 1 1 1 1 1 1 1 1 1 0 1 1 1 1 0 1 1 1
 1 1 1 0 0 0 1 0 1 1 1 1 0 1 0 1 1 1 0 0 1 1 1 0 1 0 0 1 0 1 1 1 1 0 0 0 1
 0 0 1 0 0 1 1 0 1 1 1 0 0 1 0 0 1 1 1 1 0 1 0 1 1 0 1 1 1 1 1 1 1 1 1 1 0
 0 1 0 1 0 0 0 0 1 1 1 1 1 1 1 0 1 0 1 0 0 0 1 1 0 0 1 0 1 1 1 0 1 0 1 1 1
 1 0 1 0 1 1 0 0 1 1 0 1 1 1 1 1 1 0 1 1 1 1 1 1 0 1 0 0 1 0 0 1 0 1 1 0 0
 1 1 1 0 1 0 1 1 0 0 0 0 1 1 1 1 1 1 1 0 1 0 1 1 1 0 1 1 1 1 0 1 0 1 1 1 1
 0 1 0 1 1 1 0 1 1 1 1 1 0 0 1 1 0 1 1 0 0 1 0 1 1 1 1 0 1 1 1 1 1 1 1 1 1
 1 1 0 1 0 0 1 0 1 1 1 1 