In [2]:
import pandas as pd
import numpy as np
data = pd.read_csv("a_affirmative.csv")
data = data.drop(data.columns[0], axis=1)

In [3]:
from sklearn.model_selection import train_test_split
train, test = train_test_split(data, test_size=0.25)
X = train.drop(train.columns[-1], axis=1)
y = train[train.columns[-1]]
y_true = test[test.columns[-1]]
X_test = test.drop(train.columns[-1], axis=1)

In [4]:
def print_stats(model):
    print("Statistics on Testing")
    y_pred = model.predict(X_test)
    cm = confusion_matrix(y_true, y_pred)
    tn, fp, fn, tp = cm.ravel()
    print("True Negative:", tn, "False Positives:", fp, "False Negatives:", fn, "True Positives:", tp)
    accuracy = model.score(X_test, y_true)
    error = 1 - accuracy
    print("Accuracy:", accuracy, "Error:", error)
    precision = tp / (tp + fp)
    recall = tp / (tp + fn)
    f1 = 2 * (precision * recall) / (precision + recall)
    print("Precision:", precision, "Recall:", recall, "F1:", f1)
    print("Statistics on Training")
    y_pred = model.predict(X)
    cm = confusion_matrix(y, y_pred)
    tn, fp, fn, tp = cm.ravel()
    print("True Negative:", tn, "False Positives:", fp, "False Negatives:", fn, "True Positives:", tp)
    accuracy = model.score(X, y)
    error = 1 - accuracy
    print("Accuracy:", accuracy, "Error:", error)
    precision = tp / (tp + fp)
    recall = tp / (tp + fn)
    f1 = 2 * (precision * recall) / (precision + recall)
    print("Precision:", precision, "Recall:", recall, "F1:", f1)

In [5]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix
model = RandomForestClassifier()
model.fit(X,y)
print_stats(model)

Statistics on Testing
True Negative: 165 False Positives: 4 False Negatives: 17 True Positives: 80
Accuracy: 0.9210526315789473 Error: 0.07894736842105265
Precision: 0.9523809523809523 Recall: 0.8247422680412371 F1: 0.883977900552486
Statistics on Training
True Negative: 478 False Positives: 1 False Negatives: 3 True Positives: 314
Accuracy: 0.9949748743718593 Error: 0.005025125628140725
Precision: 0.9968253968253968 Recall: 0.9905362776025236 F1: 0.9936708860759493




In [6]:
model2 = RandomForestClassifier(n_estimators=50)
model2.fit(X,y)
print_stats(model2)

Statistics on Testing
True Negative: 163 False Positives: 6 False Negatives: 13 True Positives: 84
Accuracy: 0.9285714285714286 Error: 0.0714285714285714
Precision: 0.9333333333333333 Recall: 0.865979381443299 F1: 0.8983957219251337
Statistics on Training
True Negative: 479 False Positives: 0 False Negatives: 0 True Positives: 317
Accuracy: 1.0 Error: 0.0
Precision: 1.0 Recall: 1.0 F1: 1.0


In [7]:
importances = model.feature_importances_
indices = np.argsort(importances)[::-1]
col = list(X)
for f in range(X.shape[1]):
    print("%d. feature %s (%f)" % (f + 1, col[indices[f]], importances[indices[f]]))

1. feature 56y (0.079645)
2. feature 51y (0.044687)
3. feature 64y (0.044446)
4. feature 65y (0.043404)
5. feature 46y (0.042312)
6. feature 37y (0.042229)
7. feature 5y (0.039268)
8. feature 44y (0.038877)
9. feature 45y (0.038641)
10. feature 48z (0.015025)
11. feature 43y (0.013527)
12. feature 73x (0.011465)
13. feature 60z (0.010808)
14. feature 98y (0.010480)
15. feature 35x (0.009939)
16. feature 8y (0.009609)
17. feature 76x (0.009079)
18. feature 30y (0.008910)
19. feature 0y (0.008857)
20. feature 22y (0.008350)
21. feature 50y (0.007831)
22. feature 3z (0.007378)
23. feature 42y (0.007099)
24. feature 75y (0.007084)
25. feature 59y (0.007067)
26. feature 82y (0.006976)
27. feature 68x (0.006865)
28. feature 76y (0.006756)
29. feature 57y (0.006749)
30. feature 74z (0.006724)
31. feature 55y (0.006434)
32. feature 72x (0.006159)
33. feature 19z (0.005791)
34. feature 79z (0.005738)
35. feature 91y (0.005467)
36. feature 81y (0.005357)
37. feature 34x (0.005133)
38. feature 60

In [8]:
from sklearn.model_selection import cross_val_score
model2 = RandomForestClassifier(n_estimators=50)
cvdata = data.sample(frac=1)
x_data = cvdata.drop(cvdata.columns[-1], axis=1)
y_data = cvdata[cvdata.columns[-1]]
scores = cross_val_score(model2, x_data, y_data, cv=10)
print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

Accuracy: 0.92 (+/- 0.04)


# SVM 

In [6]:
from sklearn.svm import SVC  
svcModel = SVC(kernel='linear')  
svcModel.fit(X, y)
print_stats(svcModel)

Statistics on Testing
True Negative: 143 False Positives: 14 False Negatives: 25 True Positives: 84
Accuracy: 0.8533834586466166 Error: 0.14661654135338342
Precision: 0.8571428571428571 Recall: 0.7706422018348624 F1: 0.8115942028985508
Statistics on Training
True Negative: 456 False Positives: 35 False Negatives: 45 True Positives: 260
Accuracy: 0.8994974874371859 Error: 0.10050251256281406
Precision: 0.8813559322033898 Recall: 0.8524590163934426 F1: 0.8666666666666666


# Regular Decision Tree

In [9]:
from sklearn.tree import DecisionTreeClassifier
model = DecisionTreeClassifier()
model.fit(X,y)
print_stats(model)

Statistics on Testing
True Negative: 150 False Positives: 19 False Negatives: 17 True Positives: 80
Accuracy: 0.8646616541353384 Error: 0.13533834586466165
Precision: 0.8080808080808081 Recall: 0.8247422680412371 F1: 0.8163265306122448
Statistics on Training
True Negative: 479 False Positives: 0 False Negatives: 0 True Positives: 317
Accuracy: 1.0 Error: 0.0
Precision: 1.0 Recall: 1.0 F1: 1.0


In [10]:
import warnings;warnings.filterwarnings('ignore')
def train(file):
    data = pd.read_csv(file)
    data = data.drop(data.columns[0], axis=1)
    train, test = train_test_split(data, test_size=0.2)
    X = train.drop(train.columns[-1], axis=1)
    y = train[train.columns[-1]]
    y_true = test[test.columns[-1]]
    X_test = test.drop(train.columns[-1], axis=1)
    model = RandomForestClassifier()
    cvdata = data.sample(frac=1)
    scores = cross_val_score(model, x_data, y_data, cv=10)
    model.fit(X,y)
    return (model, model.score(X_test, y_true), scores)
files = ["affirmative", "conditional", 
 "doubt_question", "emphasis", "negative","relative",
"topics", "wh_question", "yn_question"]
models = []
for file in files:
    m, score, cvscores = train("a_" + file + ".csv")
    models.append(m)
    print(score)
    print("Accuracy: %0.2f (+/- %0.2f)" % (cvscores.mean(), cvscores.std() * 2))

0.8873239436619719
Accuracy: 0.91 (+/- 0.05)
0.9554973821989529
Accuracy: 0.90 (+/- 0.06)
0.9125475285171103
Accuracy: 0.91 (+/- 0.06)
0.9644128113879004
Accuracy: 0.90 (+/- 0.06)
0.9288888888888889
Accuracy: 0.91 (+/- 0.05)
0.9742489270386266
Accuracy: 0.91 (+/- 0.04)
0.9694444444444444
Accuracy: 0.90 (+/- 0.05)
0.9573643410852714
Accuracy: 0.91 (+/- 0.04)
0.960431654676259
Accuracy: 0.90 (+/- 0.04)


# Binary Classifier built on Person A tested on Person B

In [11]:
for i in range(len(files)):
    file = files[i]
    model = models[i]
    forFun = pd.read_csv("b_" + file + ".csv")
    forFun = forFun.drop(forFun.columns[0], axis=1)
    X = forFun.drop(forFun.columns[-1], axis=1)
    y_true = forFun[forFun.columns[-1]]
    print(model.score(X, y_true))

0.49162011173184356
0.3136676499508358
0.6820307281229125
0.3950892857142857
0.3419721871049305
0.29044117647058826
0.3293150684931507
0.5865963855421686
0.4108170310701956


In [13]:
data = pd.read_csv("a_affirmative.csv")
data = data.drop(data.columns[0], axis=1)
y = data[data.columns[-1]].iloc[1:]
funf = data.drop(data.columns[-1], axis=1).diff().iloc[1:]
funf = pd.concat([funf, y], axis=1)
train, test = train_test_split(funf, test_size=0.25)
X = train.drop(train.columns[-1], axis=1)
y = train[train.columns[-1]]
y_true = test[test.columns[-1]]
X_test = test.drop(train.columns[-1], axis=1)

In [14]:
model = RandomForestClassifier(n_estimators=100)
model.fit(X,y)
print_stats(model)

Statistics on Testing
True Negative: 141 False Positives: 24 False Negatives: 26 True Positives: 75
Accuracy: 0.8120300751879699 Error: 0.18796992481203012
Precision: 0.7575757575757576 Recall: 0.7425742574257426 F1: 0.75
Statistics on Training
True Negative: 482 False Positives: 0 False Negatives: 0 True Positives: 313
Accuracy: 1.0 Error: 0.0
Precision: 1.0 Recall: 1.0 F1: 1.0


In [15]:
datab = pd.read_csv("b_affirmative.csv")
datab = datab.drop(datab.columns[0], axis=1)
y = datab[datab.columns[-1]].iloc[1:]
X = datab.drop(datab.columns[-1], axis=1).diff().iloc[1:]
print(model.score(X, y))
y_pred = model.predict(X)
cm = confusion_matrix(y, y_pred)
tn, fp, fn, tp = cm.ravel()
print("True Negative:", tn, "False Positives:", fp, "False Negatives:", fn, "True Positives:", tp)
precision = tp / (tp + fp)
recall = tp / (tp + fn)
f1 = 2 * (precision * recall) / (precision + recall)
print("Precision:", precision, "Recall:", recall, "F1:", f1)
print("Statistics on Training")
y_pred = model.predict(X)

0.706430568499534
True Negative: 479 False Positives: 66 False Negatives: 249 True Positives: 279
Precision: 0.808695652173913 Recall: 0.5284090909090909 F1: 0.6391752577319587
Statistics on Training


# Neural Nets for Binary

In [16]:
train, test = train_test_split(data, test_size=0.15)
X = train.drop(train.columns[-1], axis=1)
y = train[train.columns[-1]]
y_true = test[test.columns[-1]]
X_test = test.drop(train.columns[-1], axis=1)

In [17]:
import keras
model = keras.Sequential([
    keras.layers.Dense(300, activation="sigmoid"),
    keras.layers.Dense(200, activation="relu"),
    keras.layers.Dense(1, activation="sigmoid")
])
model.compile(loss="binary_crossentropy",
             optimizer="sgd", metrics=["accuracy"])
model.fit(X.values, y.values, epochs=8)

Using TensorFlow backend.


Epoch 1/8
Epoch 2/8
Epoch 3/8
Epoch 4/8
Epoch 5/8
Epoch 6/8
Epoch 7/8
Epoch 8/8


<keras.callbacks.History at 0x119653e10>

# Combining Data for Multiclass

In [31]:
l= []
def efg(file):
    data = pd.read_csv("a_" + file + ".csv")
    data = data.drop(data.columns[0], axis=1)
    t = data[data[data.columns[-1]] == 1]
    t[t.columns[-1]] = file
    return t
for file in files:
    count = efg(file)
    l.append(count)

In [39]:
data = pd.concat(l)
train, test = train_test_split(data, test_size=0.2)
X = train.drop(train.columns[-1], axis=1)
y = train[train.columns[-1]]
y_true = test[test.columns[-1]]
X_test = test.drop(train.columns[-1], axis=1)
model = RandomForestClassifier()
scores = cross_val_score(model, x_data, y_data, cv=10)
print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))
model.fit(X,y)
print(model.score(X_test, y_true))
print(model.score(X,y))

Accuracy: 0.90 (+/- 0.06)
0.9730941704035875
0.9980359147025814


In [40]:
y_pred = model.predict(X_test)
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score
print(recall_score(y_true, y_pred, average="weighted"))
print(precision_score(y_true, y_pred, average="weighted"))

0.9730941704035875
0.9734927669050176


In [43]:
importances = model.feature_importances_
indices = np.argsort(importances)[::-1]
col = list(X)
for f in range(X.shape[1]):
    print("%d. feature %s (%f)" % (f + 1, col[indices[f]], importances[indices[f]]))

1. feature 45y (0.138088)
2. feature 76y (0.118896)
3. feature 68x (0.108831)
4. feature 44z (0.090481)
5. feature 77z (0.078817)
6. feature 38x (0.074893)
7. feature 41y (0.045530)
8. feature 80z (0.045471)
9. feature 99y (0.025123)
10. feature 92x (0.022713)
11. feature 82x (0.021149)
12. feature 98x (0.020614)
13. feature 81x (0.017445)
14. feature 93x (0.014384)
15. feature 76z (0.009687)
16. feature 68y (0.009622)
17. feature 83x (0.009506)
18. feature 61z (0.008281)
19. feature 18x (0.008225)
20. feature 48z (0.006732)
21. feature 74z (0.004608)
22. feature 99x (0.004502)
23. feature 70z (0.004102)
24. feature 58y (0.003753)
25. feature 73z (0.003689)
26. feature 50x (0.003659)
27. feature 72y (0.003164)
28. feature 60z (0.003148)
29. feature 57z (0.003137)
30. feature 60x (0.003112)
31. feature 39x (0.003031)
32. feature 85y (0.003000)
33. feature 86y (0.002991)
34. feature 64x (0.002981)
35. feature 97x (0.002933)
36. feature 45z (0.002885)
37. feature 95x (0.002770)
38. featur

In [41]:
model = DecisionTreeClassifier()
scores = cross_val_score(model2, x_data, y_data, cv=10)
print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))
model.fit(X,y)
print(model.score(X_test, y_true))

Accuracy: 0.92 (+/- 0.06)
0.9349775784753364


In [42]:
y_pred = model.predict(X_test)
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score
print(recall_score(y_true, y_pred, average="weighted"))
print(precision_score(y_true, y_pred, average="weighted"))

0.9349775784753364
0.9381452528537729


In [18]:
def convert(y):
    files = ["affirmative", "conditional", 
 "doubt_question", "emphasis", "negative","relative",
"topics", "wh_question", "yn_question"]
    mapped = {}
    for i in range(len(files)):
        mapped[files[i]] = i
    value = [0 for i in range(len(files))]
    ans = []
    for i in y:
        hot = value[:]
        hot[mapped[i]] = 1
        ans.append(hot)
    return np.asarray(ans)
dummy_y = convert(y)

In [19]:
import keras
model = keras.Sequential([
    keras.layers.Dense(300, activation="sigmoid"),
    keras.layers.Dense(200, activation="tanh"),
    keras.layers.Dense(100, activation="relu"),
    keras.layers.Dense(9, activation="softmax")
])
model.compile(loss="categorical_crossentropy",
             optimizer="sgd", metrics=["accuracy"])
model.fit(X.values, np.asarray(dummy_y), epochs=5)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x1193ed198>

# Testing model against other person

In [20]:
def efg(person, file):
    data = pd.read_csv(person + "_" + file + ".csv")
    data = data.drop(data.columns[0], axis=1)
    y = data[data.columns[-1]].iloc[1:]
    funf = data.drop(data.columns[-1], axis=1).diff().iloc[1:]
    funf = pd.concat([funf, y], axis=1)
    t = funf[funf[funf.columns[-1]] == 1]
    t[t.columns[-1]] = file
    return t

In [21]:
l= []
for file in files:
    count = efg("a", file)
    l.append(count)
data_a = pd.concat(l)
l= []
for file in files:
    count = efg("b", file)
    l.append(count)
data_b = pd.concat(l)

In [22]:
train, test = train_test_split(data_a, test_size=0.25)
X = train.drop(train.columns[-1], axis=1)
y = train[train.columns[-1]]
y_true = test[test.columns[-1]]
X_test = test.drop(train.columns[-1], axis=1)

In [23]:
model = RandomForestClassifier(n_estimators=100)
model.fit(X,y)
print(model.score(X_test, y_true))

0.533213644524237


In [24]:
y_b = data_b[data_b.columns[-1]].iloc[1:]
X_b = data_b.drop(data_b.columns[-1], axis=1).diff().iloc[1:]
model.score(X_b, y_b)

0.16955719557195573