In [10]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import VotingClassifier
from sklearn.metrics import precision_recall_fscore_support
from collections import Counter

In [3]:
train_features = pd.read_csv('../corpus/features/left_features_train.csv', usecols= lambda s: s !='common_word')
test_features = pd.read_csv('../corpus/features/left_features_test.csv', usecols= lambda s: s !='common_word')
train_features = train_features.values.tolist()
test_features = test_features.values.tolist()

In [59]:
len(train_features[0])

224

In [67]:
train_labels = pd.read_csv('../dataset/GAD_Y_N_newF_wPubmedID_annotated_preprocessed_train.csv',
                          usecols=[1])
train_labels['association'] = train_labels['association'].apply(lambda x: x != 'F')
train_labels = list(train_labels['association'])
test_labels = pd.read_csv('../dataset/GAD_Y_N_newF_wPubmedID_annotated_preprocessed_test.csv',
                          usecols=[1])
test_labels['association'] = test_labels['association'].apply(lambda x: x != 'F')
test_labels = list(test_labels['association'])

In [68]:
clf = LogisticRegression(solver='liblinear', multi_class='auto').fit(train_features, train_labels)
print(sum([p == a for p, a in zip(clf.predict(test_features),test_labels)]) / len(test_labels))
precision_recall_fscore_support(test_labels, clf.predict(test_features), average='weighted')

0.7598944591029023


(0.8191074786737312, 0.7598944591029023, 0.6783581385743588, None)

In [74]:
svm_clf = SVC(gamma='scale').fit(train_features, train_labels)
print(sum([p == a for p, a in zip(svm_clf.predict(test_features),test_labels)]) / len(test_labels))
precision_recall_fscore_support(test_labels, svm_clf.predict(test_features), average='weighted')

0.762532981530343


(0.8206091545256395, 0.762532981530343, 0.6836639460672478, None)

In [79]:
train_labels = pd.read_csv('../dataset/GAD_Y_N_newF_wPubmedID_annotated_preprocessed_train.csv',
                          usecols=[1])
train_labels = list(train_labels['association'])
test_labels = pd.read_csv('../dataset/GAD_Y_N_newF_wPubmedID_annotated_preprocessed_test.csv',
                          usecols=[1])
test_labels = list(test_labels['association'])

In [80]:
clf = LogisticRegression(solver='lbfgs', multi_class='auto', max_iter=1000).fit(train_features, train_labels)
print(sum([p == a for p, a in zip(clf.predict(test_features),test_labels)]) / len(test_labels))
precision_recall_fscore_support(test_labels, clf.predict(test_features), average='weighted')

0.6596306068601583




(0.7242367440763116, 0.6596306068601583, 0.6124582713198207, None)

In [31]:
print(Counter(train_labels))
print(Counter(test_labels))
print(Counter(clf.predict(test_features)))

Counter({'Y': 1239, 'N': 660, 'F': 350, 'P': 1})
Counter({'Y': 165, 'N': 113, 'F': 101})
Counter({'Y': 257, 'N': 104, 'F': 18})


In [82]:
svm_clf = SVC(gamma='scale').fit(train_features, train_labels)
print(sum([p == a for p, a in zip(svm_clf.predict(test_features),test_labels)]) / len(test_labels))
precision_recall_fscore_support(test_labels, svm_clf.predict(test_features), average='weighted')

0.6675461741424802


(0.7218500815721856, 0.6675461741424802, 0.6333834334598787, None)

In [45]:
def split(a, n):
    k, m = divmod(len(a), n)
    return (a[i * k + min(i, m):(i + 1) * k + min(i + 1, m)] for i in range(n))

In [59]:
train_splits = split(train_features, 3)
train_labels_splits = split(train_labels, 3)
clfs = []
for data, labels in zip(train_splits, train_labels_splits):
    eclf = SVC().fit(data, labels)
    clfs.append(eclf)
preds = [c.predict(test_features) for c in clfs]
pred = [max(set(x), key=x.count) for x in zip(*preds)]
sum([p == a for p, a in zip(pred,test_labels)]) / len(test_labels)



0.6649076517150396