In [None]:
from google.colab import drive

# Mount Google Drive into the Colab filesystem; the CSV paths used later in
# this notebook are all located under this mount point.
DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import svm
from sklearn.metrics import classification_report, auc, roc_curve, precision_recall_curve

# Fixed seed so the down-sampled training set is identical on every
# Restart & Run All (the original shuffle was unseeded).
SEED = 42

# Textual screening decisions are mapped to the integer classes used below.
LABEL_MAP = {'Include': 1, 'Exclude': 0}

#parsing train input
df = pd.read_csv('/content/drive/Shareddrives/systematic_review_research/data/raw_data/train/acc.csv')
df = df.replace({'label': LABEL_MAP})

#parsing test input
df_test = pd.read_csv('/content/drive/Shareddrives/systematic_review_research/data/raw_data/test/acc.csv')
df_test = df_test.replace({'label': LABEL_MAP})

#create balanced train dataset
includes = df.loc[df['label'] == 1]
excludes = df.loc[df['label'] == 0]

# Keep every "include" row and draw at most the same number of "exclude" rows
# at random. min() mirrors the original slice semantics: if there are fewer
# excludes than includes, all excludes are kept (in shuffled order).
n_excludes = min(len(includes), len(excludes))
excludes_sampled = excludes.sample(n = n_excludes, random_state = SEED)

# DataFrame.append was removed in pandas 2.0; pd.concat is the supported way to
# stack frames and keeps the original row index, exactly as append did.
new_train = pd.concat([includes, excludes_sampled])

def create_vectors(train, test):
  """Turn raw training and test texts into TF-IDF feature matrices.

  The vectorizer is fitted on ``train`` only and then applied to ``test``,
  so both returned matrices share the vocabulary learned from ``train``.

  Args:
    train: texts used to fit the vectorizer and to build the first matrix.
    test: texts transformed with the already-fitted vectorizer.

  Returns:
    A ``(train_matrix, test_matrix)`` tuple, in that order.
  """
  print("Creating TF-IDF feature vectors ...")
  # Vectorizer settings gathered in one place; values are unchanged.
  tfidf_options = dict(min_df = 5, max_df = 0.8, sublinear_tf = True, use_idf = True)
  tfidf = TfidfVectorizer(**tfidf_options)
  # Fit must happen before the test texts are transformed.
  train_matrix = tfidf.fit_transform(train)
  test_matrix = tfidf.transform(test)
  return train_matrix, test_matrix

# Vectorize the balanced training texts and the held-out test texts.
train_vectors, test_vectors = create_vectors(new_train.text_segment, df_test.text_segment)

#train model
# probability = True makes SVC fit an internal cross-validated calibration
# whose data shuffling is random; pinning random_state makes predict_proba
# repeatable across runs. The value is local to this cell on purpose.
SVM_RANDOM_STATE = 42
classifier_linear = svm.SVC(kernel = 'linear', probability = True, random_state = SVM_RANDOM_STATE)
classifier_linear.fit(train_vectors, new_train['label'])

# Hard class predictions and per-class probabilities for the test set.
pred_labels= classifier_linear.predict(test_vectors)
pred_probs= classifier_linear.predict_proba(test_vectors)

#extract metrics
# output_dict=True returns the report as a nested dict rather than a string.
report= classification_report(df_test.label, pred_labels, output_dict=True)
print(df_test.label)
print(pred_labels)
print(report)

Creating TF-IDF feature vectors ...
0      1
1      1
2      1
3      1
4      1
      ..
139    0
140    0
141    0
142    0
143    0
Name: label, Length: 144, dtype: int64
[1 1 1 1 1 1 1 1 0 1 1 1 1 1 1 1 1 1 1 1 1 1 0 1 1 0 0 1 1 1 1 1 1 0 0 0 0
 0 0 0 0 0 0 0 1 0 0 1 0 0 0 0 1 0 0 0 0 1 0 0 1 0 0 0 0 0 0 0 0 1 0 0 0 0
 0 0 1 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 1 1 0 0 1 0 0
 0 0 0 1 1 1 0 0 1 1 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0]
{'0': {'precision': 0.9587628865979382, 'recall': 0.8378378378378378, 'f1-score': 0.8942307692307693, 'support': 111}, '1': {'precision': 0.6170212765957447, 'recall': 0.8787878787878788, 'f1-score': 0.7250000000000001, 'support': 33}, 'accuracy': 0.8472222222222222, 'macro avg': {'precision': 0.7878920815968414, 'recall': 0.8583128583128583, 'f1-score': 0.8096153846153846, 'support': 144}, 'weighted avg': {'precision': 0.8804471009724355, 'recall': 0.8472222222222222, 'f1-score': 0.855448717948718, 'support': 144}}
