
# Models using TFIDF vectors of abstracts




**ENVIRONMENT SETUP**

In [0]:
pip install prince

Collecting prince
  Downloading https://files.pythonhosted.org/packages/51/f4/8de7003b86351a0e32e29ca2bbbbbf58e311b09f9286e83e638d437aee6d/prince-0.7.0-py3-none-any.whl
Installing collected packages: prince
Successfully installed prince-0.7.0


In [0]:
import pandas as pd
import numpy as np
import prince
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV,cross_val_score
import pickle
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.decomposition import PCA
from tqdm import tqdm
import seaborn as sns
import statsmodels.api as sm
import matplotlib.pyplot as plt

  import pandas.util.testing as tm


**DATA SETUP**

In [0]:
from google.colab import drive
drive.mount('/content/drive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/drive


In [0]:
tfidf_df = pd.read_pickle("/content/drive/My Drive/ML/paper_data_tfidf_train.pkl") 
train_data_indices = pd.read_pickle("/content/drive/My Drive/ML/train_data_paper_ids.pkl").tolist() 
train_data_df = tfidf_df.loc[tfidf_df['paper_id'].isin(train_data_indices)]
test_data_df = tfidf_df.loc[~tfidf_df['paper_id'].isin(train_data_indices)]

In [0]:
#TRAIN DATA
X_train = np.vstack(train_data_df['tfidf_encoding'].values.tolist())
y_train = np.asarray([1 if item==True else 0 for item in list(train_data_df['accepted'])])

#TEST DATA
X_test = np.vstack(test_data_df['tfidf_encoding'].values.tolist())
y_test = np.asarray([1 if item==True else 0 for item in list(test_data_df['accepted'])])
print(X_train.shape)

(349, 6230)


**DIMENSION REDUCTION**

In [0]:
tfidf_vector_size = X_train.shape[1]+1
for i in tqdm(range(2,tfidf_vector_size)):
    pca = prince.PCA(n_components=i)
    pca = pca.fit(X_train)
    retained_variance = sum(pca.explained_inertia_)
    if retained_variance > 0.85:
        break
print("\nVariance retained for ",i," components = ",retained_variance)
X_train_reduced = pca.fit_transform(X_train)
X_test_reduced = pca.transform(X_test)

  4%|▍         | 263/6229 [02:01<1:23:35,  1.19it/s]


Variance retained for  265  components =  0.8513554668430885




---

**COMPLEMENT NAIVE BAYES**

---


In [0]:
from sklearn.naive_bayes import MultinomialNB,ComplementNB
from sklearn.model_selection import GridSearchCV
parameters = {'alpha': np.linspace(0,5, num=1000)}
tfidf_ComplementNB = ComplementNB()
clf_tfidf_ComplementNB = GridSearchCV(tfidf_ComplementNB, parameters,verbose=1,cv=10,n_jobs=-1)

In [0]:
#TRAIN
clf_tfidf_ComplementNB.fit(X_train,y_train)
#SAVE MODEL
import pickle
filename = 'tfidf_ComplementNB_model.sav'
pickle.dump(clf_tfidf_ComplementNB, open(filename, 'wb'))

Fitting 10 folds for each of 1000 candidates, totalling 10000 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done 202 tasks      | elapsed:    3.3s
[Parallel(n_jobs=-1)]: Done 1402 tasks      | elapsed:   16.6s
[Parallel(n_jobs=-1)]: Done 3402 tasks      | elapsed:   38.6s
[Parallel(n_jobs=-1)]: Done 6202 tasks      | elapsed:  1.2min
[Parallel(n_jobs=-1)]: Done 9802 tasks      | elapsed:  1.8min
[Parallel(n_jobs=-1)]: Done 10000 out of 10000 | elapsed:  1.9min finished


In [0]:
#VALIDATION SCORE
best_params = clf_tfidf_ComplementNB.best_params_
val_score = clf_tfidf_ComplementNB.best_score_
print(best_params)
print("VALIDATION SCORE =",val_score)

{'alpha': 3.4034034034034035}
VALIDATION SCORE = 0.601764705882353


In [0]:
#TEST SCORE 
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score  
clf_predictions = clf_tfidf_ComplementNB.predict(X_test)
print(confusion_matrix(y_test,clf_predictions))
print(classification_report(y_test,clf_predictions))
print(accuracy_score(y_test,clf_predictions))

[[43  1]
 [31  2]]
              precision    recall  f1-score   support

           0       0.58      0.98      0.73        44
           1       0.67      0.06      0.11        33

    accuracy                           0.58        77
   macro avg       0.62      0.52      0.42        77
weighted avg       0.62      0.58      0.46        77

0.5844155844155844



---

**MULTINOMIAL NAIVE BAYES**

---


In [0]:
from sklearn.naive_bayes import MultinomialNB,ComplementNB
from sklearn.model_selection import GridSearchCV
parameters = {'alpha': np.linspace(0,5, num=1000)}
tfidf_MultinomialNB = MultinomialNB()
clf_tfidf_MultinomialNB = GridSearchCV(tfidf_MultinomialNB, parameters,verbose=1,cv=10,n_jobs=-1)

In [0]:
#TRAIN
clf_tfidf_MultinomialNB.fit(X_train,y_train)
#SAVE
import pickle
filename = 'tfidf_MultinomialNB_model.sav'
pickle.dump(clf_tfidf_MultinomialNB, open(filename, 'wb'))

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.


Fitting 10 folds for each of 1000 candidates, totalling 10000 fits


[Parallel(n_jobs=-1)]: Done 300 tasks      | elapsed:    3.4s
[Parallel(n_jobs=-1)]: Done 1500 tasks      | elapsed:   16.7s
[Parallel(n_jobs=-1)]: Done 3500 tasks      | elapsed:   38.0s
[Parallel(n_jobs=-1)]: Done 6300 tasks      | elapsed:  1.1min
[Parallel(n_jobs=-1)]: Done 9900 tasks      | elapsed:  1.8min
[Parallel(n_jobs=-1)]: Done 10000 out of 10000 | elapsed:  1.8min finished


In [0]:
#VALIDATION SCORE
best_params = clf_tfidf_MultinomialNB.best_params_
val_score = clf_tfidf_MultinomialNB.best_score_
print(best_params)
print("VALIDATION SCORE =",val_score)

{'alpha': 0.34034034034034033}
VALIDATION SCORE = 0.6018487394957983


In [0]:
#TEST SCORE 
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score  
clf_predictions = clf_tfidf_MultinomialNB.predict(X_test)
print(confusion_matrix(y_test,clf_predictions))
print(classification_report(y_test,clf_predictions))
print(accuracy_score(y_test,clf_predictions))

[[40  4]
 [29  4]]
              precision    recall  f1-score   support

           0       0.58      0.91      0.71        44
           1       0.50      0.12      0.20        33

    accuracy                           0.57        77
   macro avg       0.54      0.52      0.45        77
weighted avg       0.55      0.57      0.49        77

0.5714285714285714




---



**SVC (SVM)**

---



In [0]:
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
parameters = {'kernel':('rbf','poly','sigmoid'), 'gamma':np.arange(1e-4,1e-2,0.002),'C':range(1,100)}
tfidf_svc = SVC()
clf_tfidf_svc = GridSearchCV(tfidf_svc, parameters,verbose=1,cv=10,n_jobs=-1)

In [0]:
clf_tfidf_svc.fit(X_train_reduced,y_train)
filename = 'svc_tfidf_model.sav'
pickle.dump(clf_tfidf_svc, open(filename, 'wb'))

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.


Fitting 10 folds for each of 1485 candidates, totalling 14850 fits


In [0]:
#VALIDATION SCORE
best_params = clf_tfidf_svc.best_params_
val_score = clf_tfidf_svc.best_score_
print(best_params)
print("VALIDATION SCORE =",val_score)

In [0]:
#TEST SCORE 
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score  
clf_predictions = clf_tfidf_svc.predict(X_test_reduced)
print(confusion_matrix(y_test,clf_predictions))
print(classification_report(y_test,clf_predictions))
print(accuracy_score(y_test,clf_predictions))



---


**RANDOM FOREST**

---



In [0]:
from sklearn.ensemble import RandomForestClassifier
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 3)]
max_features = ['auto', 'sqrt']
max_depth = [int(x) for x in np.linspace(10, 110, num = 3)]
max_depth.append(None)
min_samples_split = [2, 5]
min_samples_leaf = [1, 2]
bootstrap = [True, False]

random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}
print(random_grid)

rf = RandomForestClassifier()
clf = GridSearchCV(rf, random_grid, cv = 5, verbose=1, n_jobs = -1)

In [0]:

clf.fit(X_train_reduced,y_train)
cv_results = clf.cv_results_


In [0]:
best_params = clf.best_params_
print(clf.best_estimator_)
print(best_params)
print(clf.best_score_)
best_clf = RandomForestClassifier(n_estimators=best_params['n_estimators'], max_features=best_params['max_features'], min_samples_split=best_params['min_samples_split'], min_samples_leaf=best_params['min_samples_leaf'], bootstrap=best_params['bootstrap'])
scores = cross_val_score(best_clf, X_train_reduced, y_train, cv=5)
print(scores.mean())

In [0]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score  
clf_predictions = clf.predict(X_test_reduced)
print(confusion_matrix(y_test,clf_predictions))
print(classification_report(y_test,clf_predictions))
print(accuracy_score(y_test,clf_predictions))




---


**LOGISTIC REGRESSION**


---



In [0]:
from sklearn.linear_model import LogisticRegression
hyperparameters = {'penalty':('l1', 'l2'), 'C':np.logspace(0, 4, 100), 'fit_intercept':(True, False), 'class_weight':(None, 'balanced')}
logistic = LogisticRegression(solver = 'liblinear',max_iter=2000)
clf = GridSearchCV(logistic, hyperparameters, cv=5, verbose=1,n_jobs=-1)

In [0]:
clf.fit(X_train_reduced,y_train)
cv_results = clf.cv_results_
best_params = clf.best_params_
print(clf.best_estimator_)
print(best_params)
print(clf.best_score_)

In [0]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score  
clf_predictions = clf.predict(X_test_reduced)
print(confusion_matrix(y_test,clf_predictions))
print(classification_report(y_test,clf_predictions))
print(accuracy_score(y_test,clf_predictions))