In [0]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# ENV SETUP AND DATA LOADING

In [0]:
import pandas as pd
import numpy as np
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_validate
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report, confusion_matrix
from sklearn import svm
from sklearn.svm import SVC

In [0]:
train_data = pd.read_csv("/content/drive/My Drive/Shravani_Spring2020/paper_data_train.csv")
val_data = pd.read_csv("/content/drive/My Drive/Shravani_Spring2020/paper_data_dev.csv")
test_data = pd.read_csv("/content/drive/My Drive/Shravani_Spring2020/paper_data_test.csv")

In [0]:
train_data.head()

Unnamed: 0,paper_id,feature_extraction_encoding,tfidf_encoding,words_from_top_200_title,abstract_length,abstract_complexity,abstract_novelty,number_of_authors,research_strength_score,num_of_references,most_recent_ref_year,avg_len_of_ref_mention,num_of_recent_references,contains_githib_link,contains_appendix,number_of_sections,content_complexity,number_of_unique_words,accepted
0,304.pdf,"[0.4441553056240082, -0.0031148958951234818, 0...",[0. 0. 0. ... 0. 0. 0.],True,136,8.26502,False,3,6.0,6,2016,90.833333,6,False,True,30,3.634356,55,True
1,305.pdf,"[0.3867203891277313, -0.06180678680539131, -0....",[0. 0. 0. ... 0. 0. 0.],True,198,9.676772,False,2,6.0,26,2016,0.0,9,False,True,11,3.984653,69,True
2,306.pdf,"[0.44596463441848755, -0.02526075765490532, 0....",[0. 0. 0. ... 0. 0. 0.],True,140,7.654289,False,3,6.0,30,2016,109.228571,15,True,False,17,4.157808,52,True
3,307.pdf,"[0.3993099629878998, 0.005023505538702011, -0....",[0. 0. 0. ... 0. 0. 0.],False,163,8.715491,False,2,6.0,34,2016,42.326531,22,False,True,21,4.365226,49,True
4,308.pdf,"[0.38344770669937134, -0.032056376338005066, -...",[0. 0. 0. ... 0. 0. 0.],True,110,7.268245,False,2,6.0,13,2016,59.066667,9,False,True,10,3.705192,74,True


In [0]:
final_train = train_data
final_test = pd.concat([val_data, test_data], ignore_index=True)

Bert_train = final_train.iloc[0:, [0,1,18]]
print(Bert_train.shape)
Bert_train.head()

Bert_test = final_test.iloc[0:, [0,1,18]]
print(Bert_test.shape)
Bert_test.head()

(349, 3)
(77, 3)


Unnamed: 0,paper_id,feature_extraction_encoding,accepted
0,316.pdf,"[0.3783118724822998, -0.056981366127729416, -0...",True
1,325.pdf,"[0.40649187564849854, -0.03665460646152496, -0...",True
2,328.pdf,"[0.48881033062934875, -0.07407109439373016, -0...",True
3,340.pdf,"[0.45297518372535706, -0.05653221160173416, 0....",True
4,350.pdf,"[0.43022093176841736, -0.07531613111495972, -0...",True


# DATA CLEANING

In [0]:
 def create_array(vect_temp):
  n = len(vect_temp)
  vect_list = []

  i = 1 
  while(i<(n-1)):
    str_temp = ""
    while(i<(n-1) and vect_temp[i]!=','):
      str_temp = str_temp + vect_temp[i]
      i = i+1
    vect_list.append(float(str_temp))
    i = i+2
  return np.array(vect_list)

rows, cols = Bert_train.shape
features_array = np.random.randn(rows, 768)
for i in range(0, rows):
  features_array[i] = create_array(Bert_train.iloc[i,1])

std_scale = StandardScaler()
std_scale.fit(train)
train = pd.DataFrame(data=features_array)
train = std_scale.transform(train)

rows, cols = Bert_test.shape
features_array = np.random.randn(rows, 768)
for i in range(0, rows):
  features_array[i] = create_array(Bert_test.iloc[i,1])

test = pd.DataFrame(data=features_array)
test = std_scale.transform(test)

# PCA


In [0]:
pca = PCA(n_components=100)
pca_applied_data = pca.fit_transform(train)
var_ratio = pca.explained_variance_ratio_
i = 0
variance = 0
for ele in var_ratio:
  variance = variance + ele
  i = i+1
  if(variance>=0.85):
    break
num_components = i
print(num_components)

pca = PCA(n_components=num_components)
pca.fit(train)
pca_applied_data = pca.transform(train)
pca_applied_data = pd.DataFrame(data = pca_applied_data)
pca_applied_data['paper_id'] = Bert_train['paper_id']
pca_applied_data['accepted'] = Bert_train['accepted']
pca_applied_data = pca_applied_data[ ['paper_id'] + [ col for col in pca_applied_data.columns if col != 'paper_id' ] ]
print(pca_applied_data.shape)
train_fin = pca_applied_data
train_fin.head()

58
(349, 60)


Unnamed: 0,paper_id,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,accepted
0,304.pdf,-0.489618,0.406951,0.108147,-0.494787,0.312913,-0.104312,-0.131605,-0.046343,-0.05773,-0.098981,-0.089312,0.067608,0.021561,0.173068,-0.209186,0.098829,0.051589,0.04859,-0.125695,0.066237,-0.091488,0.161334,-0.005657,0.135435,-0.23999,-0.085156,0.037571,-0.012726,-0.040703,-0.05774,0.071686,0.196778,-0.029044,-0.128377,0.068986,0.078896,-0.012805,0.003871,0.069942,0.11558,0.126654,0.02043,-0.001081,-0.203653,0.077921,0.064951,0.04706,-0.060554,-0.072327,-0.034392,0.041019,0.074672,-0.016338,0.014041,-0.011635,-0.054393,-0.040229,0.144672,True
1,305.pdf,-0.37855,-0.026607,-0.106479,0.405655,-0.015566,-0.379956,0.732278,0.131338,0.063697,-0.168602,0.53073,-0.23083,0.032055,0.093967,0.024321,0.049651,-0.150732,-0.079939,0.10388,0.03491,0.07585,-0.052561,0.044229,-0.008038,-0.35632,-0.091554,-0.277797,0.152613,0.086939,0.064822,-0.074697,0.055563,0.031447,-0.071316,0.076583,-0.161799,-0.082838,0.048151,-0.104514,0.094656,-0.048806,0.133407,0.054731,0.056248,-0.180331,-0.04008,-0.015247,-0.185876,-0.008931,0.144942,0.020739,-0.011923,0.181094,0.03618,-0.067828,-0.057973,0.099567,0.103073,True
2,306.pdf,-0.162822,0.438935,0.149523,-0.355504,-0.164261,-0.29392,-0.411408,0.241587,-0.196282,-0.017565,-0.171787,-0.237339,0.035848,-0.238706,-0.032769,0.150227,0.160866,0.067308,0.089503,0.070091,-0.032328,-0.070713,0.116288,-0.173776,-0.027946,0.088962,0.095348,-0.233426,-0.002761,0.099379,-0.015706,0.126099,0.047256,0.00897,-0.033481,-0.091374,0.075107,0.008958,0.133336,-0.025575,-0.008294,-0.029904,0.051135,0.09029,0.129074,-0.069728,0.005659,0.001328,-0.005164,0.020075,0.032857,0.083835,-0.060663,-0.086545,0.020115,0.071898,0.030463,-0.073082,True
3,307.pdf,1.209099,-0.095308,-0.188723,-0.279304,-0.229785,0.600053,0.418543,0.716127,0.210858,0.122224,-0.114308,0.355733,0.129939,-0.178703,-0.067917,-0.208444,-0.000977,0.191821,0.165471,0.078925,0.071077,-0.222328,-0.237319,-0.094602,0.171737,0.108569,0.030772,0.067236,-0.115265,0.081264,-0.168775,-0.065485,-0.050407,-0.186595,0.202417,0.085907,0.146326,0.156657,0.001818,0.05999,0.170435,-0.021163,0.295658,-0.111351,0.127091,0.129352,-0.169805,-0.033447,0.177744,-0.059852,-0.142569,0.036956,0.058517,-0.100049,-0.031915,0.01931,-0.058427,0.008692,True
4,308.pdf,0.206198,0.407238,-0.162538,-0.101529,-0.472674,0.712483,0.013673,-0.393989,-0.174554,0.291749,-0.177522,0.424534,0.016129,-0.188306,0.083782,-0.192633,-0.126796,0.188213,-0.161833,0.089729,-0.076106,0.184673,0.15944,0.122037,0.147071,0.049371,-0.02088,0.086082,-0.008078,0.043048,0.078682,0.074466,0.115713,0.181142,0.118057,-0.049975,-0.147098,-0.082858,0.074717,-0.033091,-0.012447,0.026641,-0.017419,-0.063066,-0.04243,0.028436,0.013165,-0.113549,0.004281,0.108712,-0.17886,0.053894,0.044537,0.006363,-0.17257,-0.090322,-0.141223,0.194643,True


In [0]:
pca_applied_data = pca.transform(test)
pca_applied_data = pd.DataFrame(data = pca_applied_data)
pca_applied_data['paper_id'] = Bert_test['paper_id']
pca_applied_data['accepted'] = Bert_test['accepted']
pca_applied_data = pca_applied_data[ ['paper_id'] + [ col for col in pca_applied_data.columns if col != 'paper_id' ] ]
print(pca_applied_data.shape)
test_fin = pca_applied_data
test_fin.head()

(77, 60)


Unnamed: 0,paper_id,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,accepted
0,316.pdf,1.435818,-0.348435,0.619462,0.281788,-0.339891,0.421263,-0.422685,-0.060561,0.184359,-0.099122,-0.063248,0.02855,-0.074998,0.284707,-0.146173,-0.314911,-0.024733,0.090176,-0.123683,0.26238,-0.162963,-0.266062,-0.160919,-0.210415,-0.302885,-0.163493,-0.251652,-0.12283,0.10187,0.110957,-0.010847,0.189428,-0.048091,0.048383,-0.007125,-0.194698,0.051379,0.082332,-0.23129,-0.049693,0.096812,-0.0224,0.067022,0.029492,-0.118901,0.093326,-0.035285,-0.024858,0.025278,0.074652,-0.015469,-0.009315,-0.202641,0.150329,-0.0444,0.092237,0.075851,-0.013066,True
1,325.pdf,-1.071523,0.556425,-0.337098,0.403767,-0.197794,0.226238,-0.14066,0.204592,-0.002709,0.008904,0.018752,-0.095495,-0.083478,-0.193118,-0.16735,-0.07219,-0.129687,0.232162,-0.100899,-0.053723,-0.032515,-0.145018,0.014525,0.070099,-0.251353,0.169115,0.093303,-0.115623,0.035273,0.007925,0.082842,-0.134529,0.135946,0.237711,-0.113921,0.08194,0.018247,0.052062,0.008916,-0.172794,0.000962,-0.052828,-0.002962,0.031458,-0.022577,0.004294,-0.034554,-0.041772,-0.1517,0.045477,0.036771,-0.031355,-0.075741,-0.017403,0.013129,0.050092,0.002623,-0.127876,True
2,328.pdf,0.615214,0.456142,0.179919,0.0885,-0.578029,-0.848075,0.585798,-0.229542,-0.045906,0.184843,0.044462,0.088623,0.12532,0.191036,0.154963,0.116577,-0.076539,-0.135086,-0.079647,0.276673,0.088867,-0.108912,0.175123,0.264395,0.007838,0.112518,-0.190353,-0.127049,-0.163564,-0.092677,-0.045795,0.068837,-0.023314,-0.115056,0.003244,-0.006172,0.140962,0.061931,-0.002468,-0.086841,-0.10721,-0.14609,-0.027415,-0.008996,-0.095967,-0.086174,0.01498,0.049903,-0.049461,0.03438,-0.072875,-0.015732,0.042267,-0.057639,0.047918,-0.109804,0.178021,0.095584,True
3,340.pdf,-0.807391,0.126312,-0.332372,0.458134,0.459138,-0.056564,0.459982,-0.019458,0.207174,-0.039437,0.054318,0.007375,0.073552,-0.014639,0.167613,-0.280172,0.005029,0.120076,0.06022,0.107789,0.02007,-0.1251,-0.096756,-0.147032,0.136059,0.096147,-0.015024,-0.017031,-0.108272,0.06801,0.001295,0.070115,0.073472,0.004593,0.047053,0.039435,-0.130036,0.01457,0.008239,-0.03497,0.049245,-0.092439,0.0025,-0.079677,-0.068763,-0.0164,-0.05777,0.000259,-0.200116,0.029918,-0.043919,0.058498,0.008581,-0.038085,0.101888,-0.004729,-0.006118,-0.113358,True
4,350.pdf,0.304558,0.559218,0.173473,-0.000855,-0.008936,-0.290413,0.38411,-0.200003,0.06993,-0.012058,-0.066925,0.045376,0.119427,0.085016,0.014693,0.178209,0.11698,0.084341,0.051439,-0.072655,0.002341,0.083049,0.010268,-0.008024,-0.073841,-0.16128,0.082888,0.029207,-0.030913,-0.034564,0.12263,-0.106713,0.045064,-0.081024,-0.085938,-0.153997,-0.019388,0.059669,0.082313,0.031585,0.010518,0.082012,0.005905,-0.036791,0.017296,0.045608,-0.162942,0.126633,0.067827,-0.064485,-0.102532,-0.034983,-0.015168,0.027394,0.066582,-0.069107,0.018323,0.020452,True


# TRAIN-TEST SPLIT

In [0]:
x_indices = []
for i in range(1,59):
  x_indices.append(i)

Xtrain = train_fin.iloc[0:,x_indices]
ytrain = train_fin.iloc[0:,59]

Xtest = test_fin.iloc[0:,x_indices]
ytest = test_fin.iloc[0:,59]

## SVM

In [0]:
parameters = {'kernel':('rbf', 'poly', 'sigmoid'), 'C':range(1,100)}
svc = SVC()
svm_clf = GridSearchCV(svc, parameters, verbose=1, cv=10)
svm_clf.fit(Xtrain, ytrain)

Fitting 10 folds for each of 297 candidates, totalling 2970 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 2970 out of 2970 | elapsed:   45.1s finished


GridSearchCV(cv=10, error_score=nan,
             estimator=SVC(C=1.0, break_ties=False, cache_size=200,
                           class_weight=None, coef0=0.0,
                           decision_function_shape='ovr', degree=3,
                           gamma='scale', kernel='rbf', max_iter=-1,
                           probability=False, random_state=None, shrinking=True,
                           tol=0.001, verbose=False),
             iid='deprecated', n_jobs=None,
             param_grid={'C': range(1, 100),
                         'kernel': ('rbf', 'poly', 'sigmoid')},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=None, verbose=1)

In [0]:
print(svm_clf.best_estimator_)
print(svm_clf.best_params_)
print(svm_clf.best_score_)

SVC(C=1, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='poly',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)
{'C': 1, 'kernel': 'poly'}
0.6047058823529412


In [0]:
scores = cross_val_score(svm_clf, Xtrain, ytrain, cv=10)

In [0]:
scores.mean()

0.5701680672268907

In [0]:
clf_predictions = svm_clf.predict(Xtest)
print(confusion_matrix(ytest,clf_predictions))
print(classification_report(ytest,clf_predictions))

[[41  3]
 [28  5]]
              precision    recall  f1-score   support

       False       0.59      0.93      0.73        44
        True       0.62      0.15      0.24        33

    accuracy                           0.60        77
   macro avg       0.61      0.54      0.48        77
weighted avg       0.61      0.60      0.52        77



**ENVIRONMENT SETUP**

In [0]:
pip install prince

Collecting prince
  Downloading https://files.pythonhosted.org/packages/51/f4/8de7003b86351a0e32e29ca2bbbbbf58e311b09f9286e83e638d437aee6d/prince-0.7.0-py3-none-any.whl
Installing collected packages: prince
Successfully installed prince-0.7.0


In [0]:
import pandas as pd
import numpy as np
import prince
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV,cross_val_score
import pickle
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.decomposition import PCA
from tqdm import tqdm

**DATA SETUP**

In [0]:
from google.colab import drive
drive.mount('/content/drive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/drive


In [0]:
train_data_df = pd.read_pickle("/content/drive/My Drive/ML/paper_data_train.pkl")
train_data_features = train_data_df.drop(columns=['paper_id','feature_extraction_encoding','tfidf_encoding','accepted'])
test_data_df = pd.read_pickle("/content/drive/My Drive/ML/paper_data_test.pkl")
test_data_features = test_data_df.drop(columns=['paper_id','feature_extraction_encoding','tfidf_encoding','accepted'])

In [0]:
#TRAIN DATA
X_train = np.vstack(train_data_df['feature_extraction_encoding'].values.tolist())
y_train = np.asarray([1 if item==True else 0 for item in list(train_data_df['accepted'])])

#TEST DATA
X_test = np.vstack(test_data_df['feature_extraction_encoding'].values.tolist())
y_test = np.asarray([1 if item==True else 0 for item in list(test_data_df['accepted'])])

**DIMENSION REDUCTION**

In [0]:
bert_vector_size = X_train.shape[1]+1
for i in tqdm(range(2,bert_vector_size)):
    pca = prince.PCA(n_components=i)
    pca = pca.fit(X_train)
    retained_variance = sum(pca.explained_inertia_)
    if retained_variance > 0.95:
        break
print("\nVariance retained for ",i," components = ",retained_variance)
X_train_reduced = pca.fit_transform(X_train)
X_test_reduced = pca.transform(X_test)


 17%|█▋        | 134/767 [00:05<00:26, 23.45it/s]



Variance retained for  137  components =  0.9502651618209814




---
**LOGISTIC REGRESSION**

---




In [0]:
from sklearn.linear_model import LogisticRegression
parameters = {'solver':['liblinear'],'penalty':('l1', 'l2', 'elasticnet'), 'C':range(1,50), 'fit_intercept':(True, False), 'class_weight':(None, 'balanced')}
bert_lr = LogisticRegression(max_iter=700,warm_start=True)
clf_bert_lr = GridSearchCV(bert_lr, param_grid=parameters, verbose=1, cv=3, n_jobs=-1)
clf_bert_lr.fit(X_train_reduced, y_train)

Fitting 3 folds for each of 588 candidates, totalling 1764 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done 216 tasks      | elapsed:    5.7s
[Parallel(n_jobs=-1)]: Done 1416 tasks      | elapsed:   58.9s
[Parallel(n_jobs=-1)]: Done 1764 out of 1764 | elapsed:  1.3min finished


GridSearchCV(cv=3, error_score=nan,
             estimator=LogisticRegression(C=1.0, class_weight=None, dual=False,
                                          fit_intercept=True,
                                          intercept_scaling=1, l1_ratio=None,
                                          max_iter=700, multi_class='auto',
                                          n_jobs=None, penalty='l2',
                                          random_state=None, solver='lbfgs',
                                          tol=0.0001, verbose=0,
                                          warm_start=True),
             iid='deprecated', n_jobs=-1,
             param_grid={'C': range(1, 50), 'class_weight': (None, 'balanced'),
                         'fit_intercept': (True, False),
                         'penalty': ('l1', 'l2', 'elasticnet'),
                         'solver': ['liblinear']},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=None, 

In [0]:
filename = 'bert_lr_model.sav'
pickle.dump(clf_bert_lr, open(filename, 'wb'))

In [0]:
best_params = clf_bert_lr.best_params_
val_score = clf_bert_lr.best_score_
print(best_params)
print("VALIDATION SCORE =", val_score)

{'C': 43, 'class_weight': None, 'fit_intercept': True, 'penalty': 'l1', 'solver': 'liblinear'}
VALIDATION SCORE = 0.5644464092739955


In [0]:
clf_predictions = clf_bert_lr.predict(X_test_reduced)
print(confusion_matrix(y_test,clf_predictions))
print(classification_report(y_test,clf_predictions))

[[23 21]
 [20 13]]
              precision    recall  f1-score   support

           0       0.53      0.52      0.53        44
           1       0.38      0.39      0.39        33

    accuracy                           0.47        77
   macro avg       0.46      0.46      0.46        77
weighted avg       0.47      0.47      0.47        77





---
**RANDOM FOREST**

---




In [0]:
from sklearn.ensemble import RandomForestClassifier
n_estimators = [30, 40, 50, 60, 100]
max_depth = [15, 20, 25, 30]
min_samples_split = [2, 5, 10, 15, 20]
min_samples_leaf = [1, 2, 5]
hyperF = dict(n_estimators = n_estimators, max_depth = max_depth,  
              min_samples_split = min_samples_split, 
             min_samples_leaf = min_samples_leaf)

bert_rf = RandomForestClassifier()
clf_bert_rf = GridSearchCV(bert_rf, hyperF, cv = 5, verbose = 1, n_jobs = -1)
clf_bert_rf.fit(X_train_reduced, y_train)

Fitting 5 folds for each of 300 candidates, totalling 1500 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  46 tasks      | elapsed:   31.2s
[Parallel(n_jobs=-1)]: Done 196 tasks      | elapsed:  2.2min
[Parallel(n_jobs=-1)]: Done 446 tasks      | elapsed:  5.1min
[Parallel(n_jobs=-1)]: Done 796 tasks      | elapsed:  8.9min
[Parallel(n_jobs=-1)]: Done 1246 tasks      | elapsed: 13.9min
[Parallel(n_jobs=-1)]: Done 1500 out of 1500 | elapsed: 16.7min finished


GridSearchCV(cv=5, error_score=nan,
             estimator=RandomForestClassifier(bootstrap=True, ccp_alpha=0.0,
                                              class_weight=None,
                                              criterion='gini', max_depth=None,
                                              max_features='auto',
                                              max_leaf_nodes=None,
                                              max_samples=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              n_estimators=100, n_jobs=None,
                                              oob_score=False,
                                              rando

In [0]:
filename = 'bert_rf_model.sav'
pickle.dump(clf_bert_lr, open(filename, 'wb'))

In [0]:
best_params = clf_bert_rf.best_params_
val_score = clf_bert_rf.best_score_
print(best_params)
print("VALIDATION SCORE =", val_score)

{'max_depth': 25, 'min_samples_leaf': 5, 'min_samples_split': 15, 'n_estimators': 50}
VALIDATION SCORE = 0.6247619047619047


In [0]:
clf_predictions = clf_bert_rf.predict(X_test_reduced)
print(confusion_matrix(y_test,clf_predictions))
print(classification_report(y_test,clf_predictions))

[[42  2]
 [28  5]]
              precision    recall  f1-score   support

           0       0.60      0.95      0.74        44
           1       0.71      0.15      0.25        33

    accuracy                           0.61        77
   macro avg       0.66      0.55      0.49        77
weighted avg       0.65      0.61      0.53        77

