In [2]:
import pandas as pd

In [3]:
from sklearn.model_selection import train_test_split

In [4]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import roc_auc_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score

In [5]:
import pickle

In [6]:
# Load the cleaned term/definition table (labels plus the traditional
# readability scores) from a path relative to the working directory.
data = pd.read_csv(
    'term_unique_13K_all_in_one_NEW_CLEANED_FINAL_12_columns.csv'
)

# (Optional) Embedding creation

In [None]:
!pip install sentence_transformers

In [None]:
from sentence_transformers import SentenceTransformer

In [7]:
# Preview the loaded table; as the cell's last expression it is rendered
# by the notebook.
data

Unnamed: 0,terms,definitions,source,assigned_readability,flesch_reading_ease,flesch_kincaid_grade,smog_index,coleman_liau_index,automated_readability_index,dale_chall_readability_score,linsear_write_formula,gunning_fog
0,adjusted present value (apv),net present value of an asset if financed sole...,prin,0,59.64,9.9,0.0,9.40,10.5,10.16,12.000000,12.00
1,agency costs,"costs that arise when an agent (e.g., a manage...",prin,0,66.07,9.5,0.0,7.78,12.3,10.47,15.000000,14.25
2,annual percentage rate (apr),"the interest rate per period (e.g., per month)...",prin,0,62.68,8.7,0.0,6.67,8.4,10.98,12.500000,16.21
3,annuity,investment that produces a level stream of cas...,prin,0,56.25,9.1,0.0,9.56,8.7,9.64,10.500000,14.00
4,annuity due,annuity whose payments occur at the start of e...,prin,0,61.33,7.2,0.0,8.50,6.2,10.45,6.000000,12.00
...,...,...,...,...,...,...,...,...,...,...,...,...
13107,ZERO COUPON SWAP,An OVER-THE-COUNTER SWAP involvingtheexchange ...,palgrave,0,35.78,12.9,0.0,13.46,13.4,10.10,13.250000,12.81
13108,ZERO COUPON YIELD CURVE,A YIELD CURVE representing DISCOUNTRATES acros...,palgrave,0,52.19,10.7,14.1,15.66,15.5,10.95,13.166667,15.32
13109,ZERO MINUS TICK,Sale of a SECURITY at the same price as the la...,palgrave,0,66.74,7.2,10.5,6.83,6.2,9.14,7.166667,9.30
13110,ZERO PLUS TICK,Sale of a SECURITY at the same price as the la...,palgrave,0,66.74,7.2,10.5,6.54,6.0,8.73,7.166667,9.30


In [None]:
# Load the ProsusAI/finbert checkpoint through SentenceTransformer so that
# it can be used to encode the definitions.
# NOTE(review): the name `model` is rebound later in this notebook by the
# `model(clf, ...)` evaluation helper. Re-run this cell before encoding if
# that helper has already been defined in the current kernel.
model = SentenceTransformer('ProsusAI/finbert')

In [None]:
# Missing definitions are replaced by empty strings so every row of `data`
# still contributes one entry, in the original row order.
corpus = data['definitions'].fillna('').tolist()
finbert_embed = model.encode(corpus)

In [None]:
# Wrap the encoder output in a DataFrame so it can be pickled and reloaded
# with pandas.
finbert_df = pd.DataFrame(data=finbert_embed)

In [None]:
# Persist only the label column, without the index. The file name spelling
# is kept as-is because the loading cell reads this exact name.
labels_only = data[['assigned_readability']]
labels_only.to_csv('assigned_redability_scores_13K.csv', index=False)

In [None]:
# Cache the embeddings on disk so later cells can load them without
# re-running the encoder.
finbert_df.to_pickle(
    'finbert_embeddings_of_definitions_13K.pickle'
)

# Loading embeddings and target variable

In [None]:
# Features: the cached FinBERT embeddings of the definitions.
# Alternative location: ../embeddings_and_labels/finbert_embeddings_of_definitions_13K.pickle
X = pd.read_pickle('finbert_embeddings_of_definitions_13K.pickle')

# Target: the assigned readability label, loaded as a one-column DataFrame.
# Alternative location: ../embeddings_and_labels/assigned_redability_scores_13K.csv
y = pd.read_csv('assigned_redability_scores_13K.csv')

In [None]:
# Preview the embedding matrix; as the cell's last expression it is
# rendered by the notebook.
X

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,...,728,729,730,731,732,733,734,735,736,737,738,739,740,741,742,743,744,745,746,747,748,749,750,751,752,753,754,755,756,757,758,759,760,761,762,763,764,765,766,767
0,0.184883,0.963402,-0.113459,-0.238247,0.228393,-0.657534,-0.159602,0.790045,0.284261,0.270735,-0.286424,-0.099882,0.328200,0.009324,-0.380326,-0.026808,0.069056,0.006242,0.523684,-0.187701,0.053000,0.666077,-0.168280,0.353479,0.666040,0.335737,0.537289,0.456613,0.456570,0.385766,0.997439,0.673123,0.247810,-0.063953,0.479946,-0.552191,0.294324,-0.451335,-0.681898,-0.144674,...,0.183947,0.322111,0.033319,-0.246759,-0.593302,-0.076825,0.052730,0.695216,-0.372869,0.505815,0.054873,0.523658,-0.045578,0.181468,0.247852,-1.099165,-0.476844,-0.227893,-0.545093,0.497437,0.383686,0.323929,0.200648,-0.122827,1.422405,0.089364,-0.189113,0.672834,0.370699,0.155610,-0.017009,0.330071,0.507158,-0.688259,-0.477652,0.103034,0.089814,-0.584482,-0.567774,0.718144
1,-0.242829,0.680479,-0.060378,-0.196382,0.077879,-0.413284,0.127389,0.917318,0.083452,0.082668,-0.175501,-0.210223,0.084094,0.133842,-0.338829,0.311936,0.247770,0.237924,-0.074485,-0.228161,0.099722,0.606247,-0.069841,0.223901,0.511090,-0.065129,0.278169,0.357795,0.449496,0.193122,0.754209,0.572877,0.080695,-0.116159,0.731938,-0.115331,0.304042,-0.352987,-0.439339,0.063462,...,0.108002,0.097358,0.200539,-0.061815,-0.528963,-0.043847,0.509497,1.002764,-0.529102,0.362646,0.103270,0.741056,-0.301473,0.170660,0.608756,-0.928778,-0.752666,-0.186905,-0.182519,0.252048,0.804839,0.067022,0.025321,-0.058781,1.102862,-0.234377,0.048481,0.331052,0.742415,0.054227,0.235442,0.050984,0.081753,-0.897964,-0.700977,0.100685,0.469065,-0.542681,-0.172880,0.511789
2,-0.161858,0.389614,0.371562,-0.274890,0.325236,-0.289743,0.003597,1.158318,-0.097860,0.210648,-0.167418,-0.518324,-0.059521,0.435780,-0.046144,0.310456,0.030572,-0.168012,-0.114892,0.249773,0.218560,0.885349,-0.072886,-0.064065,0.375222,-0.304826,0.412837,0.073889,0.172669,0.132819,0.479669,0.439182,0.105781,0.230065,1.026393,-0.492289,0.298767,-0.784319,-0.400964,-0.220388,...,0.376578,0.147067,-0.331118,-0.043380,-0.343244,-0.291331,0.607986,0.528104,-0.303974,0.164009,-0.324436,0.730373,0.266155,-0.317194,0.442139,-0.696930,-0.245127,0.047178,-0.480287,0.301644,0.549728,0.210458,-0.113074,-0.305841,1.193745,0.240389,0.010169,0.330152,0.250107,-0.284011,0.390322,0.151351,-0.102673,-0.592789,-0.484857,-0.153056,0.080858,-0.228636,-0.036314,0.441195
3,-0.023223,0.445641,0.016909,0.166205,0.483727,-0.435553,-0.254949,0.528690,0.017388,-0.342903,-0.492616,-0.134340,0.244819,-0.538804,0.032028,0.232134,0.516510,-0.143379,0.043878,-0.035306,0.060544,0.005003,-0.286677,0.296558,0.359438,0.037744,0.107464,0.930401,0.107573,0.246667,0.488973,0.439756,-0.088450,0.113551,0.661401,-0.683219,0.605379,0.116923,-0.405467,-0.271648,...,0.517980,0.002232,0.378510,0.190717,-0.363660,-0.418434,-0.018251,0.566143,-0.288505,0.473909,-0.398661,-0.069614,-0.389753,0.284137,0.292468,-1.221428,-0.600689,-0.273175,-0.223456,0.279699,0.395662,0.363241,-0.052759,-0.270714,1.488481,-0.375759,0.454230,0.472612,0.179239,0.090694,-0.279148,-0.179137,0.372667,-0.202656,-0.557891,-0.041573,0.231946,-0.785041,-0.764352,0.203464
4,-0.277175,0.317595,0.411673,-0.258846,0.333753,-0.333010,0.295542,0.999050,0.254497,0.048463,-0.332334,-0.025842,-0.032343,0.042468,-0.240808,0.083356,-0.271210,0.050999,-0.222975,-0.132725,0.352255,0.304582,-0.266617,0.159570,0.258637,-0.073498,0.257358,0.251949,0.180690,0.098515,0.552765,0.454875,0.368944,0.005718,0.975146,-0.269984,0.036238,-0.362672,-0.281072,-0.116108,...,0.570795,0.286619,-0.001280,-0.042452,-0.508884,-0.465914,0.565026,0.553120,0.057060,0.602830,-0.051018,0.374319,0.307036,-0.567812,1.254878,-1.321846,-0.326154,0.141492,-0.532748,0.486383,0.493066,0.212075,0.441086,-0.105270,1.322911,-0.183309,-0.207935,0.183314,0.369047,-0.629212,0.722259,0.252352,-0.185299,-0.841254,-0.561818,0.155261,0.255316,-0.050675,-0.243014,0.630817
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13107,0.021163,0.331648,0.097957,-0.044651,0.397460,-0.246158,-0.148983,0.647735,0.221966,0.252533,-0.309565,0.053896,0.272400,-0.049390,-0.344258,-0.143515,0.173908,-0.160543,-0.077118,-0.066772,0.332805,0.684235,-0.076209,0.212857,0.100115,0.020390,0.154811,0.087563,0.255231,0.289032,0.723061,0.249036,0.416077,-0.039980,0.725918,-0.128038,0.300892,-0.413751,-0.416851,-0.387010,...,0.303299,0.388174,0.229467,-0.232867,-0.405656,-0.292352,0.434300,0.710381,-0.284060,0.633546,-0.094310,0.134642,0.132167,-0.142419,0.540762,-1.019609,-0.457518,0.006361,-0.427495,0.244350,0.639600,0.326619,0.156354,-0.278068,1.282086,-0.135513,-0.128157,0.548470,-0.201803,-0.485339,0.108834,0.040428,-0.086953,-0.695931,-0.450496,0.051134,-0.157590,0.006986,-0.434256,0.656598
13108,-0.187862,0.212622,0.138334,-0.080471,0.348631,-0.103958,-0.212263,0.367935,0.567376,-0.186444,-0.156049,-0.072839,0.206802,-0.051871,0.019011,0.177611,0.187596,-0.182136,0.123549,0.105473,0.085287,0.679156,-0.200505,0.342952,0.384540,-0.111028,0.122931,0.068794,0.076331,0.290271,0.346695,0.318418,0.223632,-0.490634,0.525421,-0.412621,0.048449,-0.150551,-0.292101,-0.218417,...,-0.364646,0.083237,-0.023131,0.138374,-0.364378,-0.109004,0.355341,0.472508,-0.570895,0.178758,0.089382,0.143989,-0.218513,0.203486,0.305163,-0.807924,-0.513458,0.148365,-0.264189,0.399860,0.355850,0.414548,-0.209196,0.035620,1.012324,0.273522,-0.284049,0.329157,0.049142,-0.380106,0.247228,0.249030,-0.229585,-0.574909,-0.640928,-0.235551,-0.128866,-0.341619,-0.360839,0.401239
13109,0.232175,0.139506,0.006346,-0.115720,0.194042,0.065195,-0.062542,0.751114,0.366908,0.065513,-0.188477,-0.474425,-0.081461,0.201449,-0.091342,0.088053,-0.011597,-0.223374,0.220098,0.060098,0.368565,1.031158,-0.057572,0.111284,0.236639,0.080211,0.292216,-0.117506,-0.125316,0.305051,0.520381,0.357687,0.190173,-0.141151,0.374471,-0.417710,0.412437,-0.091998,-0.350800,-0.353843,...,-0.290798,-0.148217,-0.220408,0.001389,-0.426606,0.009722,0.348430,0.718493,-0.357420,0.194281,-0.095499,0.327751,0.145683,-0.029311,0.078217,-1.093480,-0.368556,0.150657,-0.184456,0.331329,0.743703,0.792232,-0.022615,-0.150573,1.296555,0.004470,-0.129560,0.226173,0.138514,-0.270262,0.528386,0.255743,-0.103669,-0.456029,-0.271310,-0.056394,0.173910,-0.084195,-0.031481,0.496607
13110,0.203303,0.017677,0.105035,-0.009308,0.176142,0.125231,-0.034156,0.560181,0.372004,0.062158,-0.220316,-0.371438,-0.090994,0.165835,-0.068784,0.154269,0.104465,-0.268656,0.148881,0.117132,0.439617,0.922795,0.011157,0.117145,0.225179,0.109822,0.298109,-0.099077,-0.210599,0.371557,0.494324,0.380823,0.002952,-0.123375,0.298069,-0.338508,0.419182,-0.082242,-0.381123,-0.380525,...,-0.282394,-0.172521,-0.210246,-0.162646,-0.301045,-0.012515,0.270225,0.633970,-0.429181,0.254226,-0.094158,0.285457,0.215410,-0.096119,-0.033630,-1.218847,-0.273695,0.010020,-0.281258,0.213125,0.812164,0.888290,-0.106968,-0.099163,1.266647,-0.062871,-0.174459,0.139911,0.108533,-0.358623,0.430703,0.350374,0.032217,-0.429955,-0.367603,-0.040143,0.174703,-0.095335,0.094141,0.473806


In [None]:
# Preview the target frame; as the cell's last expression it is rendered
# by the notebook.
y

Unnamed: 0,assigned_readability
0,0
1,0
2,0
3,0
4,0
...,...
13107,0
13108,0
13109,0
13110,0


In [9]:
# Hold out 33% of the rows for validation; the fixed random_state keeps the
# split reproducible across runs.
train_X, valid_X, train_y, valid_y = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=42,
)

# (Optional) Using Traditional Readability scores

In [10]:
# Baseline: use each traditional readability formula on its own as a score
# and measure how well it ranks the validation rows by
# `assigned_readability`. An AUROC below 0.5 means the raw score is
# inversely related to the label.
numeric_columns = [
    'flesch_reading_ease',
    'flesch_kincaid_grade',
    'smog_index',
    'coleman_liau_index',
    'automated_readability_index',
    'dale_chall_readability_score',
    'linsear_write_formula',
    'gunning_fog',
]

# Keep the rows of `data` whose index labels ended up in the validation split.
val_df = data.loc[data.index.isin(valid_y.index)]

for col in numeric_columns:
    auroc = roc_auc_score(
        val_df['assigned_readability'].values,
        val_df[col].values,
    )
    print(col, auroc, "\n")


flesch_reading_ease 0.492175356047557 

flesch_kincaid_grade 0.5952659153608089 

smog_index 0.6407178848254422 

coleman_liau_index 0.5362828477502836 

automated_readability_index 0.6135265906608411 

dale_chall_readability_score 0.409326030332311 

linsear_write_formula 0.6723878905320058 

gunning_fog 0.5561075766009886 



# Models

In [None]:
def model(clf, train_X, train_y, valid_X, valid_y):
    """Fit ``clf`` on the training split and print train/validation metrics.

    For each split this prints the weighted F1 score, the confusion matrix,
    the classification report and the ROC AUC.

    Parameters
    ----------
    clf : estimator exposing ``fit``, ``predict`` and ``predict_proba``
        Fitted in place, so the caller's object holds the trained model
        after this function returns.
    train_X, valid_X
        Feature matrices for the training and validation splits.
    train_y, valid_y
        Labels for the two splits. May be 1-D or a single-column 2-D
        container (for example a one-column DataFrame); the latter is
        flattened to 1-D before use.

    Returns
    -------
    None
        Results are reported through ``print`` only.
    """
    # Local import: numpy is not imported at the top of this notebook.
    import numpy as np

    def _as_1d(labels):
        # A (n_samples, 1) column vector makes scikit-learn emit a
        # DataConversionWarning on fit; flatten it up front. Anything that
        # is not a single-column 2-D container is passed through untouched.
        arr = np.asarray(labels)
        if arr.ndim == 2 and arr.shape[1] == 1:
            return arr.ravel()
        return labels

    train_y = _as_1d(train_y)
    valid_y = _as_1d(valid_y)

    clf.fit(train_X, train_y)

    # Hard class predictions feed F1 / confusion matrix / report.
    pred_tr = clf.predict(train_X)
    pred_valid = clf.predict(valid_X)

    # Column 1 of predict_proba is used as the score for ROC AUC.
    # NOTE(review): this assumes a binary problem where column 1 is the
    # positive class - confirm if the label set ever changes.
    pred_tr_prob = clf.predict_proba(train_X)[:, 1]
    pred_valid_prob = clf.predict_proba(valid_X)[:, 1]

    print("\nTraining F1:{}".format(f1_score(train_y, pred_tr, average="weighted")))
    print("Training Confusion Matrix \n{}".format(confusion_matrix(train_y, pred_tr)))
    print("Classification Report Train: \n{}".format(classification_report(train_y, pred_tr)))
    print("AUC Train", roc_auc_score(train_y, pred_tr_prob))

    print(
        "\nValidation F1:{}".format(f1_score(valid_y, pred_valid, average="weighted"))
    )
    print(
        "Validation Confusion Matrix \n{}".format(confusion_matrix(valid_y, pred_valid))
    )
    print(
        "Classification Report: \n{}".format(classification_report(valid_y, pred_valid))
    )
    print("AUC Valid", roc_auc_score(valid_y, pred_valid_prob))

## Logistic Regression

In [None]:
# Logistic-regression baseline on the embeddings, fitted and evaluated
# through the shared `model` helper.
lr_clf = LogisticRegression(
    solver="lbfgs",
    n_jobs=-1,
)
model(lr_clf, train_X, train_y, valid_X, valid_y)

  y = column_or_1d(y, warn=True)



Training F1:0.9456293084590777
Training Confusion Matrix 
[[3458  219]
 [ 259 4849]]
Classification Report Train: 
              precision    recall  f1-score   support

           0       0.93      0.94      0.94      3677
           1       0.96      0.95      0.95      5108

    accuracy                           0.95      8785
   macro avg       0.94      0.94      0.94      8785
weighted avg       0.95      0.95      0.95      8785

AUC Train 0.9868565661078867

Validation F1:0.9198866139528612
Validation Confusion Matrix 
[[1672  159]
 [ 188 2308]]
Classification Report: 
              precision    recall  f1-score   support

           0       0.90      0.91      0.91      1831
           1       0.94      0.92      0.93      2496

    accuracy                           0.92      4327
   macro avg       0.92      0.92      0.92      4327
weighted avg       0.92      0.92      0.92      4327

AUC Valid 0.9692768943690572


In [None]:
# Persist the fitted logistic-regression classifier. Pickle files should
# only ever be loaded back from a trusted location.
with open('logistic_regression_classifier.pkl', 'wb') as fh:
    pickle.dump(lr_clf, fh)

## Random Forest

In [None]:
# Random forest with depth-limited trees and balanced class weights,
# fitted and evaluated through the shared `model` helper.
rf_clf = RandomForestClassifier(
    n_estimators=100,
    max_depth=10,
    class_weight="balanced",
    random_state=0,
)
model(rf_clf, train_X, train_y, valid_X, valid_y)

  



Training F1:0.9849709233897832
Training Confusion Matrix 
[[3605   72]
 [  60 5048]]
Classification Report Train: 
              precision    recall  f1-score   support

           0       0.98      0.98      0.98      3677
           1       0.99      0.99      0.99      5108

    accuracy                           0.98      8785
   macro avg       0.98      0.98      0.98      8785
weighted avg       0.98      0.98      0.98      8785

AUC Train 0.9987011314380125

Validation F1:0.8698083596755274
Validation Confusion Matrix 
[[1501  330]
 [ 231 2265]]
Classification Report: 
              precision    recall  f1-score   support

           0       0.87      0.82      0.84      1831
           1       0.87      0.91      0.89      2496

    accuracy                           0.87      4327
   macro avg       0.87      0.86      0.87      4327
weighted avg       0.87      0.87      0.87      4327

AUC Valid 0.9435107094343851


In [None]:
# Persist the fitted random-forest classifier. Pickle files should only
# ever be loaded back from a trusted location.
with open('random_forest_classifier.pkl', 'wb') as fh:
    pickle.dump(rf_clf, fh)

## Gradient Boosting (GBM)

In [None]:
# Gradient-boosted trees with a fixed seed, fitted and evaluated through
# the shared `model` helper.
gbm_clf = GradientBoostingClassifier(
    n_estimators=100,
    max_depth=10,
    random_state=0,
)
model(gbm_clf, train_X, train_y, valid_X, valid_y)

  y = column_or_1d(y, warn=True)



Training F1:0.9998861717738574
Training Confusion Matrix 
[[3677    0]
 [   1 5107]]
Classification Report Train: 
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      3677
           1       1.00      1.00      1.00      5108

    accuracy                           1.00      8785
   macro avg       1.00      1.00      1.00      8785
weighted avg       1.00      1.00      1.00      8785

AUC Train 0.9999999733789313

Validation F1:0.8829240299638225
Validation Confusion Matrix 
[[1540  291]
 [ 214 2282]]
Classification Report: 
              precision    recall  f1-score   support

           0       0.88      0.84      0.86      1831
           1       0.89      0.91      0.90      2496

    accuracy                           0.88      4327
   macro avg       0.88      0.88      0.88      4327
weighted avg       0.88      0.88      0.88      4327

AUC Valid 0.9523591651612543


In [None]:
# Persist the fitted gradient-boosting classifier. Pickle files should
# only ever be loaded back from a trusted location.
with open('gbm_classifier.pkl', 'wb') as fh:
    pickle.dump(gbm_clf, fh)