In [0]:
import pandas as pd
import numpy as np
import pickle
import gc
import warnings
warnings.filterwarnings('ignore')

In [0]:
path_1 = "/content/drive/My Drive/BERT_Anadi/Data Produced/bert_train_embed_1000_docs.pickle"
path_2 = "/content/drive/My Drive/BERT_Anadi/Data Produced/bert_gs_embed_all.pickle"


data = pd.read_pickle((path_1))
# data = data.drop(columns = ['doc_id'])
X_pred = (pd.read_pickle(path_2))

# gc.collect()

In [19]:
data.shape

(130759, 1026)

In [20]:
X_pred.shape

(9092, 1025)

In [21]:
from sklearn.linear_model import Lasso, LogisticRegression
from sklearn.model_selection import train_test_split                                      #to split the dataset for training and testing
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score

target = data.importance         
data.drop(['importance', 'doc_id'], axis=1, inplace=True)     # taking the training data features
                                 
# del data

X_train, X_test, y_train, y_test = train_test_split(data, target, test_size = 0.3, random_state = 32) # in this our main data is split into train and test
X_pred.drop(columns = ['doc_id'], inplace = True)
del data, target
gc.collect()

15741

In [9]:
print(X_train.shape, y_train.shape, X_test.shape, y_test.shape)

(91531, 1024) (91531,) (39228, 1024) (39228,)


# Start Model Fitting From Here

### Base Model - The Best Model!

In [10]:
from sklearn.ensemble import RandomForestClassifier

base_model = RandomForestClassifier()
base_model.fit(X_train, y_train)
print(f1_score(base_model.predict(X_test), y_test))

0.11302211302211301


In [11]:
cutoff = np.arange(0.005, 0.995, 0.005)
f1 = []
acc = []

pred = (base_model.predict_proba(X_test))[:,1]

for i in cutoff:
    f1.append(f1_score(pred >= i, y_test))
    acc.append(accuracy_score(pred >= i, y_test))

# print(f1)

import plotly.graph_objects as go

fig = go.Figure()
fig.add_trace(go.Scatter(x = cutoff, y = f1, mode='lines', name='f1'))
fig.add_trace(go.Scatter(x = cutoff, y = acc, mode='lines', name='accuracy'))
fig.show()

In [0]:
GS_sent = pd.read_pickle("/content/drive/My Drive/BERT_Anadi/Data Produced/gs_sent.pickle")
GS_sent.columns = ["Sents", "doc_id"]


predictions = base_model.predict(X_pred)
probability_of_predictions = base_model.predict_proba(X_pred)

# this command gives the sentences that are detected important on Gold Standard Data
result = pd.DataFrame()

result = (GS_sent.iloc[predictions == 1])
result['probabilities'] = probability_of_predictions[predictions == 1][:,1]
result['doc_id'] = result['doc_id']

# length filter of output results
length = []
for i in np.array(result['Sents']):
  length.append(len(i)>=0)

result['Sents'] = (result.iloc[length])['Sents']
result.dropna(inplace = True)

result

### Parameter Tuning Mechanism

In [16]:
from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier

# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]

# Number of features to consider at every split
max_features = ['auto', 'sqrt']

# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)

# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]

# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]

# Method of selecting samples for training each tree
bootstrap = [True, False]

# Create the random grid
random_grid = {'max_features': max_features,
               'bootstrap': bootstrap}
print(random_grid)

# Use the random grid to search for best hyperparameters
# First create the base model to tune
rf = RandomForestClassifier()

# Random search of parameters, using 3 fold cross validation, 
# search across 100 different combinations, and use all available cores
rf_random = RandomizedSearchCV(estimator = rf, param_distributions = random_grid, n_iter = 6, scoring = 'f1', cv = 3, verbose=1, random_state=42, n_jobs = -1)

# Fit the random search model
rf_random.fit(X_train, y_train)

{'max_features': ['auto', 'sqrt'], 'bootstrap': [True, False]}
Fitting 3 folds for each of 4 candidates, totalling 12 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  12 out of  12 | elapsed: 66.6min finished


RandomizedSearchCV(cv=3, error_score=nan,
                   estimator=RandomForestClassifier(bootstrap=True,
                                                    ccp_alpha=0.0,
                                                    class_weight=None,
                                                    criterion='gini',
                                                    max_depth=None,
                                                    max_features='auto',
                                                    max_leaf_nodes=None,
                                                    max_samples=None,
                                                    min_impurity_decrease=0.0,
                                                    min_impurity_split=None,
                                                    min_samples_leaf=1,
                                                    min_samples_split=2,
                                                    min_weight_fraction_leaf=0.0,
               

In [13]:
rf_random.best_estimator_.get_params()

{'bootstrap': True,
 'ccp_alpha': 0.0,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': None,
 'max_features': 'auto',
 'max_leaf_nodes': 52,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_impurity_split': None,
 'min_samples_leaf': 2,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 100,
 'n_jobs': None,
 'oob_score': False,
 'random_state': None,
 'verbose': 0,
 'warm_start': False}

In [14]:
from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier

model = RandomForestClassifier(min_samples_split = 2, min_samples_leaf = 2, max_leaf_nodes = 52, bootstrap= True, max_features= 'auto')

import time

s = time.time()
model.fit(X_train, y_train)
e = time.time()

print("Model trained in", (e-s)/60,"min")
print(model.score(X_test, y_test))

pred = model.predict(X_test)
print(f1_score(pred, y_test))

Model trained in 2.760242978731791 min
0.7905832568573468
0.02237296203736761


In [15]:
cutoff = np.arange(0.005, 0.995, 0.005)
f1 = []
acc = []

pred = (model.predict_proba(X_test))[:,1]

for i in cutoff:
    f1.append(f1_score(pred >= i, y_test))
    acc.append(accuracy_score(pred >= i, y_test))

# print(f1)

import plotly.graph_objects as go

fig = go.Figure()
fig.add_trace(go.Scatter(x = cutoff, y = f1, mode='lines', name='f1'))
fig.add_trace(go.Scatter(x = cutoff, y = acc, mode='lines', name='accuracy'))
fig.show()

# View Results

In [0]:
GS_sent = pd.read_pickle("/content/drive/My Drive/BERT_Anadi/Data Produced/gs_sent.pickle")
GS_sent.columns = ["Sents", "doc_id"]


predictions = base_model.predict(X_pred)
probability_of_predictions = base_model.predict_proba(X_pred)

# this command gives the sentences that are detected important on Gold Standard Data
result = pd.DataFrame()

result = (GS_sent.iloc[predictions == 1])
result['probabilities'] = probability_of_predictions[predictions == 1][:,1]
result['doc_id'] = result['doc_id']

# length filter of output results
length = []
for i in np.array(result['Sents']):
  length.append(len(i)>=0)

result['Sents'] = (result.iloc[length])['Sents']
result.dropna(inplace = True)

result

Unnamed: 0,Sents,doc_id,probabilities
39,actual income included gross assets case fores...,0.0,0.548832
42,slab system calculation compensation act provi...,0.0,0.567453
43,compensation money calculated purchasing power...,0.0,0.777166
79,statutory scheme compensation forest lands con...,0.0,0.737599
80,s. relevant may set gross asset used reference...,0.0,0.530287
...,...,...,...
8710,enough eyes courts equity entitle relief way i...,47.0,0.536824
8936,high court construed court waived limitation f...,49.0,0.509714
8951,high court therefore justified holding court e...,49.0,0.545343
8986,rules high court matter heard disposed single ...,49.0,0.566005
