In [0]:
import pandas as pd
import numpy as np
import pickle
import gc
import warnings
warnings.filterwarnings('ignore')

In [0]:
path_1 = "/content/drive/My Drive/Word2Vec_Modelling_Srijan/Data Produced/word2vec_result_final_500.pickle"
path_2 = "/content/drive/My Drive/Word2Vec_Modelling_Srijan/Data Produced/gs_result_final_500.pickle"


data = pd.read_pickle((path_1))
# data = data.drop(columns = ['doc_id'])
X_pred = (pd.read_pickle(path_2))

# gc.collect()

In [14]:
data.shape

(642290, 502)

In [15]:
X_pred.shape

(9092, 501)

In [16]:
from sklearn.linear_model import Lasso, LogisticRegression
from sklearn.model_selection import train_test_split                                      #to split the dataset for training and testing
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score

target = data.importance         
data.drop(['importance', 'doc_id'], axis=1, inplace=True)     # taking the training data features
                                 
# del data

X_train, X_test, y_train, y_test = train_test_split(data, target, test_size = 0.3, random_state = 32) # in this our main data is split into train and test

del data, target
gc.collect()

1113

In [17]:
print(X_train.shape, y_train.shape, X_test.shape, y_test.shape)

(449603, 500) (449603,) (192687, 500) (192687,)


### Base Model

In [18]:
base_model = RandomForestClassifier()
base_model.fit(X_train, y_train)
print(f1_score(base_model.predict(X_test), y_test))

0.2461741424802111


In [19]:
cutoff = np.arange(0.005, 0.995, 0.005)
f1 = []
acc = []

pred = (base_model.predict_proba(X_test))[:,1]

for i in cutoff:
    f1.append(f1_score(pred >= i, y_test))
    acc.append(accuracy_score(pred >= i, y_test))

# print(f1)

import plotly.graph_objects as go

fig = go.Figure()
fig.add_trace(go.Scatter(x = cutoff, y = f1, mode='lines', name='f1'))
fig.add_trace(go.Scatter(x = cutoff, y = acc, mode='lines', name='accuracy'))
fig.show()

### Parameter Tuning

In [7]:
from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier

# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]

# Number of features to consider at every split
max_features = ['auto', 'sqrt']

# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)

# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]

# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]

# Method of selecting samples for training each tree
bootstrap = [True, False]

# Create the random grid
random_grid = {'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               "max_leaf_nodes": np.arange(2,60,10)}
print(random_grid)

# Use the random grid to search for best hyperparameters
# First create the base model to tune
rf = RandomForestClassifier()

# Random search of parameters, using 3 fold cross validation, 
# search across 100 different combinations, and use all available cores
rf_random = RandomizedSearchCV(estimator = rf, param_distributions = random_grid, n_iter = 10, scoring = 'f1', cv = 3, verbose=1, random_state=42, n_jobs = -1)

# Fit the random search model
rf_random.fit(X_train, y_train)

{'min_samples_split': [2, 5, 10], 'min_samples_leaf': [1, 2, 4], 'max_leaf_nodes': array([ 2, 12, 22, 32, 42, 52])}
Fitting 3 folds for each of 10 candidates, totalling 30 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  30 out of  30 | elapsed: 154.9min finished


RandomizedSearchCV(cv=3, error_score=nan,
                   estimator=RandomForestClassifier(bootstrap=True,
                                                    ccp_alpha=0.0,
                                                    class_weight=None,
                                                    criterion='gini',
                                                    max_depth=None,
                                                    max_features='auto',
                                                    max_leaf_nodes=None,
                                                    max_samples=None,
                                                    min_impurity_decrease=0.0,
                                                    min_impurity_split=None,
                                                    min_samples_leaf=1,
                                                    min_samples_split=2,
                                                    min_weight_fraction_leaf=0.0,
               

In [8]:
rf_random.best_estimator_.get_params()

{'bootstrap': True,
 'ccp_alpha': 0.0,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': None,
 'max_features': 'auto',
 'max_leaf_nodes': 52,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_impurity_split': None,
 'min_samples_leaf': 4,
 'min_samples_split': 5,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 100,
 'n_jobs': None,
 'oob_score': False,
 'random_state': None,
 'verbose': 0,
 'warm_start': False}

In [9]:
from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier

model = RandomForestClassifier(min_samples_split = 5, min_samples_leaf = 4, max_leaf_nodes = 52)

import time

s = time.time()
model.fit(X_train, y_train)
e = time.time()

print("Model trained in", (e-s)/60,"min")
print(model.score(X_test, y_test))

pred = model.predict(X_test)
print(f1_score(pred, y_test))

Model trained in 15.997113140424092 min
0.7403094137123937
0.025094005104525884


In [10]:
cutoff = np.arange(0.005, 0.995, 0.005)
f1 = []
acc = []

pred = (model.predict_proba(X_test))[:,1]

for i in cutoff:
    f1.append(f1_score(pred >= i, y_test))
    acc.append(accuracy_score(pred >= i, y_test))

# print(f1)

import plotly.graph_objects as go

fig = go.Figure()
fig.add_trace(go.Scatter(x = cutoff, y = f1, mode='lines', name='f1'))
fig.add_trace(go.Scatter(x = cutoff, y = acc, mode='lines', name='accuracy'))
fig.show()

In [11]:
GS_sent = pd.read_pickle("/content/drive/My Drive/Word2Vec_Modelling_Srijan/Data Produced/gs_sent.pickle")
GS_sent.columns = ["Sents", "doc_id"]
predictions = model.predict(X_pred)
probability_of_predictions = model.predict_proba(X_pred)

# this command gives the sentences that are detected important on Gold Standard Data
result = pd.DataFrame()

result = (GS_sent.iloc[predictions == 1])
result['probabilities'] = probability_of_predictions[predictions == 1][:,1]
result['doc_id'] = result['doc_id']

# length filter of output results
length = []
for i in np.array(result['Sents']):
  length.append(len(i)>=0)

result['Sents'] = (result.iloc[length])['Sents']
result.dropna(inplace = True)

result

ValueError: ignored