In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score

In [3]:
result = pd.read_pickle("unbalanced_dataset_features.pkl")

In [4]:
result.head()

Unnamed: 0,articles1,paragraphs1,articles2,paragraphs2,label,cosine_similarity,common_words
0,Article 1,This Regulation lays down uniform rules concer...,Article 460,1. The Commission shall be empowered to adop...,1,0.448342,29
1,Article 4,"1. For the purposes of this Regulation, the ...",Article 1,This Regulation lays down uniform rules concer...,1,0.482119,57
2,Article 4,"1. For the purposes of this Regulation, the ...",Article 2,For the purposes of ensuring compliance with t...,1,0.458057,17
3,Article 4,"1. For the purposes of this Regulation, the ...",Article 4,"1. For the purposes of this Regulation, the ...",1,1.0,1423
4,Article 4,"1. For the purposes of this Regulation, the ...",Article 25,The Tier 1 capital of an institution consists ...,1,0.55575,13


In [5]:
result.groupby('label').count()

Unnamed: 0_level_0,articles1,paragraphs1,articles2,paragraphs2,cosine_similarity,common_words
label,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,272339,272339,272339,272339,272339,272339
1,1190,1190,1190,1190,1190,1190


In [6]:
from sklearn.utils import resample

In [7]:
# Separate majority and minority classes
result_majority = result[result.label==0]
result_minority = result[result.label==1]

In [10]:
result_minority_upsampled = resample(result_minority, 
                                 replace=True,     # sample with replacement
                                 n_samples=272339,    # to match majority class
                                 random_state=123) # reproducible results


In [11]:
df_upsampled = pd.concat([result_majority, result_minority_upsampled])

In [12]:
df_upsampled.label.value_counts()

1    272339
0    272339
Name: label, dtype: int64

In [23]:
df_upsampled

Unnamed: 0,articles1,paragraphs1,articles2,paragraphs2,label,cosine_similarity,common_words
0,Article 1,This Regulation lays down uniform rules concer...,Article 1,This Regulation lays down uniform rules concer...,0,1.000000,79
1,Article 1,This Regulation lays down uniform rules concer...,Article 2,For the purposes of ensuring compliance with t...,0,0.380349,14
2,Article 1,This Regulation lays down uniform rules concer...,Article 3,This Regulation shall not prevent institutions...,0,0.277306,10
3,Article 1,This Regulation lays down uniform rules concer...,Article 4,"1. For the purposes of this Regulation, the ...",0,0.482119,57
4,Article 1,This Regulation lays down uniform rules concer...,Article 5,"For the purposes of Part Three, Title II, the ...",0,0.280180,8
5,Article 1,This Regulation lays down uniform rules concer...,Article 6,1. Institutions shall comply with the obliga...,0,0.437404,20
6,Article 1,This Regulation lays down uniform rules concer...,Article 7,1. Competent authorities may waive the appli...,0,0.430868,27
7,Article 1,This Regulation lays down uniform rules concer...,Article 8,1. The competent authorities may waive in fu...,0,0.499715,33
8,Article 1,This Regulation lays down uniform rules concer...,Article 9,1. Subject to paragraphs 2 and 3 of this to...,0,0.495664,23
9,Article 1,This Regulation lays down uniform rules concer...,Article 10,"1. Competent authorities may, in accordance ...",0,0.459969,28


In [14]:
feature_headers = ['cosine_similarity', 'common_words']
target_header = ['label']

In [17]:
train_x, test_x, train_y, test_y = train_test_split(df_upsampled[feature_headers], df_upsampled[target_header],
                                                        train_size=0.8)



In [18]:
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier(n_estimators=500, random_state=0) 
rf.fit(train_x, train_y) 

  after removing the cwd from sys.path.


RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=500, n_jobs=1,
            oob_score=False, random_state=0, verbose=0, warm_start=False)

In [19]:
predictions = rf.predict(test_x)

In [20]:
print('Accuracy:', accuracy_score(test_y, predictions))
print('F1 score:', f1_score(test_y, predictions))
print('Recall:', recall_score(test_y, predictions))
print('Precision:', precision_score(test_y, predictions))

Accuracy: 0.9972460894470148
F1 score: 0.9972537532039546
Recall: 1.0
Precision: 0.9945225488406062


In [21]:
from sklearn.metrics import confusion_matrix

In [22]:
confusion_matrix(test_y, predictions)

array([[54166,   300],
       [    0, 54470]], dtype=int64)

In [24]:
from sklearn.linear_model import LogisticRegression
from sklearn import metrics

logistic_regression_model = LogisticRegression()
trained_logistic_regression_model = logistic_regression_model.fit(train_x, train_y)

  y = column_or_1d(y, warn=True)


In [25]:
weighted_prediction = trained_logistic_regression_model.predict(test_x)

In [26]:
print('Accuracy:', accuracy_score(test_y, weighted_prediction))
print('F1 score:', f1_score(test_y, weighted_prediction))
print('Recall:', recall_score(test_y, weighted_prediction))
print('Precision:', precision_score(test_y, weighted_prediction))

Accuracy: 0.7137034589116545
F1 score: 0.7073362986318338
Recall: 0.6919221589865981
Precision: 0.7234528562653563


In [27]:
confusion_matrix(test_y, weighted_prediction)

array([[40059, 14407],
       [16781, 37689]], dtype=int64)