In [1]:
import random
from numpy.random import seed
seed(3)
import numpy as np
np.random.default_rng
import warnings
warnings.filterwarnings("ignore")
import pandas as pd
from pandas import read_csv
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
import pickle
from os.path import expanduser as ospath
import time
import os
os.chdir("C:/Users/rober/project/B_Data_pre_processing/My_functions")
from B_pre_processing_data import pre_process_db
from A_merge_datasets import merge_select_columns
from C_metrics_binary import calc_binary_class_metrics
os.chdir("C:/Users/rober/project/F_Extra_Experiments_Abstracts")

import xgboost as xgb
from sklearn.pipeline import Pipeline, FeatureUnion, make_pipeline
from sklearn.model_selection import KFold,GridSearchCV, cross_val_score, train_test_split, RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
from sklearn.multiclass import OneVsRestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from imblearn.over_sampling import SMOTE
from collections import Counter

from gensim.models import Word2Vec
import gensim

# Steps:

>1. Load Pickle Files
>2. Prepare the data to be augmented
>3. Data augmentation only for the minority class
>4. Data augmentation on both classes

## 1. Load Pickle Files

In [2]:
with open(ospath('~/final_project/E_Models/Abstracts/Pickle_Files/X_test_abstracts.pickle'), 'rb') as data:
    X_test = pickle.load(data)
    
with open(ospath('~/final_project/E_Models/Abstracts/Pickle_Files/y_test_abstracts.pickle'), 'rb') as data:
    y_test = pickle.load(data)

In [3]:
with open(ospath('~/final_project/E_Models/Abstracts/Pickle_Files/X_train_abstracts.pickle'), 'rb') as data:
    X_train = pickle.load(data)
    
with open(ospath('~/final_project/E_Models/Abstracts/Pickle_Files/y_train_abstracts.pickle'), 'rb') as data:
    y_train = pickle.load(data)

In [4]:
with open(ospath('~/final_project/B_Data_pre_processing/Pickle_Files/Abstracts_cleaned.pickle'), 'rb') as data:
    db = pickle.load(data)

In [5]:
with open(ospath('~/final_project/E_Models/Abstracts/Pickle_Files/xgb_model_abstracts.pickle'), 'rb') as data:
    xgb_fitted = pickle.load(data)

## 2. Prepare the data to be augmented

In [6]:
X_train_db = pd.DataFrame(X_train)
y_train_db = pd.DataFrame(y_train)
X_y_train_db = pd.concat([y_train_db,X_train_db], axis=1)
X_y_train_product = X_y_train_db[X_y_train_db['label'] == 1]
X_y_train_process = X_y_train_db[X_y_train_db['label'] == 0]

In [7]:
print('Data shape Features only labelled as product/product and process: \n',X_y_train_product.shape)
print('Data shape Features only labelled as process/use claim: \n',X_y_train_process.shape)

Data shape Features only labelled as product/product and process: 
 (106, 2)
Data shape Features only labelled as process/use claim: 
 (10, 2)


Initialisation dataset to store all the results

In [8]:
results_score = pd.DataFrame()

## 3. Data augmentation only for the minority class
Parameters set to: num_aug = 11 and alpha = 0.05

In [9]:
with open(ospath('~/final_project/F_Extra_Experiments_Abstracts/Data_for_augmentation/X_process.txt'), 'w' , encoding="utf-8") as file:
    for lab, text in zip(X_y_train_process['label'],X_y_train_process['text_clean']) :
        file.write(str(lab))
        file.write("\t")
        file.write(text)
        file.write("\n")

In [10]:
X_y_train_process

Unnamed: 0,label,text_clean
104,0,process produce solid nanocomposite particles ...
975,0,invention relate thin film solid state electro...
157,0,motor generator mg control via motor controlle...
537,0,method chemically modify carbon nanotubes diam...
288,0,method modify surface carbon materials vapor g...
779,0,method form electrolytic capacitor disclose me...
64,0,process include suspend electroactive material...
1318,0,pattern conduct polymer surface exhibit excell...
255,0,methods systems disclose manage plurality powe...
319,0,accord method manufacture solid electrolytic c...


### Load the text file with the data augmented and split the features from their targets values 

In [11]:
X_process_aug = pd.read_table(ospath('~/final_project/F_Extra_Experiments_Abstracts/Data_for_augmentation/X_process_augmented.txt'), sep = '\t', names=('label', 'text_clean'))
X_y_train_aug = pd.concat((X_y_train_product, X_process_aug)) 
X_train_aug = X_y_train_aug['text_clean']
y_train_aug = X_y_train_aug['label']
tfidf = TfidfVectorizer(ngram_range = (1,2) , encoding='utf-8',sublinear_tf=True, max_features = 300, max_df=0.25, min_df=4)

X_train_tf_aug = tfidf.fit_transform(X_train_aug).toarray()
X_test_tf = tfidf.transform(X_test).toarray()

Checks frequency of each class after the data augmentation

In [12]:
y_train_aug.value_counts()

0    120
1    106
Name: label, dtype: int64

### First, I run the best model loaded with Pickle file

I try first with the binary classification model already trained and loaded here - **project\E_Models\Abstracts**

In [13]:
random.seed(3)
xgb_fitted.fit(X_train_tf_aug, y_train_aug)
xbg_fitted_pred = xgb_fitted.predict(X_test_tf)

print("Accuracy on the test data: ")
print(metrics.accuracy_score(y_test, xbg_fitted_pred))
print("Classification report")
print(metrics.classification_report(y_test, xbg_fitted_pred))
print('Confusion Matrix : ')
print(metrics.confusion_matrix(y_test, xbg_fitted_pred))

Accuracy on the test data: 
0.9102564102564102
Classification report
              precision    recall  f1-score   support

           0       0.50      0.14      0.22         7
           1       0.92      0.99      0.95        71

    accuracy                           0.91        78
   macro avg       0.71      0.56      0.59        78
weighted avg       0.88      0.91      0.89        78

Confusion Matrix : 
[[ 1  6]
 [ 1 70]]


In [14]:
sc = calc_binary_class_metrics(y_test, xbg_fitted_pred, 'Orig. XB fitted', '1 class, n=11, a=0.05')
results_score = results_score.append(sc)
results_score

Unnamed: 0,Model,Data,Precision,Recall,Specificity,F1-score,Accuracy,Bal Accuracy,AUC
0,Orig. XB fitted,"1 class, n=11, a=0.05",0.921053,0.985915,0.142857,0.952381,0.910256,0.564386,0.564386


### Next I try with both the labels and balance with SMOTE

In [15]:
oversample = SMOTE()
X_train_tf_aug_bal, y_train_aug_bal = oversample.fit_resample(X_train_tf_aug, y_train_aug)

In [16]:
random.seed(3)
xbg_fitted_bal = xgb.XGBClassifier(objective = 'binary:logistic', n_jobs=-1)
xbg_fitted_bal.fit(X_train_tf_aug_bal, y_train_aug_bal)
xbg_bal_pred = xbg_fitted_bal.predict(X_test_tf)

print("Accuracy on the test data: ")
print(metrics.accuracy_score(y_test, xbg_bal_pred))
print("Classification report")
print(metrics.classification_report(y_test, xbg_bal_pred))
print('Confusion Matrix : ')
print(metrics.confusion_matrix(y_test, xbg_bal_pred))

Accuracy on the test data: 
0.8974358974358975
Classification report
              precision    recall  f1-score   support

           0       0.33      0.14      0.20         7
           1       0.92      0.97      0.95        71

    accuracy                           0.90        78
   macro avg       0.63      0.56      0.57        78
weighted avg       0.87      0.90      0.88        78

Confusion Matrix : 
[[ 1  6]
 [ 2 69]]


In [17]:
sc = calc_binary_class_metrics(y_test, xbg_bal_pred, 'Orig. XB fitted Bal', '1 class, n=11, a=0.05')
results_score = results_score.append(sc)
results_score

Unnamed: 0,Model,Data,Precision,Recall,Specificity,F1-score,Accuracy,Bal Accuracy,AUC
0,Orig. XB fitted,"1 class, n=11, a=0.05",0.921053,0.985915,0.142857,0.952381,0.910256,0.564386,0.564386
0,Orig. XB fitted Bal,"1 class, n=11, a=0.05",0.92,0.971831,0.142857,0.945205,0.897436,0.557344,0.557344


## Tune Hyperparameters TF-IDF 

In [18]:
random.seed(3)
pipeline_grid = Pipeline([
    ('tfidf', TfidfVectorizer(lowercase=False)),
    ('clf', OneVsRestClassifier(xgb.XGBClassifier(objective = 'binary:logistic', n_jobs=-1))),
])
param_grid = {
    'tfidf__norm' :('l1', 'l2'),
    'tfidf__max_features' : (300, 350, 400, 450,500),
    'tfidf__sublinear_tf' : [True, False]

}

grid_search_tfidf_xgb = GridSearchCV(pipeline_grid, param_grid, cv=2, verbose=1, n_jobs=-1)
grid_search_tfidf_xgb.fit(X_train_aug, y_train_aug)

print("Best parameters for the TF-IDF: {}".format(grid_search_tfidf_xgb.best_estimator_.steps))

Fitting 2 folds for each of 20 candidates, totalling 40 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.


Best parameters for the TF-IDF: [('tfidf', TfidfVectorizer(lowercase=False, max_features=300, sublinear_tf=True)), ('clf', OneVsRestClassifier(estimator=XGBClassifier(base_score=None, booster=None,
                                            colsample_bylevel=None,
                                            colsample_bynode=None,
                                            colsample_bytree=None, gamma=None,
                                            gpu_id=None, importance_type='gain',
                                            interaction_constraints=None,
                                            learning_rate=None,
                                            max_delta_step=None, max_depth=None,
                                            min_child_weight=None, missing=nan,
                                            monotone_constraints=None,
                                            n_estimators=100, n_jobs=-1,
                                            num_parallel_tree=No

[Parallel(n_jobs=-1)]: Done  40 out of  40 | elapsed:    2.7s finished


In [19]:
tfidf_tune_xb = grid_search_tfidf_xgb.best_estimator_[0]

X_train_tf_tune_xb = tfidf_tune_xb.fit_transform(X_train_aug).toarray()
X_test_tf_tune_xb = tfidf_tune_xb.transform(X_test).toarray()

In [20]:
oversample = SMOTE()
X_train_tf_tune_aug_bal, y_train_tf_tune_aug_bal = oversample.fit_resample(X_train_tf_tune_xb, y_train_aug)

In [21]:
random.seed(3)
xbg_tf_tune = xgb.XGBClassifier(objective = 'binary:logistic', n_jobs=-1)
xbg_tf_tune.fit(X_train_tf_tune_aug_bal, y_train_tf_tune_aug_bal)
xbg_tf_tune_pred = xbg_tf_tune.predict(X_test_tf_tune_xb)

print("Accuracy on the test data: ")
print(metrics.accuracy_score(y_test, xbg_tf_tune_pred))
print("Classification report")
print(metrics.classification_report(y_test, xbg_tf_tune_pred))
print('Confusion Matrix : ')
print(metrics.confusion_matrix(y_test, xbg_tf_tune_pred))

Accuracy on the test data: 
0.8461538461538461
Classification report
              precision    recall  f1-score   support

           0       0.00      0.00      0.00         7
           1       0.90      0.93      0.92        71

    accuracy                           0.85        78
   macro avg       0.45      0.46      0.46        78
weighted avg       0.82      0.85      0.83        78

Confusion Matrix : 
[[ 0  7]
 [ 5 66]]


In [22]:
sc = calc_binary_class_metrics(y_test, xbg_tf_tune_pred, 'XB tf tune', '1 class, n=11, a=0.05')
results_score = results_score.append(sc)
results_score

Unnamed: 0,Model,Data,Precision,Recall,Specificity,F1-score,Accuracy,Bal Accuracy,AUC
0,Orig. XB fitted,"1 class, n=11, a=0.05",0.921053,0.985915,0.142857,0.952381,0.910256,0.564386,0.564386
0,Orig. XB fitted Bal,"1 class, n=11, a=0.05",0.92,0.971831,0.142857,0.945205,0.897436,0.557344,0.557344
0,XB tf tune,"1 class, n=11, a=0.05",0.90411,0.929577,0.0,0.916667,0.846154,0.464789,0.464789


## 4. Data augmentation for both classes
Parameters set to: num_aug = 11 and alpha = 0.05

In [23]:
with open(ospath('~/final_project/F_Extra_Experiments_Abstracts/Data_for_augmentation/X_y_train.txt'), 'w' , encoding="utf-8") as file:
    for lab, text in zip(X_y_train_db['label'],X_y_train_db['text_clean']) :
        file.write(str(lab))
        file.write("\t")
        file.write(text)
        file.write("\n")

In [24]:
X_y_train_aug = pd.read_table(ospath('~/final_project/F_Extra_Experiments_Abstracts/Data_for_augmentation/X_y_train_augmented.txt'), sep = '\t', names=('label', 'text_clean'))

In [25]:
X_y_train_aug

Unnamed: 0,label,text_clean
0,1,solid electrolytic capacitor obtain employ sol...
1,1,solid electrolytic capacitor obtain employ sol...
2,1,square electrolytic capacitor obtain employ sq...
3,1,solid electrolytic capacitor obtain employ sol...
4,1,solid electrolytic capacitor obtain employ sol...
...,...,...
1387,1,provide examples electrochemically active elec...
1388,1,provide examples electrochemically active elec...
1389,1,provide examples electrochemically active elec...
1390,1,provide examples electrochemically active elec...


In [26]:
X_train_both_aug = X_y_train_aug['text_clean']
y_train_both_aug = X_y_train_aug['label']
tfidf = TfidfVectorizer(ngram_range = (1,2) , encoding='utf-8',sublinear_tf=True, max_features = 300, max_df=0.25, min_df=4)

X_train_both_tf_aug = tfidf.fit_transform(X_train_both_aug).toarray()
X_test_tf = tfidf.transform(X_test).toarray()

In [27]:
y_train_both_aug.shape

(1392,)

In [28]:
y_train_both_aug.value_counts()

1    1272
0     120
Name: label, dtype: int64

In [29]:
X_train_both_aug.shape

(1392,)

In [30]:
oversample = SMOTE()
X_train_both_tf_aug_bal, y_train_both_aug_bal = oversample.fit_resample(X_train_both_tf_aug, y_train_both_aug)

In [31]:
random.seed(3)
xbg_both_lab_bal = xgb.XGBClassifier(objective = 'binary:logistic', n_jobs=-1)
xbg_both_lab_bal.fit(X_train_both_tf_aug_bal, y_train_both_aug_bal)
xbg__both_lab_pred = xbg_both_lab_bal.predict(X_test_tf)

print("Accuracy on the test data: ")
print(metrics.accuracy_score(y_test, xbg__both_lab_pred))
print("Classification report")
print(metrics.classification_report(y_test, xbg__both_lab_pred))
print('Confusion Matrix : ')
print(metrics.confusion_matrix(y_test, xbg__both_lab_pred))

Accuracy on the test data: 
0.8717948717948718
Classification report
              precision    recall  f1-score   support

           0       0.20      0.14      0.17         7
           1       0.92      0.94      0.93        71

    accuracy                           0.87        78
   macro avg       0.56      0.54      0.55        78
weighted avg       0.85      0.87      0.86        78

Confusion Matrix : 
[[ 1  6]
 [ 4 67]]


In [32]:
sc = calc_binary_class_metrics(y_test, xbg__both_lab_pred, 'XB not tune', '2 class, n=11, a=0.05')
results_score = results_score.append(sc)
results_score

Unnamed: 0,Model,Data,Precision,Recall,Specificity,F1-score,Accuracy,Bal Accuracy,AUC
0,Orig. XB fitted,"1 class, n=11, a=0.05",0.921053,0.985915,0.142857,0.952381,0.910256,0.564386,0.564386
0,Orig. XB fitted Bal,"1 class, n=11, a=0.05",0.92,0.971831,0.142857,0.945205,0.897436,0.557344,0.557344
0,XB tf tune,"1 class, n=11, a=0.05",0.90411,0.929577,0.0,0.916667,0.846154,0.464789,0.464789
0,XB not tune,"2 class, n=11, a=0.05",0.917808,0.943662,0.142857,0.930556,0.871795,0.54326,0.54326


In [33]:
random.seed(3)
pipeline_grid = Pipeline([
    ('tfidf', TfidfVectorizer(lowercase=False)),
    ('clf', OneVsRestClassifier(xgb.XGBClassifier(objective = 'binary:logistic', n_jobs=-1))),
])
param_grid = {
    'tfidf__max_df': (0.25, 0.5, 0.75),
    'tfidf__min_df' : (2,4,6,8,10),
    'tfidf__max_features' : (300, 350, 400, 450),
    'tfidf__sublinear_tf' : [True, False]

}

grid_search_tfidf_xgb = GridSearchCV(pipeline_grid, param_grid, cv=2, verbose=1, n_jobs=-1)
grid_search_tfidf_xgb.fit(X_train_both_aug, y_train_both_aug)

print("Best parameters for the TF-IDF: {}".format(grid_search_tfidf_xgb.best_estimator_.steps))

Fitting 2 folds for each of 120 candidates, totalling 240 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  26 tasks      | elapsed:    1.2s
[Parallel(n_jobs=-1)]: Done 176 tasks      | elapsed:    6.7s
[Parallel(n_jobs=-1)]: Done 240 out of 240 | elapsed:    8.9s finished


Best parameters for the TF-IDF: [('tfidf', TfidfVectorizer(lowercase=False, max_df=0.25, max_features=450, min_df=6,
                sublinear_tf=True)), ('clf', OneVsRestClassifier(estimator=XGBClassifier(base_score=None, booster=None,
                                            colsample_bylevel=None,
                                            colsample_bynode=None,
                                            colsample_bytree=None, gamma=None,
                                            gpu_id=None, importance_type='gain',
                                            interaction_constraints=None,
                                            learning_rate=None,
                                            max_delta_step=None, max_depth=None,
                                            min_child_weight=None, missing=nan,
                                            monotone_constraints=None,
                                            n_estimators=100, n_jobs=-1,
                         

In [34]:
tfidf = grid_search_tfidf_xgb.best_estimator_[0]

X_train_both_tf_aug_tune = tfidf.fit_transform(X_train_both_aug).toarray()
X_test_tf_tune = tfidf.transform(X_test).toarray()#test data from the manually labelled dataset

In [35]:
oversample = SMOTE()
X_train_both_tf_aug_bal_tune, y_train_both_aug_bal_tune = oversample.fit_resample(X_train_both_tf_aug_tune, y_train_both_aug)

In [36]:
random.seed(3)
xbg_both_lab_bal_tune = xgb.XGBClassifier(objective = 'binary:logistic', n_jobs=-1)
xbg_both_lab_bal_tune.fit(X_train_both_tf_aug_bal_tune, y_train_both_aug_bal_tune)
xbg__both_lab_pred_tune = xbg_both_lab_bal_tune.predict(X_test_tf_tune)

print("Accuracy on the test data: ")
print(metrics.accuracy_score(y_test, xbg__both_lab_pred_tune))
print("Classification report")
print(metrics.classification_report(y_test, xbg__both_lab_pred_tune))
print('Confusion Matrix : ')
print(metrics.confusion_matrix(y_test, xbg__both_lab_pred_tune))

Accuracy on the test data: 
0.8717948717948718
Classification report
              precision    recall  f1-score   support

           0       0.20      0.14      0.17         7
           1       0.92      0.94      0.93        71

    accuracy                           0.87        78
   macro avg       0.56      0.54      0.55        78
weighted avg       0.85      0.87      0.86        78

Confusion Matrix : 
[[ 1  6]
 [ 4 67]]


In [37]:
sc = calc_binary_class_metrics(y_test, xbg__both_lab_pred_tune, 'XB TF tune', '2 class, n=11, a=0.05')
results_score = results_score.append(sc)
results_score

Unnamed: 0,Model,Data,Precision,Recall,Specificity,F1-score,Accuracy,Bal Accuracy,AUC
0,Orig. XB fitted,"1 class, n=11, a=0.05",0.921053,0.985915,0.142857,0.952381,0.910256,0.564386,0.564386
0,Orig. XB fitted Bal,"1 class, n=11, a=0.05",0.92,0.971831,0.142857,0.945205,0.897436,0.557344,0.557344
0,XB tf tune,"1 class, n=11, a=0.05",0.90411,0.929577,0.0,0.916667,0.846154,0.464789,0.464789
0,XB not tune,"2 class, n=11, a=0.05",0.917808,0.943662,0.142857,0.930556,0.871795,0.54326,0.54326
0,XB TF tune,"2 class, n=11, a=0.05",0.917808,0.943662,0.142857,0.930556,0.871795,0.54326,0.54326


### None of them have improved the original binary classification model trained with manually labelled data.