In [1]:
import random
from numpy.random import seed
seed(3)
import numpy as np
np.random.default_rng
import warnings
warnings.filterwarnings("ignore")
import pandas as pd
from pandas import read_csv
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
import pickle
from os.path import expanduser as ospath
import time

import xgboost as xgb
from sklearn.model_selection import cross_val_score, train_test_split, StratifiedKFold
from sklearn.pipeline import Pipeline, FeatureUnion, make_pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, fbeta_score, make_scorer, precision_score, accuracy_score, f1_score,  recall_score, confusion_matrix, roc_auc_score
from sklearn.svm import SVC
from sklearn import metrics
from sklearn.multiclass import OneVsRestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer

from gensim.models import Word2Vec
import gensim

# 1. Load data set with cleaned text

In [2]:
with open(ospath('~/code_final_project/B_Data_pre_processing/Pickle_Files/Abstracts_cleaned.pickle'), 'rb') as data:
    db = pickle.load(data)

In [3]:
db.head()

Unnamed: 0,publn_nr,type,label,text,text_clean
0,4254843,Abstract,product,An electrically powered vehicle having a bank ...,electrically power vehicle bank batteries supp...
20,5680032,Abstract,product,During forward motion of an electrically-power...,forward motion electrically-powered vehicle ai...
51,5272378,Abstract,product,An apparatus for generating power by utilizing...,apparatus generate power utilize wind produce ...
68,4309620,Abstract,product,"In a preferred embodiment, the flywheel electr...",prefer embodiment flywheel electric transmissi...
118,5514923,Abstract,product,A high efficiency multi-phasic type DC motor i...,high efficiency multi-phasic type dc motor inc...


In [4]:
db.groupby(by='label').count()

Unnamed: 0_level_0,publn_nr,type,text,text_clean
label,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
process,13,13,13,13
product,154,154,154,154
product and process,23,23,23,23
use claim,4,4,4,4


This convert name labels form string to numbers and merge into only two classes 

In [5]:
db['label'] = db.label.replace(to_replace=['product', 'process', 'use claim', 'product and process'], value=[1, 0, 0,1])
db.groupby(by='label').count()

Unnamed: 0_level_0,publn_nr,type,text,text_clean
label,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,17,17,17,17
1,177,177,177,177


## 2. Split the labelled data

In [6]:
y = db['label']# for stratify function

split into train and test datasets

In [7]:
X_train, X_test, y_train , y_test = train_test_split(db['text_clean'], 
                                                    db['label'], test_size=0.4, random_state=8, stratify = y)

In [8]:
model_w2v = gensim.models.Word2Vec.load(ospath('~/code_final_project/C_Feature_extraction/model_w2v_300_uspto_epo'))

In [9]:
class MeanEmbeddingVectorizer(object):
    
    """This class builds the features
    by averanging the word vectors
    for the words in the text"""
    
    def __init__(self, word2vec, dim):
        self.word2vec = word2vec
        self.dim = dim

    def fit(self, X, y):
        return self

    def transform(self, X):
        return np.array([
            np.mean([self.word2vec[w] for w in words if w in self.word2vec]
                    or [np.zeros(self.dim)], axis=0)
            for words in X
        ])
# reference http://nadbordrozd.github.io/blog/2016/05/20/text-classification-with-word2vec/

In [10]:
w2v = {w: vec for w, vec in zip(model_w2v.wv.index2word, model_w2v.wv.syn0)}

In [11]:

#pipelines of the models with different vectorization methods
logi_model_tfidf = Pipeline([("tfidf", TfidfVectorizer(lowercase=False,ngram_range=(1,2), max_features=300,sublinear_tf=True)),  ('LOGI', LogisticRegression(random_state = 0, n_jobs=-1))])
logi_model_w2v = Pipeline([("word2vec",MeanEmbeddingVectorizer(w2v, 300)) , ('LOGI', LogisticRegression(random_state = 0,n_jobs=-1))])

xgb_model_tfidf = Pipeline([("tfidf", TfidfVectorizer(lowercase=False,ngram_range=(1,2), max_features=300,sublinear_tf=True)),  ('XGB', xgb.XGBClassifier(objective = 'binary:logistic', n_jobs=-1))])
xgb_model_w2v = Pipeline([("word2vec",MeanEmbeddingVectorizer(w2v,300)) , ('XGB', xgb.XGBClassifier(objective = 'binary:logistic', n_jobs=-1))])

SVM_model_tfidf = Pipeline([("tfidf", TfidfVectorizer(lowercase=False,ngram_range=(1,2), max_features=300,sublinear_tf=True)) , ('SVM', SVC(random_state = 0))])
SVM_model_w2v = Pipeline([("word2vec",MeanEmbeddingVectorizer(w2v, 300)) , ('SVM', SVC(random_state = 0))])

RFC_model_tfidf = Pipeline([("tfidf", TfidfVectorizer(lowercase=False,ngram_range=(1,2), max_features=300,sublinear_tf=True)),('RFC', RandomForestClassifier(n_jobs=-1))])
RFC_model_w2v = Pipeline([("word2vec",MeanEmbeddingVectorizer(w2v, 300)),('RFC', RandomForestClassifier(random_state = 0, n_jobs=-1))])

models = []
models.append(('LOGI_tfidf', logi_model_tfidf))
models.append(('LOGI_w2v', logi_model_w2v))
models.append(('XGB_tfidf', xgb_model_tfidf))
models.append(('xgb__w2v', xgb_model_w2v))
models.append(('SVM_tfidf', SVM_model_tfidf))
models.append(('SVM_w2v', SVM_model_w2v))
models.append(('RFC_tfidf', RFC_model_tfidf))
models.append(('RFC_w2v', RFC_model_w2v))

#for the parameters scoring 
scoring_acc = metrics.make_scorer(metrics.accuracy_score)
scoring_prec = metrics.make_scorer(metrics.precision_score)
scoring_rec = metrics.make_scorer(metrics.recall_score)
scoring_f1 = metrics.make_scorer(metrics.f1_score)

score_list = []
score_list.append(scoring_acc)
score_list.append(scoring_prec)
score_list.append(scoring_rec)
score_list.append(scoring_f1)

names_score = ['acc', 'prec', 'rec', 'f1']
name_score_model = []

#to store results of cv in a data set
results_std = []
results_mean = []
results = []

#
time_models =[]

for name, model in models:
    for score, name_score in zip(score_list, names_score):
        kfold = StratifiedKFold(n_splits=5, random_state=1)
        
        #timing the model
        start = time.time()
        results_cv = cross_val_score(model, X_train, y_train, cv=kfold, scoring= score)
        end = time.time()
        
        results_mean.append(round(results_cv.mean(),5))
        results_std.append(round(results_cv.std(),5))
        results.append(results_cv)
        time_models.append(round(end - start, 5))
        name_score_model.append((name, name_score))
        
name_dict_db_results = [' '.join(map(str,name_score_model[i])) for i in range(len(name_score_model))]
dictionary_results = {'Model': name_dict_db_results, 'Mean': results_mean, 'Std': results_std, 'time_models' :time_models } 
db_results = pd.DataFrame(dictionary_results)

#Adapted from - Brownlee, J. (2016) Machine learning mastery with Python: understand your data, create accurate models, and work projects end-to-end.

In [12]:
db_results

Unnamed: 0,Model,Mean,Std,time_models
0,LOGI_tfidf acc,0.91377,0.00145,2.51451
1,LOGI_tfidf prec,0.91377,0.00145,1.81618
2,LOGI_tfidf rec,1.0,0.0,0.79388
3,LOGI_tfidf f1,0.95494,0.00079,0.12168
4,LOGI_w2v acc,0.91377,0.00145,34.86831
5,LOGI_w2v prec,0.91377,0.00145,35.81436
6,LOGI_w2v rec,1.0,0.0,37.58057
7,LOGI_w2v f1,0.95494,0.00079,38.35905
8,XGB_tfidf acc,0.89638,0.02193,1.59075
9,XGB_tfidf prec,0.91219,0.00285,0.61336
