In [1]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import matplotlib.cm as cm
import ssl
try:
    _create_unverified_https_context = ssl._create_unverified_context
except AttributeError:
    pass
else:
    ssl._create_default_https_context = _create_unverified_https_context


from gensim.models import Word2Vec
from time import time 
import multiprocessing
import logging  # logger
logging.basicConfig(format="%(levelname)s - %(asctime)s: %(message)s", datefmt= '%H:%M:%S', level=logging.INFO)

from sklearn.manifold import TSNE

import warnings
warnings.filterwarnings('ignore')

# pd.set_option('display.max_colwidth', None)

from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.decomposition import PCA

from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, roc_auc_score, roc_curve, auc

In [2]:
import pickle
import os

In [6]:
data_file_path = os.path.abspath(os.path.join(os.pardir,'data','cleaned_AG.csv'))
data = pd.read_csv(data_file_path)
data.head()

Unnamed: 0,News Category,Documents,Word Count,Noun Phrases,Noun Count,Adjective Count,Verb Count
0,business,wall st bear claw back black reuters reuters s...,18,4,12,3,2
1,business,carlyle look toward commercial aerospace reute...,27,5,15,4,3
2,business,oil economy cloud stock outlook reuters reuter...,24,5,17,4,2
3,business,iraq halt oil export main southern pipeline re...,28,3,19,6,3
4,business,oil price soar time record posing new menace e...,28,4,16,7,3


# Drop Title & Description

In [7]:
data['Documents'] = data['Documents'].str.split(" ")
data.head()

Unnamed: 0,News Category,Documents,Word Count,Noun Phrases,Noun Count,Adjective Count,Verb Count
0,business,"[wall, st, bear, claw, back, black, reuters, r...",18,4,12,3,2
1,business,"[carlyle, look, toward, commercial, aerospace,...",27,5,15,4,3
2,business,"[oil, economy, cloud, stock, outlook, reuters,...",24,5,17,4,2
3,business,"[iraq, halt, oil, export, main, southern, pipe...",28,3,19,6,3
4,business,"[oil, price, soar, time, record, posing, new, ...",28,4,16,7,3


# Word Embeddings using Word2Vec algorithm

In [8]:
cores = multiprocessing.cpu_count()

# initializing word2vec model
model = Word2Vec(min_count=20,
                     window=2, # window size for context 
                     vector_size=100,  # no of features 
                     sample=6e-5, 
                     alpha=0.03, 
                     min_alpha=0.0007, 
                     negative=20,
                     workers=multiprocessing.cpu_count()-1)

INFO - 00:20:41: Word2Vec lifecycle event {'params': 'Word2Vec(vocab=0, vector_size=100, alpha=0.03)', 'datetime': '2021-05-09T00:20:41.325171', 'gensim': '4.0.1', 'python': '3.7.5 (default, Oct 25 2019, 10:52:18) \n[Clang 4.0.1 (tags/RELEASE_401/final)]', 'platform': 'Darwin-19.6.0-x86_64-i386-64bit', 'event': 'created'}


In [9]:
t = time()

# build vocabulary
model.build_vocab(data['Documents'], progress_per=10000)

print('Time to build vocab: {} mins'.format(round((time() - t) / 60, 2)))

INFO - 00:20:41: collecting all words and their counts
INFO - 00:20:41: PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
INFO - 00:20:41: PROGRESS: at sentence #10000, processed 273130 words, keeping 18981 word types
INFO - 00:20:42: PROGRESS: at sentence #20000, processed 542961 words, keeping 26350 word types
INFO - 00:20:42: PROGRESS: at sentence #30000, processed 810845 words, keeping 31364 word types
INFO - 00:20:42: PROGRESS: at sentence #40000, processed 1079542 words, keeping 35363 word types
INFO - 00:20:42: PROGRESS: at sentence #50000, processed 1347719 words, keeping 38795 word types
INFO - 00:20:42: PROGRESS: at sentence #60000, processed 1617217 words, keeping 42012 word types
INFO - 00:20:42: PROGRESS: at sentence #70000, processed 1887970 words, keeping 44832 word types
INFO - 00:20:42: PROGRESS: at sentence #80000, processed 2156362 words, keeping 47460 word types
INFO - 00:20:42: PROGRESS: at sentence #90000, processed 2420763 words, keeping 50103 wor

Time to build vocab: 0.02 mins


In [10]:
t = time()

#train word2vec model 
model.train(data['Documents'], total_examples=model.corpus_count, epochs=30, report_delay=1)

print('Time to train the model: {} mins'.format(round((time() - t) / 60, 2)))

INFO - 00:20:43: Word2Vec lifecycle event {'msg': 'training model with 7 workers on 12033 vocabulary and 100 features, using sg=0 hs=0 sample=6e-05 negative=20 window=2', 'datetime': '2021-05-09T00:20:43.037260', 'gensim': '4.0.1', 'python': '3.7.5 (default, Oct 25 2019, 10:52:18) \n[Clang 4.0.1 (tags/RELEASE_401/final)]', 'platform': 'Darwin-19.6.0-x86_64-i386-64bit', 'event': 'train'}
INFO - 00:20:44: EPOCH 1 - PROGRESS: at 40.37% examples, 799298 words/s, in_qsize 13, out_qsize 0
INFO - 00:20:45: EPOCH 1 - PROGRESS: at 78.09% examples, 768041 words/s, in_qsize 12, out_qsize 1
INFO - 00:20:45: worker thread finished; awaiting finish of 6 more threads
INFO - 00:20:45: worker thread finished; awaiting finish of 5 more threads
INFO - 00:20:45: worker thread finished; awaiting finish of 4 more threads
INFO - 00:20:45: worker thread finished; awaiting finish of 3 more threads
INFO - 00:20:45: worker thread finished; awaiting finish of 2 more threads
INFO - 00:20:45: worker thread finished

INFO - 00:21:11: worker thread finished; awaiting finish of 0 more threads
INFO - 00:21:11: EPOCH - 9 : training on 3420491 raw words (1974526 effective words) took 3.5s, 570533 effective words/s
INFO - 00:21:12: EPOCH 10 - PROGRESS: at 28.97% examples, 575823 words/s, in_qsize 13, out_qsize 0
INFO - 00:21:13: EPOCH 10 - PROGRESS: at 55.71% examples, 550810 words/s, in_qsize 11, out_qsize 2
INFO - 00:21:14: EPOCH 10 - PROGRESS: at 86.34% examples, 565840 words/s, in_qsize 12, out_qsize 1
INFO - 00:21:14: worker thread finished; awaiting finish of 6 more threads
INFO - 00:21:14: worker thread finished; awaiting finish of 5 more threads
INFO - 00:21:14: worker thread finished; awaiting finish of 4 more threads
INFO - 00:21:14: worker thread finished; awaiting finish of 3 more threads
INFO - 00:21:14: worker thread finished; awaiting finish of 2 more threads
INFO - 00:21:14: worker thread finished; awaiting finish of 1 more threads
INFO - 00:21:14: worker thread finished; awaiting finish 

INFO - 00:21:45: worker thread finished; awaiting finish of 6 more threads
INFO - 00:21:45: worker thread finished; awaiting finish of 5 more threads
INFO - 00:21:45: worker thread finished; awaiting finish of 4 more threads
INFO - 00:21:45: worker thread finished; awaiting finish of 3 more threads
INFO - 00:21:45: worker thread finished; awaiting finish of 2 more threads
INFO - 00:21:45: worker thread finished; awaiting finish of 1 more threads
INFO - 00:21:45: worker thread finished; awaiting finish of 0 more threads
INFO - 00:21:45: EPOCH - 18 : training on 3420491 raw words (1974944 effective words) took 4.2s, 472398 effective words/s
INFO - 00:21:46: EPOCH 19 - PROGRESS: at 21.40% examples, 425696 words/s, in_qsize 14, out_qsize 3
INFO - 00:21:47: EPOCH 19 - PROGRESS: at 45.87% examples, 450257 words/s, in_qsize 12, out_qsize 1
INFO - 00:21:48: EPOCH 19 - PROGRESS: at 72.17% examples, 469710 words/s, in_qsize 13, out_qsize 0
INFO - 00:21:49: EPOCH 19 - PROGRESS: at 96.29% examples

INFO - 00:22:18: worker thread finished; awaiting finish of 5 more threads
INFO - 00:22:18: worker thread finished; awaiting finish of 4 more threads
INFO - 00:22:18: worker thread finished; awaiting finish of 3 more threads
INFO - 00:22:18: worker thread finished; awaiting finish of 2 more threads
INFO - 00:22:18: worker thread finished; awaiting finish of 1 more threads
INFO - 00:22:18: worker thread finished; awaiting finish of 0 more threads
INFO - 00:22:18: EPOCH - 26 : training on 3420491 raw words (1974535 effective words) took 3.8s, 516170 effective words/s
INFO - 00:22:19: EPOCH 27 - PROGRESS: at 22.56% examples, 426657 words/s, in_qsize 12, out_qsize 1
INFO - 00:22:20: EPOCH 27 - PROGRESS: at 46.17% examples, 443750 words/s, in_qsize 14, out_qsize 0
INFO - 00:22:21: EPOCH 27 - PROGRESS: at 69.22% examples, 444543 words/s, in_qsize 13, out_qsize 0
INFO - 00:22:22: EPOCH 27 - PROGRESS: at 91.05% examples, 440075 words/s, in_qsize 13, out_qsize 1
INFO - 00:22:23: worker thread f

Time to train the model: 1.96 mins


In [15]:
# save model
model.save('model.bin')

INFO - 22:21:22: Word2Vec lifecycle event {'fname_or_handle': 'model.bin', 'separately': 'None', 'sep_limit': 10485760, 'ignore': frozenset(), 'datetime': '2021-04-29T22:21:22.924489', 'gensim': '4.0.1', 'python': '3.8.8 (default, Apr 13 2021, 19:58:26) \n[GCC 7.3.0]', 'platform': 'Linux-4.15.0-142-generic-x86_64-with-glibc2.10', 'event': 'saving'}
INFO - 22:21:22: not storing attribute cum_table
INFO - 22:21:22: saved model.bin


# Feature Extraction (word2vec)

Base line number of features = 100

In [11]:
def make_feature_vec(words, model, num_features):
    """
    Average the word vectors for a set of words
    """
    feature_vec = np.zeros((num_features,),dtype="float32")  # pre-initialize (for speed)
    nwords = 0.
    index2word_set = set(model.wv.index_to_key)  # words known to the model

    for word in words:
        if word in index2word_set: 
            nwords = nwords + 1.
            feature_vec = np.add(feature_vec,model.wv[word])
    
    feature_vec = np.divide(feature_vec, nwords)
    return feature_vec


def get_avg_feature_vecs(words, model, num_features):
    """
    Calculate average feature vectors for all headlines 
    """
    counter = 0
    feature_vecs = np.zeros((len(words),num_features), dtype='float32')  # pre-initialize (for speed)
    
    for word in words:
        feature_vecs[counter] = make_feature_vec(word, model, num_features)
        counter = counter + 1
    return feature_vecs

In [12]:
word2vec = get_avg_feature_vecs(data['Documents'], model, 100)

In [14]:
# remove instances in test set that could not be represented as feature vectors
nan_indices = list({x for x,y in np.argwhere(np.isnan(word2vec))})
if len(nan_indices) > 0:
    print('Removing {:d} instances from test set.'.format(len(nan_indices)))
    word2vec = np.delete(word2vec, nan_indices, axis=0)
    word2vec.drop(data.iloc[nan_indices, :].index, axis=0, inplace=True)
    assert word2vec.shape[0] == len(data)

In [15]:
w2v = pd.DataFrame(word2vec)

data.reset_index(drop=True, inplace=True)
w2v.reset_index(drop=True, inplace=True)
#df = pd.concat([df1, df2], axis=1)
w2v = pd.concat([data[['News Category','Word Count','Noun Phrases','Noun Count',
                                         'Adjective Count','Verb Count']],w2v],axis=1)

w2v

Unnamed: 0,News Category,Word Count,Noun Phrases,Noun Count,Adjective Count,Verb Count,0,1,2,3,...,90,91,92,93,94,95,96,97,98,99
0,business,18,4,12,3,2,0.055361,0.005058,0.489468,0.389286,...,0.621230,-0.032878,-0.113240,-0.350542,-0.084564,0.141340,0.004108,-0.028677,-0.228371,-0.661437
1,business,27,5,15,4,3,0.206475,-0.223033,0.390895,0.132415,...,0.237479,-0.250275,-0.061764,-0.402763,-0.023698,-0.627297,-0.115820,0.137561,-0.042825,-0.301519
2,business,24,5,17,4,2,0.209188,-0.541559,0.265383,0.272109,...,0.819905,-0.628630,-0.246094,0.172992,-0.061221,0.641063,0.570838,0.560289,0.358528,-0.670001
3,business,28,3,19,6,3,-0.894695,-0.992983,-0.298582,-0.362631,...,0.380259,-0.767296,-0.975848,-0.347559,-0.095349,0.726570,0.460381,0.469100,0.352567,-0.934525
4,business,28,4,16,7,3,0.044101,-0.751984,0.135745,0.037640,...,0.716525,-0.143298,-0.249741,-0.093775,0.275794,0.498189,0.350819,0.324995,0.294263,-0.212797
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
127595,world,19,3,9,4,5,0.022802,-0.905938,-0.137237,0.068300,...,-0.012326,-0.463617,0.058749,-0.627807,0.232156,-0.321579,-0.591594,0.076990,-0.105051,0.569126
127596,sports,41,7,17,9,10,-0.211614,-0.008620,-0.086450,0.280564,...,0.563488,-0.361268,-0.459734,-0.018276,0.077644,-0.300676,-0.209231,-0.541929,0.168749,-0.367598
127597,sports,20,5,9,3,3,0.097023,-0.319842,-0.016024,-0.086342,...,0.534790,-0.440539,-0.260372,-0.423392,0.239280,-0.125498,-0.429946,-0.387016,0.422862,-0.818029
127598,business,21,4,10,4,4,-0.810561,-0.740219,-0.542993,0.131120,...,-0.410788,-0.506708,-0.501324,-0.352074,0.023445,-0.428003,0.191768,0.358949,0.569395,0.283652


In [16]:
# X dataframe 
X = w2v.drop(['News Category'],axis=1) 
# y series
y = w2v['News Category']

X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.3)

# Classifications Models

# Logistic Regression Classifier

In [17]:
from sklearn.linear_model import LogisticRegression

In [3]:
logit = LogisticRegression()

print("Fitting a Logistic Regression model to labeled training data...")

start_time = time()

logit.fit(X_train,y_train)pre

lr_tt = round((time() - start_time) / 60, 2)

print('Time to train Logistic Regression Model: {} mins'.format(lr_tt))


Fitting a Logistic Regression model to labeled training data...


NameError: name 'X_train' is not defined

Logistic Regression

In [19]:
lr = LogisticRegression(solver='newton-cg')

skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=1)
lr_acc = []

for train_index, vaild_index in skf.split(X_train,y_train):
    x_t, x_v = X_train.iloc[train_index], X_train.iloc[vaild_index]
    y_t, y_v = y_train.iloc[train_index], y_train.iloc[vaild_index]
    lr.fit(x_t, y_t)
    lr_acc.append(lr.score(x_v, y_v))

In [20]:
print(lr_acc)

[0.8866435288849082, 0.8955441110613525, 0.8924652933273622, 0.8947044334975369, 0.8905620241827138]


In [21]:
print(np.mean(lr_acc))

0.8919838781907747


Model Evaluation

In [22]:
from sklearn.preprocessing import OneHotEncoder

y_test = y_test.to_numpy()
y_pred = lr.predict(X_test)

lr_acc = accuracy_score(y_test,y_pred)
lr_recall = recall_score(y_test,y_pred,average='macro')
lr_precision = precision_score(y_test,y_pred,average='macro')
lr_f1 = f1_score(y_test,y_pred,average='macro')

y_pred_roc = OneHotEncoder().fit_transform(y_pred.reshape(-1, 1)).toarray()
y_test_roc = OneHotEncoder().fit_transform(y_test.reshape(-1, 1)).toarray()
lr_roc = roc_auc_score(y_test_roc,y_pred_roc,multi_class='ovo')

In [23]:
print("Logistic Regression: ",lr_acc,lr_recall,lr_precision,lr_f1,lr_roc)

Logistic Regression:  0.8925548589341693 0.8926048131834262 0.8924548445701799 0.8924284148593548 0.9283909985869295


# Save the Model

In [27]:
Pkl_Filename = "Pickle_RL_Model.pkl"  

with open(Pkl_Filename, 'wb') as file:  
    pickle.dump(lr, file)