In [9]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import matplotlib.cm as cm
import ssl
try:
    _create_unverified_https_context = ssl._create_unverified_context
except AttributeError:
    pass
else:
    ssl._create_default_https_context = _create_unverified_https_context


from gensim.models import Word2Vec
from time import time 
import multiprocessing
import logging  # logger
logging.basicConfig(format="%(levelname)s - %(asctime)s: %(message)s", datefmt= '%H:%M:%S', level=logging.INFO)

from sklearn.manifold import TSNE

import warnings
warnings.filterwarnings('ignore')

pd.set_option('display.max_colwidth', None)

from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.decomposition import PCA

from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, roc_auc_score, roc_curve, auc

In [10]:
data = pd.read_csv('cleaned_AG.csv')
data.head()

Unnamed: 0,News Category,Documents,Word Count,Noun Phrases,Noun Count,Adjective Count,Verb Count
0,business,wall st bear claw back black reuters reuters short seller wall street dwindling band ultra cynic seeing green,18,4,12,3,2
1,business,carlyle look toward commercial aerospace reuters reuters private investment firm carlyle group reputation making well timed occasionally controversial play defense industry quietly placed bet another part market,27,5,15,4,3
2,business,oil economy cloud stock outlook reuters reuters soaring crude price plus worry economy outlook earnings expected hang stock market next week depth summer doldrums,24,5,17,4,2
3,business,iraq halt oil export main southern pipeline reuters reuters authority halted oil export flow main pipeline southern iraq intelligence showed rebel militia strike infrastructure oil official said saturday,28,3,19,6,3
4,business,oil price soar time record posing new menace economy afp afp tearaway world oil price toppling record straining wallet present new economic menace barely three month presidential election,28,4,16,7,3


# Drop Title & Description

In [11]:
data['Documents'] = data['Documents'].str.split(" ")
data.head()

Unnamed: 0,News Category,Documents,Word Count,Noun Phrases,Noun Count,Adjective Count,Verb Count
0,business,"[wall, st, bear, claw, back, black, reuters, reuters, short, seller, wall, street, dwindling, band, ultra, cynic, seeing, green]",18,4,12,3,2
1,business,"[carlyle, look, toward, commercial, aerospace, reuters, reuters, private, investment, firm, carlyle, group, reputation, making, well, timed, occasionally, controversial, play, defense, industry, quietly, placed, bet, another, part, market]",27,5,15,4,3
2,business,"[oil, economy, cloud, stock, outlook, reuters, reuters, soaring, crude, price, plus, worry, economy, outlook, earnings, expected, hang, stock, market, next, week, depth, summer, doldrums]",24,5,17,4,2
3,business,"[iraq, halt, oil, export, main, southern, pipeline, reuters, reuters, authority, halted, oil, export, flow, main, pipeline, southern, iraq, intelligence, showed, rebel, militia, strike, infrastructure, oil, official, said, saturday]",28,3,19,6,3
4,business,"[oil, price, soar, time, record, posing, new, menace, economy, afp, afp, tearaway, world, oil, price, toppling, record, straining, wallet, present, new, economic, menace, barely, three, month, presidential, election]",28,4,16,7,3


# Word Embeddings using Word2Vec algorithm

In [12]:
cores = multiprocessing.cpu_count()

# initializing word2vec model
model = Word2Vec(min_count=20,
                     window=2, # window size for context 
                     vector_size=100,  # no of features 
                     sample=6e-5, 
                     alpha=0.03, 
                     min_alpha=0.0007, 
                     negative=20,
                     workers=multiprocessing.cpu_count()-1)

INFO - 22:20:26: Word2Vec lifecycle event {'params': 'Word2Vec(vocab=0, vector_size=100, alpha=0.03)', 'datetime': '2021-04-29T22:20:26.365065', 'gensim': '4.0.1', 'python': '3.8.8 (default, Apr 13 2021, 19:58:26) \n[GCC 7.3.0]', 'platform': 'Linux-4.15.0-142-generic-x86_64-with-glibc2.10', 'event': 'created'}


In [13]:
t = time()

# build vocabulary
model.build_vocab(data['Documents'], progress_per=10000)

print('Time to build vocab: {} mins'.format(round((time() - t) / 60, 2)))

INFO - 22:20:26: collecting all words and their counts
INFO - 22:20:26: PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
INFO - 22:20:26: PROGRESS: at sentence #10000, processed 273130 words, keeping 18981 word types
INFO - 22:20:26: PROGRESS: at sentence #20000, processed 542961 words, keeping 26350 word types
INFO - 22:20:26: PROGRESS: at sentence #30000, processed 810845 words, keeping 31364 word types
INFO - 22:20:26: PROGRESS: at sentence #40000, processed 1079542 words, keeping 35363 word types
INFO - 22:20:26: PROGRESS: at sentence #50000, processed 1347719 words, keeping 38795 word types
INFO - 22:20:26: PROGRESS: at sentence #60000, processed 1617217 words, keeping 42012 word types
INFO - 22:20:26: PROGRESS: at sentence #70000, processed 1887970 words, keeping 44832 word types
INFO - 22:20:26: PROGRESS: at sentence #80000, processed 2156362 words, keeping 47460 word types
INFO - 22:20:26: PROGRESS: at sentence #90000, processed 2420763 words, keeping 50103 wor

Time to build vocab: 0.01 mins


In [14]:
t = time()

#train word2vec model 
model.train(data['Documents'], total_examples=model.corpus_count, epochs=30, report_delay=1)

print('Time to train the model: {} mins'.format(round((time() - t) / 60, 2)))

INFO - 22:20:27: Word2Vec lifecycle event {'msg': 'training model with 7 workers on 12033 vocabulary and 100 features, using sg=0 hs=0 sample=6e-05 negative=20 window=2', 'datetime': '2021-04-29T22:20:27.209792', 'gensim': '4.0.1', 'python': '3.8.8 (default, Apr 13 2021, 19:58:26) \n[GCC 7.3.0]', 'platform': 'Linux-4.15.0-142-generic-x86_64-with-glibc2.10', 'event': 'train'}
INFO - 22:20:28: EPOCH 1 - PROGRESS: at 49.07% examples, 972523 words/s, in_qsize 13, out_qsize 0
INFO - 22:20:29: worker thread finished; awaiting finish of 6 more threads
INFO - 22:20:29: worker thread finished; awaiting finish of 5 more threads
INFO - 22:20:29: worker thread finished; awaiting finish of 4 more threads
INFO - 22:20:29: EPOCH 1 - PROGRESS: at 99.24% examples, 976533 words/s, in_qsize 3, out_qsize 1
INFO - 22:20:29: worker thread finished; awaiting finish of 3 more threads
INFO - 22:20:29: worker thread finished; awaiting finish of 2 more threads
INFO - 22:20:29: worker thread finished; awaiting fi

INFO - 22:20:48: worker thread finished; awaiting finish of 3 more threads
INFO - 22:20:48: worker thread finished; awaiting finish of 2 more threads
INFO - 22:20:48: worker thread finished; awaiting finish of 1 more threads
INFO - 22:20:48: worker thread finished; awaiting finish of 0 more threads
INFO - 22:20:48: EPOCH - 11 : training on 3420491 raw words (1975058 effective words) took 1.8s, 1072665 effective words/s
INFO - 22:20:49: EPOCH 12 - PROGRESS: at 54.25% examples, 1077895 words/s, in_qsize 13, out_qsize 0
INFO - 22:20:50: worker thread finished; awaiting finish of 6 more threads
INFO - 22:20:50: worker thread finished; awaiting finish of 5 more threads
INFO - 22:20:50: worker thread finished; awaiting finish of 4 more threads
INFO - 22:20:50: worker thread finished; awaiting finish of 3 more threads
INFO - 22:20:50: worker thread finished; awaiting finish of 2 more threads
INFO - 22:20:50: worker thread finished; awaiting finish of 1 more threads
INFO - 22:20:50: worker thr

INFO - 22:21:08: worker thread finished; awaiting finish of 3 more threads
INFO - 22:21:08: worker thread finished; awaiting finish of 2 more threads
INFO - 22:21:08: worker thread finished; awaiting finish of 1 more threads
INFO - 22:21:08: worker thread finished; awaiting finish of 0 more threads
INFO - 22:21:08: EPOCH - 22 : training on 3420491 raw words (1974343 effective words) took 1.8s, 1087540 effective words/s
INFO - 22:21:09: EPOCH 23 - PROGRESS: at 55.13% examples, 1091738 words/s, in_qsize 13, out_qsize 0
INFO - 22:21:10: worker thread finished; awaiting finish of 6 more threads
INFO - 22:21:10: worker thread finished; awaiting finish of 5 more threads
INFO - 22:21:10: worker thread finished; awaiting finish of 4 more threads
INFO - 22:21:10: worker thread finished; awaiting finish of 3 more threads
INFO - 22:21:10: worker thread finished; awaiting finish of 2 more threads
INFO - 22:21:10: worker thread finished; awaiting finish of 1 more threads
INFO - 22:21:10: worker thr

Time to train the model: 0.93 mins


In [15]:
# save model
model.save('model.bin')

INFO - 22:21:22: Word2Vec lifecycle event {'fname_or_handle': 'model.bin', 'separately': 'None', 'sep_limit': 10485760, 'ignore': frozenset(), 'datetime': '2021-04-29T22:21:22.924489', 'gensim': '4.0.1', 'python': '3.8.8 (default, Apr 13 2021, 19:58:26) \n[GCC 7.3.0]', 'platform': 'Linux-4.15.0-142-generic-x86_64-with-glibc2.10', 'event': 'saving'}
INFO - 22:21:22: not storing attribute cum_table
INFO - 22:21:22: saved model.bin


# Feature Extraction (word2vec)

Base line number of features = 100

In [16]:
def make_feature_vec(words, model, num_features):
    """
    Average the word vectors for a set of words
    """
    feature_vec = np.zeros((num_features,),dtype="float32")  # pre-initialize (for speed)
    nwords = 0.
    index2word_set = set(model.wv.index_to_key)  # words known to the model

    for word in words:
        if word in index2word_set: 
            nwords = nwords + 1.
            feature_vec = np.add(feature_vec,model.wv[word])
    
    feature_vec = np.divide(feature_vec, nwords)
    return feature_vec


def get_avg_feature_vecs(words, model, num_features):
    """
    Calculate average feature vectors for all headlines 
    """
    counter = 0
    feature_vecs = np.zeros((len(words),num_features), dtype='float32')  # pre-initialize (for speed)
    
    for word in words:
        feature_vecs[counter] = make_feature_vec(word, model, num_features)
        counter = counter + 1
    return feature_vecs

In [17]:
word2vec = get_avg_feature_vecs(data['Documents'], model, 100)

In [18]:
# remove instances in test set that could not be represented as feature vectors
nan_indices = list({x for x,y in np.argwhere(np.isnan(word2vec))})
if len(nan_indices) > 0:
    print('Removing {:d} instances from test set.'.format(len(nan_indices)))
    word2vec = np.delete(word2vec, nan_indices, axis=0)
    word2vec.drop(data.iloc[nan_indices, :].index, axis=0, inplace=True)
    assert word2vec.shape[0] == len(data)

In [19]:
w2v = pd.DataFrame(word2vec)

data.reset_index(drop=True, inplace=True)
w2v.reset_index(drop=True, inplace=True)
#df = pd.concat([df1, df2], axis=1)
w2v = pd.concat([data[['News Category','Word Count','Noun Phrases','Noun Count',
                                         'Adjective Count','Verb Count']],w2v],axis=1)

w2v

Unnamed: 0,News Category,Word Count,Noun Phrases,Noun Count,Adjective Count,Verb Count,0,1,2,3,...,90,91,92,93,94,95,96,97,98,99
0,business,18,4,12,3,2,0.153226,0.207765,0.341556,0.509732,...,0.548532,-0.113774,0.057782,-0.063844,0.389871,-0.095498,-0.000756,-0.519580,-0.109446,-0.590428
1,business,27,5,15,4,3,0.354360,-0.169752,0.193214,0.258507,...,-0.112172,-0.229674,-0.244061,-0.241459,0.255334,-0.564091,-0.093670,-0.030404,-0.096065,-0.333639
2,business,24,5,17,4,2,0.557191,-0.672038,-0.211227,0.216557,...,0.458479,-0.680095,-0.661161,-0.101276,0.374314,0.456764,0.762010,-0.061789,0.723553,-0.662003
3,business,28,3,19,6,3,-0.862505,-0.846272,-0.091839,-0.208935,...,0.306875,-1.146071,-0.945447,-0.358710,0.520460,0.908946,0.307853,-0.044196,0.347091,-0.568198
4,business,28,4,16,7,3,0.197830,-0.712709,-0.188731,0.008694,...,0.394509,-0.347257,-0.364584,-0.188155,0.747880,0.359722,0.396012,0.213882,0.414900,-0.355160
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
127595,world,19,3,9,4,5,-0.261548,-0.682700,-0.191530,-0.016413,...,-0.206953,-0.604427,0.371688,-1.071627,-0.040156,-0.400833,-0.510795,-0.125467,-0.329108,0.591055
127596,sports,41,7,17,9,10,-0.191443,0.313274,-0.067272,0.118658,...,0.428284,-0.462002,-0.543505,0.149100,0.179041,-0.142714,-0.342634,-0.062155,0.161093,-0.413196
127597,sports,20,5,9,3,3,-0.109909,0.097675,0.284736,0.043807,...,0.510660,-0.542429,-0.230180,-0.069108,0.047248,0.005046,-0.374561,0.317467,0.332940,-0.634162
127598,business,21,4,10,4,4,-0.971470,-1.010627,-0.840526,0.260250,...,-0.594328,-0.323036,-0.563703,-0.511324,-0.014788,-0.490354,0.064403,0.324032,0.332317,0.136097


In [20]:
# X dataframe 
X = w2v.drop(['News Category'],axis=1) 
# y series
y = w2v['News Category']

X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.3)

# Classifications Models

# Random Forest Classifier

In [21]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

In [22]:
from sklearn.ensemble import RandomForestClassifier

# Fit a random forest to the training data, using 100 trees
forest = RandomForestClassifier(n_estimators = 100)

start_time = time()

print("Fitting a random forest to labeled training data...")
#forest = forest.fit(trainDataVecs, train_words['News Category'])
forest = forest.fit(X_train, y_train)

rf_tt = round((time() - start_time) / 60, 2)

print('Time to train Random Forest Model: {} mins'.format(rf_tt))

Fitting a random forest to labeled training data...
Time to train Random Forest Model: 1.64 mins


In [23]:
# start_time = time()

# rf_result = forest.predict(X_test)

# rf_pt = round((time() - start_time) / 60, 2)

# print('Time taken for label prediction using Random Forest model: {} mins'.format(rf_pt))

In [24]:
# rf = round(accuracy_score(y_test, rf_result)*100,3)

In [25]:
# print(classification_report(y_test, rf_result))

In [26]:
# from sklearn.metrics import  confusion_matrix
# import seaborn as sns

# sns.set(font_scale=1.2)
# cof=confusion_matrix(y_test, rf_result)
# cof=pd.DataFrame(cof, index=[i for i in range(1,5)], columns=[i for i in range(1,5)])
# plt.figure(figsize=(7,7))

# sns.heatmap(cof, cmap="BuPu",linewidths=1, annot=True,square=True,cbar=False,fmt='d',
#             xticklabels=['World','Sports','Business','Science'],
#             yticklabels=['World','Sports','Business','Science'])
# plt.xlabel("\nPredicted Class");
# plt.ylabel("Actual Class\n");
# # plt.savefig('W2V Confusion Matrix for News Article Classification using Random Forest.png', bbox_inches='tight')
# plt.title("\nConfusion Matrix for News Article Classification using Random Forest\n");

# Naive Bayes Classsifier

In [27]:
# train a Gaussian Naive Bayes classifier on the training set
from sklearn.naive_bayes import GaussianNB


# instantiate the model
gnb = GaussianNB()

print("Fitting a Naive Bayes to labeled training data...")

start_time = time()

# fit the model
gnb.fit(X_train, y_train)

nb_tt = round((time() - start_time) / 60, 2)

print('Time to train Naive Bayes Model: {} mins'.format(nb_tt))

Fitting a Naive Bayes to labeled training data...
Time to train Naive Bayes Model: 0.0 mins


In [28]:
# start_time = time()

# nb_result = gnb.predict(X_test)

# nb_pt = round((time() - start_time) / 60, 2)

# print('Time taken for label prediction using Naive Bayes model: {} mins'.format(nb_pt))

In [29]:
# nb = round(accuracy_score(y_test, nb_result)*100,3)

In [30]:
# print(classification_report(y_test, nb_result))

In [31]:
# sns.set(font_scale=1.2)
# cof=confusion_matrix(y_test, nb_result)
# cof=pd.DataFrame(cof, index=[i for i in range(1,5)], columns=[i for i in range(1,5)])
# plt.figure(figsize=(7,7))

# sns.heatmap(cof, cmap="BuPu",linewidths=1, annot=True,square=True,cbar=False,fmt='d',
#             xticklabels=['World','Sports','Business','Science'],
#             yticklabels=['World','Sports','Business','Science'])
# plt.xlabel("\nPredicted Class");
# plt.ylabel("Actual Class\n");
# # plt.savefig('W2V Confusion Matrix for News Article Classification using Naive Bayes.png', bbox_inches='tight')
# plt.title("\nConfusion Matrix for News Article Classification using Naive Bayes\n");

# Logistic Regression Classifier

In [32]:
from sklearn.linear_model import LogisticRegression

In [33]:
logit = LogisticRegression()

print("Fitting a Logistic Regression model to labeled training data...")

start_time = time()

logit.fit(X_train,y_train)

lr_tt = round((time() - start_time) / 60, 2)

print('Time to train Logistic Regression Model: {} mins'.format(lr_tt))


Fitting a Logistic Regression model to labeled training data...
Time to train Logistic Regression Model: 0.05 mins


In [34]:
# start_time = time()

# lr_result = logit.predict(X_test)

# lr_pt = round((time() - start_time) / 60, 2)

# print('Time taken for label prediction using Logistic Regression model: {} mins'.format(lr_pt))

In [35]:
# lr = round(accuracy_score(y_test, lr_result)*100,3)

In [36]:
# sns.set(font_scale=1.2)
# cof=confusion_matrix(y_test, lr_result)
# cof=pd.DataFrame(cof, index=[i for i in range(1,5)], columns=[i for i in range(1,5)])
# plt.figure(figsize=(7,7))

# sns.heatmap(cof, cmap="BuPu",linewidths=1, annot=True,square=True,cbar=False,fmt='d',
#             xticklabels=['World','Sports','Business','Science'],
#             yticklabels=['World','Sports','Business','Science'])
# plt.xlabel("\nPredicted Class");
# plt.ylabel("Actual Class\n");
# # plt.savefig('W2V Confusion Matrix for News Article Classification using Logistic Regression.png', bbox_inches='tight')
# plt.title("\nConfusion Matrix for News Article Classification using Logistic Regression\n");

# SVM Classifier

In [37]:
from sklearn.svm import SVC

In [38]:
svc = SVC()

print("Fitting a SVM model to labeled training data...")

start_time = time()

svc.fit(X_train,y_train)

svm_tt = round((time() - start_time) / 60, 2)

print('Time to train SVM mode: {} mins'.format(svm_tt))

Fitting a SVM model to labeled training data...
Time to train SVM mode: 5.02 mins


In [39]:
# start_time = time()

# svm_result = svc.predict(X_test)

# svm_pt = round((time() - start_time) / 60, 2)

# print('Time taken for label prediction using SVM model: {} mins'.format(svm_pt))

In [40]:
# svm = round(accuracy_score(y_test, svm_result)*100,3)

In [41]:
# sns.set(font_scale=1.2)
# cof=confusion_matrix(y_test, svm_result)
# cof=pd.DataFrame(cof, index=[i for i in range(1,5)], columns=[i for i in range(1,5)])
# plt.figure(figsize=(7,7))

# sns.heatmap(cof, cmap="BuPu",linewidths=1, annot=True,square=True,cbar=False,fmt='d',
#             xticklabels=['World','Sports','Business','Science'],
#             yticklabels=['World','Sports','Business','Science'])
# plt.xlabel("\nPredicted Class");
# plt.ylabel("Actual Class\n");
# # plt.savefig('W2V Confusion Matrix for News Article Classification using SVM.png', bbox_inches='tight')
# plt.title("\nConfusion Matrix for News Article Classification using SVM\n");

# Comparison of models (Accuracy)

In [42]:
# sns.set(font_scale=1.2)
# fig = plt.figure()
# ax = fig.add_axes([0,0,1,1])
# Models = ['RandomForest', 'GaussianNB', 'Logistic', 'SVM']
# Accuracy=[rf,nb,lr,svm]
# ax.bar(Models,Accuracy,color=["lightgreen","pink", "skyblue", "lightyellow"]);
# for i in ax.patches:
#     ax.text(i.get_x()+.1, i.get_height()-5.5, str(round(i.get_height(),2))+'%', fontsize=15, color='black')
# plt.title('Comparison of Classification Models trained using word2vec + grammatical + syntax features \n');
# plt.ylabel('Accuracy\n');
# plt.xlabel('\nClassification Models');
# # plt.savefig('W2V Comparison of Classification Models trained using word2vec + grammatical + syntax features.png', bbox_inches='tight')
# plt.show();

# Comparison of models (Training Time)

In [43]:
# sns.set(font_scale=1.2)
# fig = plt.figure()
# ax = fig.add_axes([0,0,1,1])
# Models = ['RandomForest', 'GaussianNB', 'Logistic', 'SVM']
# TrainingTime=[rf_tt,nb_tt,lr_tt,svm_tt]
# ax.bar(Models,TrainingTime,color=["lightyellow","lightgreen", "skyblue", "pink"]);
# for i in ax.patches:
#     ax.text(i.get_x()+.2, i.get_height(), str(round(i.get_height(),2)), fontsize=14, color='black')
# plt.title('Comparison of Different Classification Models Training Time \n');
# plt.ylabel('Training Time (in mins)\n');
# plt.xlabel('\nClassification Models');
# # plt.savefig('W2V Comparison of Different Classification Models Training Time.png', bbox_inches='tight')
# plt.show();

# Comparison of models (Label Classification Time)

In [44]:
# sns.set(font_scale=1.2)
# fig = plt.figure()
# ax = fig.add_axes([0,0,1,1])
# Models = ['RandomForest', 'GaussianNB', 'Logistic', 'SVM']
# PredictTime=[rf_pt,nb_pt,lr_pt,svm_pt]
# ax.bar(Models,PredictTime,color=["lightyellow","lightgreen", "skyblue", "pink"]);
# for i in ax.patches:
#     ax.text(i.get_x()+.2, i.get_height(), str(round(i.get_height(),2)), fontsize=14, color='black')
# plt.title('Comparison of Different Classification Models Label Classification Time \n');
# plt.ylabel('Label Classification Time (in mins)\n');
# plt.xlabel('\nClassification Models');
# # plt.savefig('W2V Comparison of Different Classification Models Label Classification Time.png', bbox_inches='tight')
# plt.show();

# Model Training & Evaluation using Cross-Validation approach

Logistic Regression

In [45]:
# lr = LogisticRegression(solver='newton-cg')

# skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=1)
# lr_acc = []

# for train_index, vaild_index in skf.split(X_train,y_train):
#     x_t, x_v = X_train.iloc[train_index], X_train.iloc[vaild_index]
#     y_t, y_v = y_train.iloc[train_index], y_train.iloc[vaild_index]
#     lr.fit(x_t, y_t)
#     lr_acc.append(lr.score(x_v, y_v))

In [46]:
# print(lr_acc)

In [47]:
# print(np.mean(lr_acc))

Random Forest 

In [48]:
# rf = RandomForestClassifier()

# skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=1)
# rf_acc = []

# for train_index, vaild_index in skf.split(X_train,y_train):
#     x_t, x_v = X_train.iloc[train_index], X_train.iloc[vaild_index]
#     y_t, y_v = y_train.iloc[train_index], y_train.iloc[vaild_index]
#     rf.fit(x_t, y_t)
#     rf_acc.append(rf.score(x_v, y_v))

In [49]:
# print(rf_acc)

In [50]:
# print(np.mean(rf_acc))

Naive Bayes

In [51]:
# gnb = GaussianNB()

# skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=1)
# gnb_acc = []

# for train_index, vaild_index in skf.split(X_train,y_train):
#     x_t, x_v = X_train.iloc[train_index], X_train.iloc[vaild_index]
#     y_t, y_v = y_train.iloc[train_index], y_train.iloc[vaild_index]
#     gnb.fit(x_t, y_t)
#     gnb_acc.append(gnb.score(x_v, y_v))

In [52]:
# print(gnb_acc)

In [53]:
# print(np.mean(gnb_acc))

SVM

In [54]:
# svc = SVC()

# skf = StratifiedKFold(n_splits=2, shuffle=True, random_state=1)
# svc_acc = []

# for train_index, vaild_index in skf.split(X_train,y_train):
#     x_t, x_v = X_train.iloc[train_index], X_train.iloc[vaild_index]
#     y_t, y_v = y_train.iloc[train_index], y_train.iloc[vaild_index]
#     svc.fit(x_t, y_t)
#     svc_acc.append(svc.score(x_v, y_v))

In [55]:
# print(svc_acc)

In [56]:
# print(np.mean(svc_acc))

Model Evaluation

In [57]:
# from sklearn.preprocessing import OneHotEncoder

# y_test = y_test.to_numpy()
# y_pred = lr.predict(X_test)

# lr_acc = accuracy_score(y_test,y_pred)
# lr_recall = recall_score(y_test,y_pred,average='macro')
# lr_precision = precision_score(y_test,y_pred,average='macro')
# lr_f1 = f1_score(y_test,y_pred,average='macro')

# y_pred_roc = OneHotEncoder().fit_transform(y_pred.reshape(-1, 1)).toarray()
# y_test_roc = OneHotEncoder().fit_transform(y_test.reshape(-1, 1)).toarray()
# lr_roc = roc_auc_score(y_test_roc,y_pred_roc,multi_class='ovo')

In [58]:
# y_pred = gnb.predict(X_test)

# gnb_acc = accuracy_score(y_test,y_pred)
# gnb_recall = recall_score(y_test_roc,y_pred_roc,average='macro')
# gnb_precision = precision_score(y_test_roc,y_pred_roc,average='macro')
# gnb_f1 = f1_score(y_test_roc,y_pred_roc,average='macro')

# y_pred_roc = OneHotEncoder().fit_transform(y_pred.reshape(-1, 1)).toarray()
# y_test_roc = OneHotEncoder().fit_transform(y_test.reshape(-1, 1)).toarray()
# gnb_roc = roc_auc_score(y_test_roc,y_pred_roc,multi_class='ovo')
# # The multiclass and multilabel cases expect a shape (n_samples, n_classes)

In [59]:
# y_pred = svc.predict(X_test)

# svc_acc = accuracy_score(y_test,y_pred)
# svc_recall = recall_score(y_test,y_pred,average='macro')
# svc_precision = precision_score(y_test,y_pred,average='macro')
# svc_f1 = f1_score(y_test,y_pred,average='macro')

# y_pred_roc = OneHotEncoder().fit_transform(y_pred.reshape(-1, 1)).toarray()
# y_test_roc = OneHotEncoder().fit_transform(y_test.reshape(-1, 1)).toarray()
# svc_roc = roc_auc_score(y_test_roc,y_pred_roc,multi_class='ovo')

In [60]:
# y_pred = rf.predict(X_test)

# rf_acc = accuracy_score(y_test,y_pred)
# rf_recall = recall_score(y_test,y_pred,average='macro')
# rf_precision = precision_score(y_test,y_pred,average='macro')
# rf_f1 = f1_score(y_test,y_pred,average='macro')

# y_pred_roc = OneHotEncoder().fit_transform(y_pred.reshape(-1, 1)).toarray()
# y_test_roc = OneHotEncoder().fit_transform(y_test.reshape(-1, 1)).toarray()
# rf_roc = roc_auc_score(y_test_roc,y_pred_roc,multi_class='ovo')

In [61]:
# print("Logistic Regression: ",lr_acc,lr_recall,lr_precision,lr_f1,lr_roc)
# print("Naive Bayes: ", gnb_acc,gnb_recall,gnb_precision,gnb_f1,gnb_roc)
# print("Support Vector Machine: ", svc_acc,svc_recall,svc_precision,svc_f1,svc_roc)
# print("Random Forest: ", rf_acc,rf_recall,rf_precision,rf_f1,rf_roc)

# Results

In [62]:
# results = pd.DataFrame([[lr_acc,lr_recall,lr_precision,lr_f1,lr_roc],
#                         [gnb_acc,gnb_recall,gnb_precision,gnb_f1,gnb_roc],
#                         [svc_acc,svc_recall,svc_precision,svc_f1,svc_roc],
#                         [rf_acc,rf_recall,rf_precision,rf_f1,rf_roc]],
#                        columns=['accuracy','recall','precision','fl-score','roc_auc'], 
#                        index=["Logistic Regression","Naive Bayes","Support Vector Machine","Random Forest"])

In [63]:
# results

In [64]:
# results.to_csv('word2vec_results.csv')

# Import Testing Data & W2V Feature Extraction

In [166]:
data = pd.read_csv('cleaned_bing.csv')
data.head()

Unnamed: 0,News Category,Documents,Word Count,Noun Phrases,Noun Count,Adjective Count,Verb Count
0,sports,free agent damontae kazee visit detroit lionyear salary cap mismanagement new atlanta falcon general manager terry fontenot choice purge roster let good player leave via free agency saturday former,29,4,15,8,5
1,sports,desean jackson agrees deal ramdesean jackson will always apart philadelphia history he heading home finish career according mike garafalo former eagle secondround pick signing los,25,6,10,6,5
2,sports,rocket coach stephen silas heartbreaking press conference 20th straight losshouston rocket case havent noticed now lost stunning twenty straight game falling home oklahoma city thunder one hundred fourteen thousand one hundred twelve sunday team last win came way back,39,6,19,5,8
3,sports,rice choreographs murray st past tennessee state three thousand five hundred thirteenpreston rice threw pair touchdown ran another murray state used fourth quarter pull away tennessee state three thousand five hundred thirteen win sunday murray state entered game,38,7,20,5,7
4,sports,celtic trade rumor aaron gordon sought multiple team including boston reportceltic got perhaps important extended look one intriguing team ahead thursday’s trade deadline boston downed orlando eleven thousand two hundred ninety six sunday td garden win,37,5,18,6,6


In [167]:
data['News Category'] = data['News Category'].replace("science","science_and_technology")
data['Documents'] = data['Documents'].str.split(" ")
data.head()

Unnamed: 0,News Category,Documents,Word Count,Noun Phrases,Noun Count,Adjective Count,Verb Count
0,sports,"[free, agent, damontae, kazee, visit, detroit, lionyear, salary, cap, mismanagement, new, atlanta, falcon, general, manager, terry, fontenot, choice, purge, roster, let, good, player, leave, via, free, agency, saturday, former]",29,4,15,8,5
1,sports,"[desean, jackson, agrees, deal, ramdesean, jackson, will, always, apart, philadelphia, history, he, heading, home, finish, career, according, mike, garafalo, former, eagle, secondround, pick, signing, los]",25,6,10,6,5
2,sports,"[rocket, coach, stephen, silas, heartbreaking, press, conference, 20th, straight, losshouston, rocket, case, havent, noticed, now, lost, stunning, twenty, straight, game, falling, home, oklahoma, city, thunder, one, hundred, fourteen, thousand, one, hundred, twelve, sunday, team, last, win, came, way, back]",39,6,19,5,8
3,sports,"[rice, choreographs, murray, st, past, tennessee, state, three, thousand, five, hundred, thirteenpreston, rice, threw, pair, touchdown, ran, another, murray, state, used, fourth, quarter, pull, away, tennessee, state, three, thousand, five, hundred, thirteen, win, sunday, murray, state, entered, game]",38,7,20,5,7
4,sports,"[celtic, trade, rumor, aaron, gordon, sought, multiple, team, including, boston, reportceltic, got, perhaps, important, extended, look, one, intriguing, team, ahead, thursday’s, trade, deadline, boston, downed, orlando, eleven, thousand, two, hundred, ninety, six, sunday, td, garden, win]",37,5,18,6,6


In [169]:
cores = multiprocessing.cpu_count()

# initializing word2vec model
model = Word2Vec(min_count=20,
                     window=2, # window size for context 
                     vector_size=100,  # no of features 
                     sample=6e-5, 
                     alpha=0.03, 
                     min_alpha=0.0007, 
                     negative=20,
                     workers=multiprocessing.cpu_count()-1)

INFO - 23:46:06: Word2Vec lifecycle event {'params': 'Word2Vec(vocab=0, vector_size=100, alpha=0.03)', 'datetime': '2021-04-29T23:46:06.702436', 'gensim': '4.0.1', 'python': '3.8.8 (default, Apr 13 2021, 19:58:26) \n[GCC 7.3.0]', 'platform': 'Linux-4.15.0-142-generic-x86_64-with-glibc2.10', 'event': 'created'}


In [170]:
t = time()

# build vocabulary
model.build_vocab(data['Documents'], progress_per=10000)

print('Time to build vocab: {} mins'.format(round((time() - t) / 60, 2)))

INFO - 23:46:08: collecting all words and their counts
INFO - 23:46:08: PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
INFO - 23:46:08: collected 2145 word types from a corpus of 4002 raw words and 144 sentences
INFO - 23:46:08: Creating a fresh vocabulary
INFO - 23:46:08: Word2Vec lifecycle event {'msg': 'effective_min_count=20 retains 7 unique words (0.32634032634032634%% of original 2145, drops 2138)', 'datetime': '2021-04-29T23:46:08.348985', 'gensim': '4.0.1', 'python': '3.8.8 (default, Apr 13 2021, 19:58:26) \n[GCC 7.3.0]', 'platform': 'Linux-4.15.0-142-generic-x86_64-with-glibc2.10', 'event': 'prepare_vocab'}
INFO - 23:46:08: Word2Vec lifecycle event {'msg': 'effective_min_count=20 leaves 250 word corpus (6.246876561719141%% of original 4002, drops 3752)', 'datetime': '2021-04-29T23:46:08.349300', 'gensim': '4.0.1', 'python': '3.8.8 (default, Apr 13 2021, 19:58:26) \n[GCC 7.3.0]', 'platform': 'Linux-4.15.0-142-generic-x86_64-with-glibc2.10', 'event': 'prepare_

Time to build vocab: 0.0 mins


In [171]:
t = time()

#train word2vec model 
model.train(data['Documents'], total_examples=model.corpus_count, epochs=30, report_delay=1)

print('Time to train the model: {} mins'.format(round((time() - t) / 60, 2)))

INFO - 23:46:08: Word2Vec lifecycle event {'msg': 'training model with 7 workers on 7 vocabulary and 100 features, using sg=0 hs=0 sample=6e-05 negative=20 window=2', 'datetime': '2021-04-29T23:46:08.881851', 'gensim': '4.0.1', 'python': '3.8.8 (default, Apr 13 2021, 19:58:26) \n[GCC 7.3.0]', 'platform': 'Linux-4.15.0-142-generic-x86_64-with-glibc2.10', 'event': 'train'}
INFO - 23:46:08: worker thread finished; awaiting finish of 6 more threads
INFO - 23:46:08: worker thread finished; awaiting finish of 5 more threads
INFO - 23:46:08: worker thread finished; awaiting finish of 4 more threads
INFO - 23:46:08: worker thread finished; awaiting finish of 3 more threads
INFO - 23:46:08: worker thread finished; awaiting finish of 2 more threads
INFO - 23:46:08: worker thread finished; awaiting finish of 1 more threads
INFO - 23:46:08: worker thread finished; awaiting finish of 0 more threads
INFO - 23:46:08: EPOCH - 1 : training on 4002 raw words (5 effective words) took 0.0s, 1819 effective

INFO - 23:46:08: worker thread finished; awaiting finish of 3 more threads
INFO - 23:46:08: worker thread finished; awaiting finish of 2 more threads
INFO - 23:46:08: worker thread finished; awaiting finish of 1 more threads
INFO - 23:46:08: worker thread finished; awaiting finish of 0 more threads
INFO - 23:46:08: EPOCH - 13 : training on 4002 raw words (5 effective words) took 0.0s, 1644 effective words/s
INFO - 23:46:08: worker thread finished; awaiting finish of 6 more threads
INFO - 23:46:08: worker thread finished; awaiting finish of 5 more threads
INFO - 23:46:08: worker thread finished; awaiting finish of 4 more threads
INFO - 23:46:08: worker thread finished; awaiting finish of 3 more threads
INFO - 23:46:08: worker thread finished; awaiting finish of 2 more threads
INFO - 23:46:08: worker thread finished; awaiting finish of 1 more threads
INFO - 23:46:08: worker thread finished; awaiting finish of 0 more threads
INFO - 23:46:08: EPOCH - 14 : training on 4002 raw words (3 effe

INFO - 23:46:09: worker thread finished; awaiting finish of 3 more threads
INFO - 23:46:09: worker thread finished; awaiting finish of 2 more threads
INFO - 23:46:09: worker thread finished; awaiting finish of 1 more threads
INFO - 23:46:09: worker thread finished; awaiting finish of 0 more threads
INFO - 23:46:09: EPOCH - 26 : training on 4002 raw words (5 effective words) took 0.0s, 1841 effective words/s
INFO - 23:46:09: worker thread finished; awaiting finish of 6 more threads
INFO - 23:46:09: worker thread finished; awaiting finish of 5 more threads
INFO - 23:46:09: worker thread finished; awaiting finish of 4 more threads
INFO - 23:46:09: worker thread finished; awaiting finish of 3 more threads
INFO - 23:46:09: worker thread finished; awaiting finish of 2 more threads
INFO - 23:46:09: worker thread finished; awaiting finish of 1 more threads
INFO - 23:46:09: worker thread finished; awaiting finish of 0 more threads
INFO - 23:46:09: EPOCH - 27 : training on 4002 raw words (7 effe

Time to train the model: 0.0 mins


In [172]:
def make_feature_vec(words, model, num_features):
    """
    Average the word vectors for a set of words
    """
    feature_vec = np.zeros((num_features,),dtype="float32")  # pre-initialize (for speed)
    nwords = 0.
    index2word_set = set(model.wv.index_to_key)  # words known to the model

    for word in words:
        if word in index2word_set: 
            nwords = nwords + 1.
            feature_vec = np.add(feature_vec,model.wv[word])
    
    feature_vec = np.divide(feature_vec, nwords)
    return feature_vec


def get_avg_feature_vecs(words, model, num_features):
    """
    Calculate average feature vectors for all headlines 
    """
    counter = 0
    feature_vecs = np.zeros((len(words),num_features), dtype='float32')  # pre-initialize (for speed)
    
    for word in words:
        feature_vecs[counter] = make_feature_vec(word, model, num_features)
        counter = counter + 1
    return feature_vecs

In [173]:
word2vec = get_avg_feature_vecs(data['Documents'], model, 100)

In [174]:
w2v = pd.DataFrame(word2vec)

data.reset_index(drop=True, inplace=True)
w2v.reset_index(drop=True, inplace=True)
w2v = pd.concat([data[['News Category','Word Count','Noun Phrases','Noun Count',
                                         'Adjective Count','Verb Count']],w2v],axis=1)

w2v.dropna(subset=w2v.columns,inplace=True)
w2v.head

<bound method NDFrame.head of               News Category  Word Count  Noun Phrases  Noun Count  \
1                    sports          25             6          10   
2                    sports          39             6          19   
3                    sports          38             7          20   
4                    sports          37             5          18   
6                    sports          33             5          14   
..                      ...         ...           ...         ...   
132  science_and_technology          28             4          17   
133  science_and_technology          27             7          11   
137  science_and_technology          23             5          12   
140  science_and_technology          30             5          18   
143  science_and_technology          35             5          18   

     Adjective Count  Verb Count         0         1         2         3  ...  \
1                  6           5 -0.008727  0.002130 -0.0008

In [175]:
# X dataframe 
X_test = w2v.drop(['News Category'],axis=1) 
# y series
y_test = w2v['News Category']

In [176]:
y_test

1                      sports
2                      sports
3                      sports
4                      sports
6                      sports
                ...          
132    science_and_technology
133    science_and_technology
137    science_and_technology
140    science_and_technology
143    science_and_technology
Name: News Category, Length: 87, dtype: object

Model Evaluation

In [177]:
from sklearn.preprocessing import OneHotEncoder

y_test = y_test.to_numpy()
y_pred = logit.predict(X_test)

lr_acc = accuracy_score(y_test,y_pred)
lr_recall = recall_score(y_test,y_pred,average='macro')
lr_precision = precision_score(y_test,y_pred,average='macro')
lr_f1 = f1_score(y_test,y_pred,average='macro')

y_pred_roc = OneHotEncoder().fit(y_test.reshape(-1, 1)).transform(y_pred.reshape(-1,1)).toarray()
y_test_roc = OneHotEncoder().fit_transform(y_test.reshape(-1, 1)).toarray()
lr_roc = roc_auc_score(y_test_roc,y_pred_roc,multi_class='ovo')

In [178]:
y_pred = gnb.predict(X_test)

gnb_acc = accuracy_score(y_test,y_pred)
gnb_recall = recall_score(y_test,y_pred,average='macro')
gnb_precision = precision_score(y_test,y_pred,average='macro')
gnb_f1 = f1_score(y_test,y_pred,average='macro')

y_pred_roc = OneHotEncoder().fit(y_test.reshape(-1, 1)).transform(y_pred.reshape(-1,1)).toarray()
y_test_roc = OneHotEncoder().fit_transform(y_test.reshape(-1, 1)).toarray()
gnb_roc = roc_auc_score(y_test_roc,y_pred_roc,multi_class='ovo')
# The multiclass and multilabel cases expect a shape (n_samples, n_classes)

In [179]:
y_pred = svc.predict(X_test)

svc_acc = accuracy_score(y_test,y_pred)
svc_recall = recall_score(y_test,y_pred,average='macro')
svc_precision = precision_score(y_test,y_pred,average='macro')
svc_f1 = f1_score(y_test,y_pred,average='macro')

y_pred_roc = OneHotEncoder().fit(y_test.reshape(-1, 1)).transform(y_pred.reshape(-1,1)).toarray()
y_test_roc = OneHotEncoder().fit_transform(y_test.reshape(-1, 1)).toarray()
svc_roc = roc_auc_score(y_test_roc,y_pred_roc,multi_class='ovo')

In [180]:
y_pred = forest.predict(X_test)

rf_acc = accuracy_score(y_test,y_pred)
rf_recall = recall_score(y_test,y_pred,average='macro')
rf_precision = precision_score(y_test,y_pred,average='macro')
rf_f1 = f1_score(y_test,y_pred,average='macro')

y_pred_roc = OneHotEncoder().fit(y_test.reshape(-1, 1)).transform(y_pred.reshape(-1,1)).toarray()
y_test_roc = OneHotEncoder().fit_transform(y_test.reshape(-1, 1)).toarray()
rf_roc = roc_auc_score(y_test_roc,y_pred_roc,multi_class='ovo')

In [181]:
print("Logistic Regression: ",lr_acc,lr_recall,lr_precision,lr_f1,lr_roc)
print("Naive Bayes: ", gnb_acc,gnb_recall,gnb_precision,gnb_f1,gnb_roc)
print("Support Vector Machine: ", svc_acc,svc_recall,svc_precision,svc_f1,svc_roc)
print("Random Forest: ", rf_acc,rf_recall,rf_precision,rf_f1,rf_roc)

Logistic Regression:  0.16091954022988506 0.20588235294117646 0.04487179487179487 0.0736842105263158 0.47258403361344536
Naive Bayes:  0.25287356321839083 0.20915032679738563 0.09083333333333334 0.12018255578093306 0.47243230625583565
Support Vector Machine:  0.20689655172413793 0.2647058823529412 0.11313291139240506 0.12333333333333334 0.5091386554621848
Random Forest:  0.19540229885057472 0.25 0.04885057471264368 0.08173076923076923 0.5


# Results

In [182]:
results = pd.DataFrame([[lr_acc,lr_recall,lr_precision,lr_f1,lr_roc],
                        [gnb_acc,gnb_recall,gnb_precision,gnb_f1,gnb_roc],
                        [svc_acc,svc_recall,svc_precision,svc_f1,svc_roc],
                        [rf_acc,rf_recall,rf_precision,rf_f1,rf_roc]],
                       columns=['accuracy','recall','precision','fl-score','roc_auc'], 
                       index=["Logistic Regression","Naive Bayes","Support Vector Machine","Random Forest"])

In [183]:
results

Unnamed: 0,accuracy,recall,precision,fl-score,roc_auc
Logistic Regression,0.16092,0.205882,0.044872,0.073684,0.472584
Naive Bayes,0.252874,0.20915,0.090833,0.120183,0.472432
Support Vector Machine,0.206897,0.264706,0.113133,0.123333,0.509139
Random Forest,0.195402,0.25,0.048851,0.081731,0.5


In [184]:
results.to_csv('word2vec_bing_results.csv')