**Mounting GDrive**

In [16]:
# Mounting the drive
from google.colab import drive
drive.mount("/content/drive", force_remount=True) 

Mounted at /content/drive


**Setting up Project Directories**

In [137]:
# Setting folder path of the project/data files
data_path = 'drive/My Drive/Training resources/Dip in AL ML/project-71/' 

**Importing Necessary Libraries**

In [138]:
# Importing necessary packages
# numpy, pandas for handling data
import numpy as np
import pandas as pd

# For handling data
import scipy

# For Plotting Charts - matplotlib, seaborn, plotly
import matplotlib.pyplot as plt
plt.style.use('seaborn-whitegrid')
import seaborn as sns

# Machine Learning Models
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.multiclass import OneVsRestClassifier
from sklearn.svm import SVC, LinearSVC
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, VotingClassifier, AdaBoostClassifier, BaggingRegressor, GradientBoostingClassifier,BaggingClassifier
from sklearn.naive_bayes import GaussianNB

# Tfidf and other packages
from sklearn import preprocessing, model_selection, feature_extraction, feature_selection, metrics, manifold, naive_bayes, pipeline
from sklearn.feature_extraction.text import TfidfVectorizer,CountVectorizer

from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.multiclass import OneVsRestClassifier
from sklearn.svm import SVC, LinearSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, VotingClassifier, AdaBoostClassifier, BaggingRegressor, GradientBoostingClassifier,BaggingClassifier
from sklearn.naive_bayes import GaussianNB

from sklearn.neighbors import NearestNeighbors, KNeighborsClassifier
from sklearn import model_selection
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.metrics.pairwise import linear_kernel

# Performance metrics
from sklearn.metrics import confusion_matrix, f1_score
from sklearn.model_selection import cross_val_score, KFold
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, make_scorer, roc_curve, roc_auc_score
from sklearn.metrics import precision_recall_fscore_support as score
from sklearn.metrics.pairwise import cosine_similarity
from sklearn import svm
from sklearn.model_selection import ShuffleSplit

#Saving the models into files using joblibs
import joblib


**Reading Analysed data from the file**



In [139]:
# Reading the Data file (stored after data analysis and cleaning) in CSV format and storing the data into the Dataframe
df = pd.read_csv('drive/My Drive/Training resources/Dip in AL ML/project-71/news_datafame.csv')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200840 entries, 0 to 200839
Data columns (total 24 columns):
 #   Column                        Non-Null Count   Dtype  
---  ------                        --------------   -----  
 0   category                      200840 non-null  object 
 1   headline                      200834 non-null  object 
 2   authors                       164233 non-null  object 
 3   link                          200840 non-null  object 
 4   short_description             181128 non-null  object 
 5   date                          200840 non-null  object 
 6   clean_category                200840 non-null  object 
 7   clean_category_id             200840 non-null  int64  
 8   clean_link                    200840 non-null  object 
 9   clean_authors                 200840 non-null  object 
 10  headline_word_count           200840 non-null  int64  
 11  short_description_word_count  200840 non-null  int64  
 12  headline_char_count           200840 non-nul

**Method to retrieve Category Name based on Category Id**

In [140]:
# get_catagory_name method returns the category name based on the Category_id.  This is used to show the classification report 
# based on Category name instead of category id

df_catagory_mapping=df.drop_duplicates(["clean_category_id", "clean_category"])[["clean_category_id", "clean_category"]]

def get_catagory_name(cat_id):
  return df_catagory_mapping.loc[df_catagory_mapping['clean_category_id'] == cat_id, 'clean_category'].iloc[0]

get_catagory_name(5)

'WEIRD NEWS'

**Error Analysis - Creating Features from errors**

Error Analysis helps to improve the performance of the model.  Errors can be converted as features.  We follow the below steps for generating features from error analysis

*   Step - 1: As this is a multi class classifier problem, the data is split into 2 categories.  Category - 1 represents the category with highest count of documents.  In this case, it is POLITICS.  Category - 0 represents other data
*   Step - 2:  Apply Logistic Regression for the entire dataset and find the High proba & low proba indexes for correct & wrong predictions and create features(Label 1 to 4) based on that
*   Step - 3:  Apply Logistic Regression again, find the error features and add these features to the original dataframe.
*   Step - 4:  Apply the model with the new error features and see if there is any improvement in the accuracy.  In this case, there is an increase of 3% in the accuracy




In [141]:
###############################################################################
###########  ERROR ANALYSIS - FEATURES FROM ERRORS  ###########################
###############################################################################
## To get the features from Error.  Adding a new column in the Dataframe - binary_category
#  As this is multi class classification, splitting the data into 2 categories for error analysis
#  1 - The category that has more counts - POLITICS will be considered as 1
#  0 - Others will be considered as 0

df["binary_category"] = 0
df.loc[df['clean_category'] == 'POLITICS', 'binary_category'] = 1

print(df.binary_category.value_counts())

# Parameter selection
ngram_range = (1,3)
min_df = 10
max_df = 1.
max_features = 10000

# Forming feature list using clean_news_text, clean_link, and clean_authors data and applying the regression model
vectorizer_whole_news = TfidfVectorizer(encoding='utf-8',
                        ngram_range=ngram_range,
                        stop_words=None,
                        lowercase=False,
                        max_df=max_df,
                        min_df=min_df,
                        max_features=max_features,
                        norm='l2',
                        sublinear_tf=True)

vectorizer_whole_news.fit_transform(df["clean_news_text"].values.astype('U'))
X_whole_news_vect = vectorizer_whole_news.transform(df["clean_news_text"].values.astype('U'))

vectorizer_whole_link = TfidfVectorizer(encoding='utf-8',
                        ngram_range=ngram_range,
                        stop_words=None,
                        lowercase=False,
                        max_df=max_df,
                        min_df=min_df,
                        max_features=max_features,
                        norm='l2',
                        sublinear_tf=True)
vectorizer_whole_link.fit_transform(df["clean_link"].values.astype('U'))
X_whole_link_vect = vectorizer_whole_link.transform(df["clean_link"].values.astype('U'))

vectorizer_whole_authors = TfidfVectorizer(encoding='utf-8',
                        ngram_range=ngram_range,
                        stop_words=None,
                        lowercase=False,
                        max_df=max_df,
                        min_df=min_df,
                        max_features=max_features,
                        norm='l2',
                        sublinear_tf=True)
vectorizer_whole_authors.fit_transform(df["clean_authors"].values.astype('U'))
X_whole_authors_vect = vectorizer_whole_authors.transform(df["clean_authors"].values.astype('U'))

# Combining TFIDF features of clean_news_text, clean_lin and clean_author fields
X_whole_vect = scipy.sparse.hstack([X_whole_news_vect, X_whole_link_vect, X_whole_authors_vect])

# Fitting predictive model to the data
y = df["binary_category"].values.astype('U')
err_model = LogisticRegression().fit(X_whole_vect, y)
LogisticRegression()

# Predictions
y_pred = err_model.predict(X_whole_vect)
y_pred_proba = err_model.predict_proba(X_whole_vect)

# Predicted Probabilities for Class 0 and 1
y_pred_proba[:10]

# Create Dataframe of only predictions
df_pred = pd.DataFrame().assign(y = df["binary_category"].values.astype('U'), y_pred = y_pred, y_proba = y_pred_proba[:, 1])

df_pred.head()

from sklearn.metrics import accuracy_score

print("accuracy ", accuracy_score(df_pred.y, df_pred.y_pred))

# Obtain required Indexes (Wrong and Correct predictions)
idxs_correct = df_pred[df_pred.y == df_pred.y_pred].index
idxs_wrong = df_pred[df_pred.y != df_pred.y_pred].index

len(idxs_correct), len(idxs_wrong)
df_pred_correct = df_pred.iloc[idxs_correct]

# High proba & low proba indexes for correct & wrong predictions:
idxs_correct_high = df_pred_correct[df_pred_correct.y_proba > 0.5].index
idxs_correct_low = df_pred_correct[df_pred_correct.y_proba <= 0.5].index

df_pred_wrong = df_pred.iloc[idxs_wrong]
idxs_wrong_high = df_pred_wrong[df_pred_wrong.y_proba > 0.5].index
idxs_wrong_low = df_pred_wrong[df_pred_wrong.y_proba <= 0.5].index

[len(i) for i in [idxs_correct_high, idxs_correct_low, idxs_wrong_high, idxs_wrong_low]]

# Creating new labels based on error info:
df_correct_high = df.iloc[idxs_correct_high, :-1].assign(label = [0 for i in range(len(idxs_correct_high))])
df_correct_low = df.iloc[idxs_correct_low, :-1].assign(label = [1 for i in range(len(idxs_correct_low))])
df_wrong_high = df.iloc[idxs_wrong_high, :-1].assign(label = [2 for i in range(len(idxs_wrong_high))])
df_wrong_low = df.iloc[idxs_wrong_low, :-1].assign(label = [3 for i in range(len(idxs_wrong_low))])
df_correct_high.shape, df_correct_low.shape, df_wrong_high.shape, df_wrong_low.shape

df_error_labels = pd.concat([df_correct_high, df_correct_low, df_wrong_high, df_wrong_low])
#print(df_error_labels.sample(10))

print(df_error_labels.label.value_counts())

# Training new model on error labels:
X_error = df_error_labels.iloc[:, :-1].values
y_error = df_error_labels.iloc[:, -1].values

print(X_error)
print(y_error)
error_model = LogisticRegression().fit(X_whole_vect, y_error)
#print(error_model)

# Four new feats obained, which are to be added to the original data:
error_feats = error_model.predict_proba(X_whole_vect)
#error_feats

print("error features shape ", error_feats.shape, df.shape)

columns1 = ['err_feat-' + str(i + 0) for i in range(1, 5)]
df_error_feats = pd.DataFrame(error_feats, columns = columns1)

#df_error_feats
df = pd.concat([df.iloc[:, :-1], df_error_feats], axis = 1).assign(y = y).round(3)
df.info()

X_whole_vect = scipy.sparse.hstack([X_whole_news_vect, X_whole_link_vect, X_whole_authors_vect, np.array(df["err_feat-1"])[:,None], np.array(df["err_feat-2"])[:,None], np.array(df["err_feat-3"])[:,None], np.array(df["err_feat-4"])[:,None]])



0    168102
1     32738
Name: binary_category, dtype: int64


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


accuracy  0.9612178848834894
1    165909
0     27142
3      5596
2      2193
Name: label, dtype: int64
[['POLITICS'
  "Trump's Crackdown On Immigrant Parents Puts More Kids In An Already Strained System"
  'Elise Foley and Roque Planas' ... 5.289473684210527 19.0 0.02]
 ['POLITICS'
  "'Trump's Son Should Be Concerned': FBI Obtained Wiretaps Of Putin Ally Who Met With Trump Jr."
  'Michael Isikoff, Yahoo News' ... 5.935483870967742 10.333333333333334
  0.0]
 ['POLITICS'
  "Edward Snowden: There's No One Trump Loves More Than Vladimir Putin"
  'Mary Papenfuss' ... 5.0 12.0 0.5]
 ...
 ['POLITICS' 'Why Climate Change Deniers Are Winning' nan ...
  4.714285714285714 7.0 0.5]
 ['POLITICS' 'Axelrod Has A Big New Gig' nan ... 2.857142857142857 7.0
  0.0681818181818181]
 ['POLITICS' 'Dear President Obama'
  'Robin Amos Kahn, ContributorWriter, Speaker, Lead Coach at Own the Room'
  ... 4.787878787878788 11.0 0.25]]
[0 0 0 ... 3 3 3]


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


error features shape  (200840, 4) (200840, 25)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200840 entries, 0 to 200839
Data columns (total 29 columns):
 #   Column                        Non-Null Count   Dtype  
---  ------                        --------------   -----  
 0   category                      200840 non-null  object 
 1   headline                      200834 non-null  object 
 2   authors                       164233 non-null  object 
 3   link                          200840 non-null  object 
 4   short_description             181128 non-null  object 
 5   date                          200840 non-null  object 
 6   clean_category                200840 non-null  object 
 7   clean_category_id             200840 non-null  int64  
 8   clean_link                    200840 non-null  object 
 9   clean_authors                 200840 non-null  object 
 10  headline_word_count           200840 non-null  int64  
 11  short_description_word_count  200840 non-null  int64  
 1

**Extracting Train and Test Data**

*    The news data is highly imbalanced and it contains 200,840 documents.  We take 25% of the total data for train and test purpose.  Out of the train and test data, 80% is used for training and the remaining 20% is used for testing.  

*    In machine learning, When we want to train our ML model we split our entire dataset into training set and test set using train_test_split() class present in sklearn.  Then we train our model on training_set and test our model on test_set. This will split the data randomly and the train/test data do not represent the entire data set. This will cause inaccuracy of the models.  To avoid this Stratified sampling is used.  Stratified sample represents the entire dataset in equal proportion.  **StratifiedKFold:** This cross-validation object is a variation of KFold that returns stratified folds. The folds are made by preserving the percentage of samples for each class. KFold: Split dataset into k consecutive folds. StratifiedKFold is used when is need to balance of percentage each class in train & test 

*    However StratifiedKFold provides a way to split the entire dataset.  But we need to apply StratifiedKFold for the 25 of the data.  **To achieve this, we used the Group by function with filters.**  The data is grouped based on category labels and 25% of the data is taken from each category.  

In [142]:
#################################################################################
############### Extracting TRAIN AND TEST DATA ##################################
#################################################################################
#Copying index column as groupby creates multi index
df['index1'] = df.index

#dfa = df.loc[df['clean_category'].isin(['POLITICS', 'ENTERTAINMENT', 'WELLNESS'])]
dfa = df

# Grouping the data by clean_category
grouped = dfa.groupby('clean_category', group_keys = True)

# Taking 25% from each category by using sample function.  The final output will have the around 50K rows
# It includes both Train and test data
# Stratified sampling aims at splitting a data set so that each split is similar with respect to category.
trainandtest = grouped.apply(lambda x: x.sample(frac=0.25, replace=False))
print ("train and test data ")
print(trainandtest.clean_category.value_counts())

# Taking 10% data from Train and test data which is around 10K
test = trainandtest.apply(lambda x: x.sample(frac=0.2, replace=False))

#  Taking 90% from Train and test data which is around 40K
df_train = trainandtest.loc[~trainandtest['index1'].isin(test['index1'])]
print ("train data ")
print(df_train.clean_category.value_counts())

# Taking 10% data from Train and test data which is around 5K
df_test = trainandtest.loc[trainandtest['index1'].isin(test['index1'])]
print ("test data ")
print(df_test.clean_category.value_counts())
df_test.head()


train and test data 
POLITICS          8184
WELLNESS          4456
ENTERTAINMENT     4014
PARENTING         3158
STYLE & BEAUTY    2975
TRAVEL            2472
WORLDPOST         2105
FOOD & DRINK      2080
HEALTHY LIVING    1674
QUEER VOICES      1578
BUSINESS          1484
COMEDY            1294
SPORTS            1221
BLACK VOICES      1132
SCIENCE & TECH    1064
HOME & LIVING     1049
ARTS & CULTURE     970
WEDDINGS           913
WOMEN              872
IMPACT             865
DIVORCE            856
CRIME              851
MEDIA              704
WEIRD NEWS         668
GREEN              656
RELIGION           639
EDUCATION          537
MONEY              427
GOOD NEWS          350
FIFTY              350
ENVIRONMENT        330
LATINO VOICES      282
Name: clean_category, dtype: int64
train data 
POLITICS          6539
WELLNESS          3584
ENTERTAINMENT     3206
PARENTING         2545
STYLE & BEAUTY    2381
TRAVEL            1972
WORLDPOST         1696
FOOD & DRINK      1688
HEALTHY LIVI

Unnamed: 0_level_0,Unnamed: 1_level_0,category,headline,authors,link,short_description,date,clean_category,clean_category_id,clean_link,clean_authors,...,sentence_count,avg_word_length,avg_sentence_lenght,sentiment,err_feat-1,err_feat-2,err_feat-3,err_feat-4,y,index1
clean_category,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
ARTS & CULTURE,141819,CULTURE & ARTS,"Totem, Cirque du Soleil, San Pedro, CA","James Scarborough, Contributor\nWriter",https://www.huffingtonpost.com/entry/totem-cir...,If you could amend the Nobel Prize charter to ...,2013-10-20,ARTS & CULTURE,19,entry totem cirque du soleil sa us 5bb26746e4...,James_Scarborough,...,2,4.68,25.0,0.083,0.01,0.962,0.012,0.016,0,141819
ARTS & CULTURE,107405,ARTS,Stage Door: Wiesenthal,"Fern Siegel, ContributorDeputy Editor, MediaPost",https://www.huffingtonpost.com/entry/stage-doo...,"Dugan's 90-minute play is heartfelt, deeply mo...",2014-11-06,ARTS & CULTURE,19,entry stage door wiesenthal b 6116670.html,Fern_Siegel,...,4,6.442,10.75,0.106,0.001,0.986,0.008,0.005,0,107405
ARTS & CULTURE,61816,ARTS,Adrian Lester: Best Robert in Stephen Sondheim...,"Courtney M. Soliday, Contributor",https://www.huffingtonpost.com/entry/the-best-...,"Phone rings, door chimes, in comes Stephen Son...",2016-04-10,ARTS & CULTURE,19,entry the best bobby in sondhei b 9653102.html,Courtney_M._Soliday,...,2,5.593,13.5,0.75,0.0,0.993,0.001,0.005,0,61816
ARTS & CULTURE,42849,ARTS & CULTURE,New Yorkers Dismayed At Election Results Can S...,Rebecca Shapiro,https://www.huffingtonpost.com/entry/new-yorke...,Hundreds have shared their feelings on Post-it...,2016-11-10,ARTS & CULTURE,19,entry new yorkers dismayed by election result...,Rebecca_Shapiro,...,2,5.579,9.5,0.136,0.199,0.8,0.001,0.001,0,42849
ARTS & CULTURE,36340,ARTS & CULTURE,You’ll Want To Read This Scorching Satire Of '...,Maddie Crum,https://www.huffingtonpost.com/entry/kate-zamb...,The Midwest is a warped fairy tale in our Book...,2017-01-24,ARTS & CULTURE,19,entry kate zambreno o fallen angel us 588675b...,Maddie_Crum,...,2,4.348,11.5,0.0,0.056,0.923,0.009,0.013,0,36340


**Feature Vectors for Models**

The dataset contains headline, description, links, Authors and categories as text features. 

TFIDF is used to get the features for text fields in the dataset.  **Term Frequency-Inverse Document Frequency:** TF-IDF determines how important a word is by weighing its frequency of occurence in the document and computing how often the same word occurs in other documents. If a word occurs many times in a particular document but not in others, then it might be highly relevant to that particular document and is therefore assigned more importance.

We used news_text = headline + short_description, link and author for modeling along with error features which are added based on error analysis (discussed earlier).  **There is more than 10% increase in accuracy by using link, author and error features along with news_text** 

In [143]:
################################################################################ 
#######     FEATURE VECTORS    ################################################
###############################################################################

df_train.info()

# Parameter selection
ngram_range = (1,2)
min_df = 10
max_df = 1.
max_features = 10000

#### Using 3 Vectorizers for building features from news_text, link and authors
vectorizer1 = TfidfVectorizer(encoding='utf-8',
                        ngram_range=ngram_range,
                        stop_words=None,
                        lowercase=False,
                        max_df=max_df,
                        min_df=min_df,
                        max_features=max_features,
                        norm='l2',
                        sublinear_tf=True)

vectorizer2 = TfidfVectorizer(encoding='utf-8',
                        ngram_range=ngram_range,
                        stop_words=None,
                        lowercase=False,
                        max_df=max_df,
                        min_df=min_df,
                        max_features=max_features,
                        norm='l2',
                        sublinear_tf=True)
vectorizer3 = TfidfVectorizer(encoding='utf-8',
                        ngram_range=ngram_range,
                        stop_words=None,
                        lowercase=False,
                        max_df=max_df,
                        min_df=min_df,
                        max_features=max_features,
                        norm='l2',
                        sublinear_tf=True)
vectorizer1.fit_transform(df_train["clean_news_text"].values.astype('U'))
X_train_news_vect = vectorizer1.transform(df_train["clean_news_text"].values.astype('U'))
X_test_news_vect = vectorizer1.transform(df_test["clean_news_text"].values.astype('U'))

vectorizer2.fit_transform(df_train["clean_link"].values.astype('U'))
X_train_link_vect = vectorizer2.transform(df_train["clean_link"].values.astype('U'))
X_test_link_vect = vectorizer2.transform(df_test["clean_link"].values.astype('U'))

vectorizer3.fit_transform(df_train["clean_authors"].values.astype('U'))
X_train_authors_vect = vectorizer3.transform(df_train["clean_authors"].values.astype('U'))
X_test_authors_vect = vectorizer3.transform(df_test["clean_authors"].values.astype('U'))

# Save the vectorizers as a pickle in files
joblib.dump(vectorizer1, (data_path + "/Models" + '/vectorizer_news_text.pkl'))
joblib.dump(vectorizer2, (data_path + "/Models" + '/vectorizer_link.pkl'))
joblib.dump(vectorizer3, (data_path + "/Models" + '/vectorizer_authors.pkl'))


# Build features for train dataset using scipy.sparse.hstack by concatenating TFIDF vectors for news_text
# link, author and error features 
#X_train_vect = scipy.sparse.hstack([X_train_news_vect, X_train_link_vect, X_train_authors_vect])
X_train_vect = scipy.sparse.hstack([X_train_news_vect
                                    , X_train_link_vect, X_train_authors_vect
                                    , 
                np.array(df_train["err_feat-1"])[:,None], np.array(df_train["err_feat-2"])[:,None], 
                np.array(df_train["err_feat-3"])[:,None], np.array(df_train["err_feat-4"])[:,None]
                , np.array(df_train["sentence_count"][:,None])
                ])

#X_test_vect = scipy.sparse.hstack([X_test_news_vect, X_test_link_vect, X_test_authors_vect])
X_test_vect = scipy.sparse.hstack([X_test_news_vect
                                   , X_test_link_vect, X_test_authors_vect
                 , 
                 np.array(df_test["err_feat-1"])[:,None], np.array(df_test["err_feat-2"])[:,None], 
                 np.array(df_test["err_feat-3"])[:,None], np.array(df_test["err_feat-4"])[:,None]
                 , np.array(df_test["sentence_count"][:,None])
                 ])

y_train = df_train["clean_category_id"]
y_test = df_test["clean_category_id"]

df.info()

<class 'pandas.core.frame.DataFrame'>
MultiIndex: 40168 entries, ('ARTS & CULTURE', 27515) to ('WORLDPOST', 102822)
Data columns (total 30 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   category                      40168 non-null  object 
 1   headline                      40166 non-null  object 
 2   authors                       32887 non-null  object 
 3   link                          40168 non-null  object 
 4   short_description             36152 non-null  object 
 5   date                          40168 non-null  object 
 6   clean_category                40168 non-null  object 
 7   clean_category_id             40168 non-null  int64  
 8   clean_link                    40168 non-null  object 
 9   clean_authors                 40168 non-null  object 
 10  headline_word_count           40168 non-null  int64  
 11  short_description_word_count  40168 non-null  int64  
 12  headline_char_count 



<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200840 entries, 0 to 200839
Data columns (total 30 columns):
 #   Column                        Non-Null Count   Dtype  
---  ------                        --------------   -----  
 0   category                      200840 non-null  object 
 1   headline                      200834 non-null  object 
 2   authors                       164233 non-null  object 
 3   link                          200840 non-null  object 
 4   short_description             181128 non-null  object 
 5   date                          200840 non-null  object 
 6   clean_category                200840 non-null  object 
 7   clean_category_id             200840 non-null  int64  
 8   clean_link                    200840 non-null  object 
 9   clean_authors                 200840 non-null  object 
 10  headline_word_count           200840 non-null  int64  
 11  short_description_word_count  200840 non-null  int64  
 12  headline_char_count           200840 non-nul

**Building & Testing ML Models**

After building Feature vectors, we tried with different machine learning classification models in order to find the best modeld that suits the data.  We will try with the following models:

*   Logistic Regression
*   Multinomial Naïve Bayes
*   Linear SVC
*   Random Forest

The methodology used to train each model is as follows:
1.  Step - 1: Decide the hyperparameters that need to be tuned. Execute the models by changing the feature parameters and find the performance
2.  Step - 2: Define the metrics to be used for measuring the performance of the model
  *   Accuracy
      *   Train Accuracy
      *   Test Accuracy
  *   Precision
  *   Recall
  *   F1 Score
  *   Classification Report (precision, recall, f1-score, support)

The dataset contains the following Categories after cleaning
*   POLITICS          
*   WELLNESS          
*   ENTERTAINMENT     
*   PARENTING         
*   STYLE & BEAUTY    
*   TRAVEL            
*   WORLDPOST         
*   FOOD & DRINK      
*   HEALTHY LIVING    
*   QUEER VOICES      
*   BUSINESS          
*   COMEDY            
*   SPORTS             
*   BLACK VOICES       
*   HOME & LIVING      
*   SCIENCE & TECH     
*   ARTS & CULTURE     
*   WOMEN              
*   WEDDINGS           
*   IMPACT             
*   CRIME              
*   DIVORCE            
*   MEDIA              
*   WEIRD NEWS         
*   GREEN              
*   RELIGION           
*   EDUCATION          
*   MONEY              
*   GOOD NEWS          
*   FIFTY              
*   ENVIRONMENT        
*   LATINO VOICES      

As the data is imbalanced, we used Stratified sampling to get the train and test data.

We used 5 algorithms with ensemble models such as **Logistic Regression, Multinominal Naïve Bayes, Linear SVC, Random Forest, and Logistic Regression GridSearchCV** and compared train accuracy, test accuracy scores, precision, recall, and F1 scores.  For this dataset, we found that **Logistic Regression GridSearchCV** showed the best performance compared to the other classifiers.


In [144]:
# Models
#create list of model and accuracy dicts
import time

perform_list = []

def calculate_performance_metrics(y_true, y_prediction):
    FP = np.logical_and(y_true != y_prediction, y_prediction != -1).sum() 
    FN = np.logical_and(y_true != y_prediction, y_prediction == -1).sum()  
    TP = np.logical_and(y_true == y_prediction, y_true != -1).sum()  
    TN = np.logical_and(y_true == y_prediction, y_true == -1).sum()  

    # Sensitivity, hit rate, recall, or true positive rate
    TPR = TP/(TP+FN)
    
    # Specificity or true negative rate
    TNR = TN/(TN+FP) 
    
    # Precision or positive predictive value
    PPV = TP/(TP+FP)
    
    # Negative predictive value
    NPV = TN/(TN+FN)
    
    # Fall out or false positive rate
    FPR = FP/(FP+TN)
    
    # False negative rate
    FNR = FN/(TP+FN)
    
    # False discovery rate
    FDR = FP/(TP+FP)
    
    # Overall accuracy for each class
    ACC = round(((TP+TN)/(TP+FP+FN+TN)), 2)
    print("printing Conf matrix values -->", FP, FN, TP, TN, TPR, TNR, PPV, NPV, FPR, FNR, FDR, ACC)
    return FP, FN, TP, TN, TPR, TNR, PPV, NPV, FPR, FNR, FDR, ACC

def run_model(model_name, est_c, est_pnlty):
    # To measure execution time
    start_time = time.time() 
    
    model=''
    filename = ''
    if model_name == 'Logistic Regression':
        model = LogisticRegression(solver='lbfgs', max_iter=1000)
        filename = 'lr_model.pkl'
    elif model_name == 'Multinomial Naive Bayes':
        model = MultinomialNB()
        filename = 'mnb_model.pkl'
    elif model_name == 'Linear SVC':
        model = LinearSVC()
        filename = 'lsvc_model.pkl'
    elif model_name == 'Random Forest':
        model = RandomForestClassifier(n_estimators=50)
        filename = 'rf_model.pkl'
    elif model_name == 'Logistic Regression GridSearchCV':
        model = LogisticRegression(C=est_c, penalty=est_pnlty, solver='lbfgs', max_iter=2000)      
        filename = 'lr_gsv_model.pkl'
    elif model_name == 'GridSearchCV':
        filename = 'gsv_model.pkl'
        # Create the parameter grid based on the results of random search 
        C = [.0001, .001, .01, .1]
        degree = [3, 4, 5]
        gamma = [1, 10, 100]
        probability = [True]

        param_grid = [
          {'C': C, 'kernel':['linear'], 'probability':probability},
          {'C': C, 'kernel':['poly'], 'degree':degree, 'probability':probability},
          {'C': C, 'kernel':['rbf'], 'gamma':gamma, 'probability':probability}
        ]

        # Create a base model
        svc = svm.SVC(random_state=8)
        cv_sets = ShuffleSplit(n_splits = 3, test_size = .33, random_state = 8)

        # Manually create the splits in CV in order to be able to fix a random_state (GridSearchCV doesn't have that argument)
        # Instantiate the grid search model
        mdl = GridSearchCV(estimator=svc, 
                           param_grid=param_grid,
                           scoring='accuracy',
                           cv=cv_sets,
                           verbose=1)

    oneVsRest = OneVsRestClassifier(model)
    oneVsRest.fit(X_train_vect, y_train)
    y_pred = oneVsRest.predict(X_test_vect)
    y_pred_train = oneVsRest.predict(X_train_vect)

    execution_time = time.time() - start_time
    print("Model - Execution time: --- %s seconds ---" % (execution_time))

    # Save the model as a pickle in a file
    joblib.dump(oneVsRest, (data_path + "/Models/" + filename))
        
    # Performance metrics
    accuracy = round(accuracy_score(y_test, y_pred) * 100, 2)
    train_accuracy = round(accuracy_score(y_train, y_pred_train) * 100, 2)
    # Get precision, recall, f1 scores
    precision, recall, f1score, support = score(y_test, y_pred, average='micro')

    # Get all performance metrics
    FP, FN, TP, TN, TPR, TNR, PPV, NPV, FPR, FNR, FDR, ACC = calculate_performance_metrics(y_test, y_pred)

    print(f'Train Accuracy Score of Basic {model_name}: % {train_accuracy}')
    print(f'Test Accuracy Score of Basic {model_name}: % {accuracy}')
    print(f'Precision : {precision}')
    print(f'Recall    : {recall}')
    print(f'F1-score   : {f1score}')
    print(metrics.classification_report(y_test, y_pred))

    # Add performance parameters to list
    perform_list.append(dict([
        ('Model', model_name),
        ('Train Accuracy', round(train_accuracy, 2)),
        ('Test Accuracy', round(accuracy, 2)),
        ('Precision', round(precision, 2)),
        ('Recall', round(recall, 2)),
        ('F1', round(f1score, 2)),
        ('Execution Time', round(execution_time, 2)),
        ('FP', FP),
        ('FN', FN),
        ('TP', TP),
        ('TN', TN),
        ('TPR', TPR),
        ('TNR', TNR),
        ('FPR', FPR),
        ('FNR', FNR),
        ('ACC', ACC)
         ]))

**Run Logistic Regression Model**

In [145]:
run_model('Logistic Regression', est_c=None, est_pnlty=None)

Model - Execution time: --- 135.61426997184753 seconds ---
printing Conf matrix values --> 2896 0 7146 0 1.0 0.0 0.711611232822147 nan 1.0 0.0 0.288388767177853 0.71
Train Accuracy Score of Basic Logistic Regression: % 86.04
Test Accuracy Score of Basic Logistic Regression: % 71.16
Precision : 0.711611232822147
Recall    : 0.711611232822147
F1-score   : 0.711611232822147
              precision    recall  f1-score   support

           0       0.59      0.59      0.59       162
           1       0.73      0.79      0.76       808
           2       0.69      0.66      0.67       409
           3       0.49      0.23      0.32       179
           4       0.69      0.91      0.79      1645
           5       0.49      0.56      0.52       116
           6       0.68      0.44      0.53       239
           7       0.66      0.38      0.48       182
           8       0.74      0.61      0.67       259
           9       0.91      0.72      0.81       319
          10       0.79      0.



**Run Multinomial Naive Bayes Model**

In [146]:
run_model('Multinomial Naive Bayes', est_c=None, est_pnlty=None)

Model - Execution time: --- 3.1059834957122803 seconds ---
printing Conf matrix values --> 3745 0 6297 0 1.0 0.0 0.6270663214499104 nan 1.0 0.0 0.37293367855008963 0.63
Train Accuracy Score of Basic Multinomial Naive Bayes: % 69.41
Test Accuracy Score of Basic Multinomial Naive Bayes: % 62.71
Precision : 0.6270663214499104
Recall    : 0.6270663214499104
F1-score   : 0.6270663214499104
              precision    recall  f1-score   support

           0       0.63      0.37      0.47       162
           1       0.60      0.82      0.69       808
           2       0.67      0.59      0.63       409
           3       0.87      0.11      0.20       179
           4       0.59      0.95      0.73      1645
           5       0.69      0.09      0.17       116
           6       0.82      0.27      0.40       239
           7       0.93      0.15      0.26       182
           8       0.77      0.53      0.63       259
           9       0.92      0.58      0.71       319
          10     



**Run Linear SVC**

In [147]:
run_model('Linear SVC', est_c=None, est_pnlty=None)



Model - Execution time: --- 37.77360010147095 seconds ---
printing Conf matrix values --> 3008 0 7034 0 1.0 0.0 0.7004580760804621 nan 1.0 0.0 0.29954192391953793 0.7
Train Accuracy Score of Basic Linear SVC: % 99.79
Test Accuracy Score of Basic Linear SVC: % 70.05
Precision : 0.7004580760804621
Recall    : 0.7004580760804621
F1-score   : 0.7004580760804621
              precision    recall  f1-score   support

           0       0.51      0.58      0.54       162
           1       0.75      0.76      0.75       808
           2       0.65      0.66      0.65       409
           3       0.34      0.28      0.31       179
           4       0.76      0.85      0.80      1645
           5       0.44      0.47      0.45       116
           6       0.61      0.48      0.54       239
           7       0.49      0.40      0.44       182
           8       0.66      0.65      0.65       259
           9       0.84      0.75      0.79       319
          10       0.73      0.70      0.72  



**Run Random Forest Model**

In [148]:
run_model('Random Forest', est_c=None, est_pnlty=None)

Model - Execution time: --- 367.63756942749023 seconds ---
printing Conf matrix values --> 3125 0 6917 0 1.0 0.0 0.6888070105556662 nan 1.0 0.0 0.3111929894443338 0.69
Train Accuracy Score of Basic Random Forest: % 100.0
Test Accuracy Score of Basic Random Forest: % 68.88
Precision : 0.6888070105556662
Recall    : 0.6888070105556662
F1-score   : 0.6888070105556662
              precision    recall  f1-score   support

           0       0.56      0.48      0.52       162
           1       0.74      0.73      0.74       808
           2       0.60      0.61      0.61       409
           3       0.44      0.20      0.28       179
           4       0.71      0.89      0.79      1645
           5       0.49      0.35      0.41       116
           6       0.64      0.46      0.53       239
           7       0.58      0.26      0.36       182
           8       0.65      0.55      0.60       259
           9       0.84      0.80      0.82       319
          10       0.72      0.77     



**Run GridSearchCV Model**



In [149]:
#  Optimization is done for the model using GridCV

param = {'estimator__penalty':['l1', 'l2'], 'estimator__C':[0.001, 0.01, 1, 10]}

opt_mdl = LogisticRegression()
oneVsRest = OneVsRestClassifier(opt_mdl)
oneVsRest.get_params().keys()

# GridSearchCV
kf=KFold(n_splits=10, shuffle=True, random_state=55)
lr_grid = GridSearchCV(oneVsRest, param_grid = param, cv = kf, scoring='f1_micro', n_jobs=-1)
lr_grid.fit(X_train_vect, y_train)
lr_grid.best_params_

run_model('Logistic Regression GridSearchCV',lr_grid.best_params_['estimator__C'],lr_grid.best_params_['estimator__penalty'])


40 fits failed out of a total of 80.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
40 fits failed with the following error:
Traceback (most recent call last):
  File "/usr/local/lib/python3.7/dist-packages/sklearn/model_selection/_validation.py", line 680, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/usr/local/lib/python3.7/dist-packages/sklearn/multiclass.py", line 347, in fit
    for i, column in enumerate(columns)
  File "/usr/local/lib/python3.7/dist-packages/joblib/parallel.py", line 1043, in __call__
    if self.dispatch_one_batch(iterator):
  File "/usr/local/lib/python3.7/dist-packages/joblib/parallel.py", line 861, in dispatch_one_batch
    self._dispatch(tasks)
  File "/usr/local/lib/python3.7/

Model - Execution time: --- 286.4218559265137 seconds ---
printing Conf matrix values --> 2914 0 7128 0 1.0 0.0 0.7098187612029476 nan 1.0 0.0 0.2901812387970524 0.71
Train Accuracy Score of Basic Logistic Regression GridSearchCV: % 99.64
Test Accuracy Score of Basic Logistic Regression GridSearchCV: % 70.98
Precision : 0.7098187612029476
Recall    : 0.7098187612029476
F1-score   : 0.7098187612029475
              precision    recall  f1-score   support

           0       0.53      0.57      0.55       162
           1       0.74      0.77      0.76       808
           2       0.66      0.66      0.66       409
           3       0.35      0.27      0.30       179
           4       0.74      0.87      0.80      1645
           5       0.44      0.47      0.45       116
           6       0.66      0.51      0.58       239
           7       0.54      0.42      0.47       182
           8       0.66      0.64      0.65       259
           9       0.85      0.74      0.79       319
 



**Model Performance after Optimization**

For this dataset, we found that **Logistic Regression GridSearchCV** showed the best performance compared to the other classifiers.



In [150]:
model_performance = pd.DataFrame(data=perform_list)
model_performance = model_performance[['Model', "Train Accuracy", 'Test Accuracy', 'Precision', 'Recall', 'F1', 'Execution Time','FP', 'FN', 'TP', 'TN', 'TPR', 'TNR', 'FPR', 'FNR', 'ACC']]
model_performance

Unnamed: 0,Model,Train Accuracy,Test Accuracy,Precision,Recall,F1,Execution Time,FP,FN,TP,TN,TPR,TNR,FPR,FNR,ACC
0,Logistic Regression,86.04,71.16,0.71,0.71,0.71,135.61,2896,0,7146,0,1.0,0.0,1.0,0.0,0.71
1,Multinomial Naive Bayes,69.41,62.71,0.63,0.63,0.63,3.11,3745,0,6297,0,1.0,0.0,1.0,0.0,0.63
2,Linear SVC,99.79,70.05,0.7,0.7,0.7,37.77,3008,0,7034,0,1.0,0.0,1.0,0.0,0.7
3,Random Forest,100.0,68.88,0.69,0.69,0.69,367.64,3125,0,6917,0,1.0,0.0,1.0,0.0,0.69
4,Logistic Regression GridSearchCV,99.64,70.98,0.71,0.71,0.71,286.42,2914,0,7128,0,1.0,0.0,1.0,0.0,0.71
