<a href="https://colab.research.google.com/github/romapavelko01/NLP_SDLC_project/blob/classifications/classification_for_top_5.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
import pandas as pd
import numpy as np
import os
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import MultinomialNB, ComplementNB
from sklearn.linear_model import PassiveAggressiveClassifier, SGDClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

Reading data

In [3]:
os.chdir('/content/drive/MyDrive/SDLC/news_analysis_project')

In [4]:
df = pd.read_json("data/final_news_category_dataset.json", orient="split")
df.head()

Unnamed: 0,category,headline,authors,link,short_description,date
0,CRIME,There Were 2 Mass Shootings In Texas Last Week...,Melissa Jeltsen,https://www.huffingtonpost.com/entry/texas-ama...,She left her husband. He killed their children...,2018-05-26
1,ENTERTAINMENT,Will Smith Joins Diplo And Nicky Jam For The 2...,Andy McDonald,https://www.huffingtonpost.com/entry/will-smit...,Of course it has a song.,2018-05-26
2,ENTERTAINMENT,Hugh Grant Marries For The First Time At Age 57,Ron Dicker,https://www.huffingtonpost.com/entry/hugh-gran...,The actor and his longtime girlfriend Anna Ebe...,2018-05-26
3,ENTERTAINMENT,Jim Carrey Blasts 'Castrato' Adam Schiff And D...,Ron Dicker,https://www.huffingtonpost.com/entry/jim-carre...,The actor gives Dems an ass-kicking for not fi...,2018-05-26
4,ENTERTAINMENT,Julianna Margulies Uses Donald Trump Poop Bags...,Ron Dicker,https://www.huffingtonpost.com/entry/julianna-...,"The ""Dietland"" actress said using the bags is ...",2018-05-26


# Preprocessing 

In [5]:
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [6]:
import re
import string


def cleaning_function(sentence):
    """
    Function needed to perform data preprocessing: removing punctuation symbols,
    stop_words and other random things in order to obtain clean text
    """
    # the following line removes numbers from text
    result = re.sub(r'\d+', '', sentence.lower())

    # the following line removes any punctuation from the text
    result = result.translate(str.maketrans('','',string.punctuation))
    return [word for word in result.split() if not word in stop_words]

In [7]:
df['processed_short_description'] = df['short_description'].apply(lambda x: ' '.join(cleaning_function(x)))
df['processed_headline'] = df['headline'].apply(lambda x: ' '.join(cleaning_function(x)))
df['full_text'] = df['headline'] + df['short_description']
df['processed_full_text'] = df['processed_headline'] + df['processed_short_description']

# Creating a smaller dataset with only top 10 categories included

In [8]:
df.category.value_counts(normalize=True)

POLITICS          0.163000
WELLNESS          0.088756
ENTERTAINMENT     0.079949
TRAVEL            0.049225
STYLE & BEAUTY    0.048040
PARENTING         0.043201
HEALTHY LIVING    0.033328
QUEER VOICES      0.031436
FOOD & DRINK      0.030998
BUSINESS          0.029559
COMEDY            0.025765
SPORTS            0.024316
BLACK VOICES      0.022544
HOME & LIVING     0.020886
PARENTS           0.019691
THE WORLDPOST     0.018242
WEDDINGS          0.018177
WOMEN             0.017376
IMPACT            0.017222
DIVORCE           0.017057
CRIME             0.016953
MEDIA             0.014015
WEIRD NEWS        0.013293
GREEN             0.013054
WORLDPOST         0.012840
RELIGION          0.012726
STYLE             0.011222
SCIENCE           0.010844
WORLD NEWS        0.010839
TASTE             0.010435
TECH              0.010366
MONEY             0.008499
ARTS              0.007513
FIFTY             0.006975
GOOD NEWS         0.006960
ARTS & CULTURE    0.006667
ENVIRONMENT       0.006587
C

In [9]:
small_df = df[df.category.isin(df.category.value_counts().index[:5])]
small_df.head(3)



Unnamed: 0,category,headline,authors,link,short_description,date,processed_short_description,processed_headline,full_text,processed_full_text
1,ENTERTAINMENT,Will Smith Joins Diplo And Nicky Jam For The 2...,Andy McDonald,https://www.huffingtonpost.com/entry/will-smit...,Of course it has a song.,2018-05-26,course song,smith joins diplo nicky jam world cups officia...,Will Smith Joins Diplo And Nicky Jam For The 2...,smith joins diplo nicky jam world cups officia...
2,ENTERTAINMENT,Hugh Grant Marries For The First Time At Age 57,Ron Dicker,https://www.huffingtonpost.com/entry/hugh-gran...,The actor and his longtime girlfriend Anna Ebe...,2018-05-26,actor longtime girlfriend anna eberstein tied ...,hugh grant marries first time age,Hugh Grant Marries For The First Time At Age 5...,hugh grant marries first time ageactor longtim...
3,ENTERTAINMENT,Jim Carrey Blasts 'Castrato' Adam Schiff And D...,Ron Dicker,https://www.huffingtonpost.com/entry/jim-carre...,The actor gives Dems an ass-kicking for not fi...,2018-05-26,actor gives dems asskicking fighting hard enou...,jim carrey blasts castrato adam schiff democra...,Jim Carrey Blasts 'Castrato' Adam Schiff And D...,jim carrey blasts castrato adam schiff democra...


In [10]:
print("The length of the smaller dataframe with only top 5 categories included: ", len(small_df))

The length of the smaller dataframe with only top 5 categories included:  86160


In [11]:
small_df.category.value_counts(normalize=True)

POLITICS          0.379979
WELLNESS          0.206906
ENTERTAINMENT     0.186374
TRAVEL            0.114752
STYLE & BEAUTY    0.111989
Name: category, dtype: float64

# Training models

For each model two types of vectorizers will be provided, TfidfVectorizer and CountVectorizer, with 3 possible ngram options - cosidering single words, Unigrams, single words and word pairs, Unigrams&Bigrams, and word pairs only - Bigrams

## Importing itertools library to generate parameter combinations - tuples of parameters in the form (classifier, vectorizer used, ngram - Unigrams/Bigrams/Combined, what the classification was based on - headline/short_description/combined, whether it was preprocessed, top k most frequent features to consider) for model training

In [12]:
import itertools

## Splitting the dataset into train/test

In [13]:
X_train, X_test, y_train, y_test = train_test_split(small_df.loc[:, small_df.columns != 'category'], small_df['category'], 
                                                    test_size=0.2, stratify=small_df.category,
                                                    random_state=1)

## Helper functions to train a model

In [14]:
def train_model(clf, vect, ngram, by, preprocessed, topk):
    """
    Returns pd.DataFrame - with train and test results
    """
    res_df = pd.DataFrame(columns=['Classifier', 'By', 'Preprocessed', 
                                   'Vectorizer', 'Ngram', 'TopKFeatures', 
                                   'TrainAccuracy', 'TestAccuracy'])
    data_to_use = ('processed_' if preprocessed else '') + by
    X_train_curr, X_test_curr = X_train[data_to_use], X_test[data_to_use]

    vectorizer = vect(ngram_range=ngram)
    x_train_ = vectorizer.fit_transform(X_train_curr)

    # Vectorize validation texts.
    x_val = vectorizer.transform(X_test_curr)
    # Select top 'k' of the vectorized features.
    selector = SelectKBest(f_classif, k=min(topk, x_train_.shape[1]))
    selector.fit(x_train_, y_train)
    x_train = selector.transform(x_train_).astype('float32')
    x_val = selector.transform(x_val).astype('float32')   

    clf.fit(x_train, y_train)
    y_pred_test = clf.predict(x_val)
    y_pred_train = clf.predict(x_train)
    train_acc, test_acc = accuracy_score(y_train, y_pred_train), accuracy_score(y_test, y_pred_test)
    return pd.DataFrame({
        'Classifier': [clf.__class__.__name__],
        'By': [by],
        'Preprocessed': [preprocessed],
        'Vectorizer': [vectorizer.__class__.__name__],
        'Ngram': [ngram],
        'TopKFeatures': [topk],
        'TrainAccuracy': [train_acc],
        'TestAccuracy': [test_acc]
    })


def train_model_combinations(combinations):
    res_model_comb_df = pd.DataFrame(columns=['Classifier', 'By', 'Preprocessed', 
                                   'Vectorizer', 'Ngram', 'TopKFeatures', 
                                   'TrainAccuracy', 'TestAccuracy'])
    for comb in combinations:  # for tuple - (model, vectorizer, ngram, by, preprocessed, topkfeatures)
        curr_dict = dict(zip(("clf", "vect", "ngram", "by", "preprocessed", "topk"), (comb)))
        res_model_comb_df = res_model_comb_df.append(train_model(**curr_dict))
    return res_model_comb_df

## Models

In [15]:
vects = [CountVectorizer, TfidfVectorizer]
ngrams = [(1, 1), (1, 2), (2, 2)]
bys = ['full_text', 'headline', 'short_description']
preprocessed = [0, 1]
topk = list(range(3000, 7501, 1500))

### LinearSVC

In [21]:
clfs_1 = [LinearSVC(max_iter=2500)]
linearSVC_combs = itertools.product(clfs_1, vects, ngrams, bys, preprocessed, topk)
linear_svc_df = train_model_combinations(linearSVC_combs)



In [25]:
linear_svc_df.sort_values(by='TestAccuracy', ascending=False).to_csv('results_for_top_5categories/linearSVC_top5results.csv')


### PassiveAggressiveClassifier

In [17]:
clfs_2 = [PassiveAggressiveClassifier(max_iter=2500, n_jobs=-1)]
PAC_combs = itertools.product(clfs_2, vects, ngrams, bys, preprocessed, topk)
PAC_df = train_model_combinations(PAC_combs)

In [18]:
PAC_df.sort_values(by='TestAccuracy', ascending=False).to_csv('results_for_top_5categories/PAC_top5results.csv')

### SGDClassifier

In [19]:
clfs_3 = [SGDClassifier(max_iter=2500, n_jobs=-1)]
SGD_combs = itertools.product(clfs_3, vects, ngrams, bys, preprocessed, topk)
SGD_df = train_model_combinations(SGD_combs)


In [20]:
SGD_df.sort_values(by='TestAccuracy', ascending=False).to_csv('results_for_top_5categories/SGD_top5results.csv')

### XGBClassifier

In [21]:
clfs_4 = [XGBClassifier(n_jobs=-1)]
XGB_combs = itertools.product(clfs_4, vects, ngrams, bys, preprocessed, topk)
XGB_df = train_model_combinations(XGB_combs)

In [22]:
XGB_df.sort_values(by='TestAccuracy', ascending=False).to_csv('results_for_top_5categories/XGB_top5results.csv')

### RandomForestClassifier

In [23]:
clfs_5 = [RandomForestClassifier(n_jobs=-1)]
RFC_combs = itertools.product(clfs_5, vects, ngrams, bys, preprocessed, topk)
RFC_df = train_model_combinations(RFC_combs)

In [24]:
RFC_df.sort_values(by='TestAccuracy', ascending=False).to_csv('results_for_top_5categories/RFC_top5results.csv')

### MultinomialNB

In [25]:
clfs_6 = [MultinomialNB()]
MNB_combs = itertools.product(clfs_6, vects, ngrams, bys, preprocessed, topk)
MNB_df = train_model_combinations(MNB_combs)

In [26]:
MNB_df.sort_values(by='TestAccuracy', ascending=False).to_csv('results_for_top_5categories/MNB_top5results.csv')

### ComplementNB

In [27]:
clfs_7 = [ComplementNB()]
CNB_combs = itertools.product(clfs_7, vects, ngrams, bys, preprocessed, topk)
CNB_df = train_model_combinations(CNB_combs)

In [28]:
CNB_df.sort_values(by='TestAccuracy', ascending=False).to_csv('results_for_top_5categories/CNB_top5results.csv')