<a href="https://colab.research.google.com/github/romapavelko01/NLP_SDLC_project/blob/classifications/OtherNB_for_all.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import pandas as pd
import numpy as np

In [3]:
filename = "/content/drive/MyDrive/SDLC/news_analysis_project/data/final_news_category_dataset.json"
df = pd.read_json(filename, orient='split')
df.head(3)

Unnamed: 0,category,headline,authors,link,short_description,date
0,CRIME,There Were 2 Mass Shootings In Texas Last Week...,Melissa Jeltsen,https://www.huffingtonpost.com/entry/texas-ama...,She left her husband. He killed their children...,2018-05-26
1,ENTERTAINMENT,Will Smith Joins Diplo And Nicky Jam For The 2...,Andy McDonald,https://www.huffingtonpost.com/entry/will-smit...,Of course it has a song.,2018-05-26
2,ENTERTAINMENT,Hugh Grant Marries For The First Time At Age 57,Ron Dicker,https://www.huffingtonpost.com/entry/hugh-gran...,The actor and his longtime girlfriend Anna Ebe...,2018-05-26


## Text preprocessing

In [4]:
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [5]:
import re
import string


def cleaning_function(sentence):
    """
    Function needed to perform data preprocessing: removing punctuation symbols,
    stop_words and other random things in order to obtain clean text
    """
    # the following line removes numbers from text
    result = re.sub(r'\d+', '', sentence.lower())

    # the following line removes any punctuation from the text
    result = result.translate(str.maketrans('','',string.punctuation))
    return [word for word in result.split() if not word in stop_words]


def collect_words(lst):
    """
    Given a list of lists of words,
    return a list of all the words in a list representation of the text.
    """
    return [word for sentence in lst for word in sentence]

In [6]:
df_1 = df.copy()
df_1['processed_description'] = df_1['short_description'].apply(lambda x: ' '.join(cleaning_function(x)))
df_1 = df_1[['category', 'processed_description']]
df_1.head()

Unnamed: 0,category,processed_description
0,CRIME,left husband killed children another day america
1,ENTERTAINMENT,course song
2,ENTERTAINMENT,actor longtime girlfriend anna eberstein tied ...
3,ENTERTAINMENT,actor gives dems asskicking fighting hard enou...
4,ENTERTAINMENT,dietland actress said using bags really cathar...


In [7]:
head_df = df.copy()
head_df['processed_headline'] = head_df['headline'].apply(lambda x: ' '.join(cleaning_function(x)))
head_df = head_df[['category', 'processed_headline']]
head_df.head()

Unnamed: 0,category,processed_headline
0,CRIME,mass shootings texas last week tv
1,ENTERTAINMENT,smith joins diplo nicky jam world cups officia...
2,ENTERTAINMENT,hugh grant marries first time age
3,ENTERTAINMENT,jim carrey blasts castrato adam schiff democra...
4,ENTERTAINMENT,julianna margulies uses donald trump poop bags...


In [8]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import GaussianNB, ComplementNB, BernoulliNB, CategoricalNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

## Train and test datasets for classification based on preprocessed description

In [9]:
cv_u_d = CountVectorizer()
X_u_d = cv_u_d.fit_transform(df_1.processed_description)

X_train_d, X_test_d, y_train_d, y_test_d = train_test_split(X_u_d, df_1['category'].values, test_size=0.2)

In [10]:
# X_train_d.toarray()

## Train and test datasets for classification based on preprocessed headline

In [11]:
cv_u_h = CountVectorizer()
X_u_h = cv_u_h.fit_transform(head_df.processed_headline)

X_train_h, X_test_h, y_train_h, y_test_h = train_test_split(X_u_h, head_df['category'].values, test_size=0.2)

# GaussianNB

It is said to deal with continuous features so it might make sense to use tfidf vectorizer

## Description

In [15]:
gnb_d = GaussianNB()
gnb_d.fit(X_train_d, y_train_d)
y_pred_d = gnb_d.predict(X_test_d)

accuracy_score(y_test_d, y_pred_d)

TypeError: ignored

In [20]:
from sklearn.feature_extraction.text import TfidfVectorizer

tf_d = TfidfVectorizer()
X_tf_d = tf_d.fit_transform(df_1.processed_description)

X_train_tf_d, X_test_tf_d, y_train_tf_d, y_test_tf_d = train_test_split(X_tf_d, df_1['category'].values, test_size=0.2)

gnb_d_tfidf = GaussianNB()
gnb_d_tfidf.fit(X_train_tf_d, y_train_tf_d)
y_pred_gnb_tfidf_d = gnb_d_tfidf.predict(X_test_tf_d)

accuracy_score(y_test_tf_d, y_pred_gnb_tfidf_d)

TypeError: ignored

In [16]:
X_train_d

<160682x88702 sparse matrix of type '<class 'numpy.int64'>'
	with 1683468 stored elements in Compressed Sparse Row format>

*X_train.toarray()* causes RAM crash in colab

## Headline

# ComplementNB (said to be suitable for imbalanced datasets)

## Description

In [13]:
cmpl_d = ComplementNB()
cmpl_d.fit(X_train_d, y_train_d)
y_pred_compl_d = cmpl_d.predict(X_test_d)

accuracy_score(y_test_d, y_pred_compl_d)

0.4192078862861268

## Headline

In [14]:
cmpl_h = ComplementNB()
cmpl_h.fit(X_train_h, y_train_h)
y_pred_compl_h = cmpl_h.predict(X_test_h)

accuracy_score(y_test_h, y_pred_compl_h)

0.558985337681412

# BernoulliNB

## Description

In [18]:
bern_d = BernoulliNB()
bern_d.fit(X_train_d, y_train_d)
y_pred_bern_d = bern_d.predict(X_test_d)

accuracy_score(y_test_d, y_pred_bern_d)

0.3372582211047771

## Headline

In [19]:
bern_h = BernoulliNB()
bern_h.fit(X_train_h, y_train_h)
y_pred_bern_h = bern_d.predict(X_test_h)

accuracy_score(y_test_h, y_pred_bern_h)

ValueError: ignored

# CategoricalNB

## Description

In [21]:
cat_d = CategoricalNB()
cat_d.fit(X_train_d, y_train_d)
y_pred_cat_d = cat_d.predict(X_test_d)

accuracy_score(y_test_d, y_pred_cat_d)

TypeError: ignored

## Headline

# Parameter tuning for ComplementNB

In [23]:
ComplementNB().get_params()

{'alpha': 1.0, 'class_prior': None, 'fit_prior': True, 'norm': False}

In [25]:
from sklearn.model_selection import GridSearchCV

params = {'alpha': [0.01, 0.1, 0.5, 1, 5],
          'fit_prior': [True, False],
          'norm': [True, False]}

## For description

In [26]:
cmpl_tuned_d = GridSearchCV(ComplementNB(), params, n_jobs=-1)
cmpl_tuned_d.fit(X_train_d, y_train_d)

GridSearchCV(estimator=ComplementNB(), n_jobs=-1,
             param_grid={'alpha': [0.01, 0.1, 0.5, 1, 5],
                         'fit_prior': [True, False], 'norm': [True, False]})

In [28]:
cmpl_tuned_d.best_params_

{'alpha': 1, 'fit_prior': True, 'norm': False}

In [29]:
y_pred_cmpl_tuned_d = cmpl_tuned_d.predict(X_test_d)
accuracy_score(y_test_d, y_pred_cmpl_tuned_d)
# X_train_d, X_test_d, y_train_d, y_test_d

0.4192078862861268

## For headline

In [30]:
cmpl_tuned_h = GridSearchCV(ComplementNB(), params, n_jobs=-1)
cmpl_tuned_h.fit(X_train_h, y_train_h)

GridSearchCV(estimator=ComplementNB(), n_jobs=-1,
             param_grid={'alpha': [0.01, 0.1, 0.5, 1, 5],
                         'fit_prior': [True, False], 'norm': [True, False]})

In [31]:
cmpl_tuned_h.best_params_

{'alpha': 1, 'fit_prior': True, 'norm': False}

In [32]:
y_pred_cmpl_tuned_h = cmpl_tuned_h.predict(X_test_h)
accuracy_score(y_test_h, y_pred_cmpl_tuned_h)

0.558985337681412