<a href="https://colab.research.google.com/github/romapavelko01/NLP_SDLC_project/blob/classifications/classification_MB_for_all.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Mount Google Drive at /content/drive (Colab only); the dataset path used
# when loading the data points inside this mount.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.feature_selection import SelectKBest, f_classif

In [None]:
# Location of the news-category dataset on the mounted Drive.
filename = "/content/drive/MyDrive/SDLC/news_analysis_project/data/final_news_category_dataset.json"
# The file is parsed with the 'split' JSON orientation.
df = pd.read_json(filename, orient='split')
# Quick look at the first three rows.
df.head(3)

Unnamed: 0,category,headline,authors,link,short_description,date
0,CRIME,There Were 2 Mass Shootings In Texas Last Week...,Melissa Jeltsen,https://www.huffingtonpost.com/entry/texas-ama...,She left her husband. He killed their children...,2018-05-26
1,ENTERTAINMENT,Will Smith Joins Diplo And Nicky Jam For The 2...,Andy McDonald,https://www.huffingtonpost.com/entry/will-smit...,Of course it has a song.,2018-05-26
2,ENTERTAINMENT,Hugh Grant Marries For The First Time At Age 57,Ron Dicker,https://www.huffingtonpost.com/entry/hugh-gran...,The actor and his longtime girlfriend Anna Ebe...,2018-05-26


## Text preprocessing

In [None]:
import nltk
# Download the NLTK stopwords corpus needed by the import below.
nltk.download('stopwords')
from nltk.corpus import stopwords
# English stopwords kept as a set; cleaning_function filters tokens against it.
stop_words = set(stopwords.words('english'))

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [None]:
import re
import string


def cleaning_function(sentence):
    """
    Turn a raw sentence into a list of cleaned tokens.

    The text is lower-cased, every run of digits is removed, all ASCII
    punctuation characters (string.punctuation) are stripped, the result is
    split on whitespace and tokens found in the global `stop_words`
    collection are dropped.

    Returns the remaining tokens as a list, in their original order.
    """
    lowered = sentence.lower()

    # drop numbers first
    without_digits = re.sub(r'\d+', '', lowered)

    # then delete every punctuation character
    punctuation_table = str.maketrans('', '', string.punctuation)
    stripped = without_digits.translate(punctuation_table)

    kept_tokens = []
    for token in stripped.split():
        if token in stop_words:
            continue
        kept_tokens.append(token)
    return kept_tokens

In [None]:
# Keep only the label and the cleaned description; `df` itself is left untouched.
df_1 = (
    df.assign(
        processed_description=df['short_description'].map(
            lambda text: ' '.join(cleaning_function(text))
        )
    )
    .loc[:, ['category', 'processed_description']]
)
df_1.head()


Unnamed: 0,category,processed_description
0,CRIME,left husband killed children another day america
1,ENTERTAINMENT,course song
2,ENTERTAINMENT,actor longtime girlfriend anna eberstein tied ...
3,ENTERTAINMENT,actor gives dems asskicking fighting hard enou...
4,ENTERTAINMENT,dietland actress said using bags really cathar...


Let's check (roughly) how many unique words there are in the datasets - processed (lowercased) and raw

In [None]:
# Distinct whitespace-separated tokens in the cleaned and in the raw descriptions.
all_words_processed = {
    word
    for text in df_1.processed_description
    for word in text.split()
}
all_words_raw = {
    word
    for text in df.short_description
    for word in text.split()
}
print("Total number of words in the pre-processed dataset: ", len(all_words_processed))
print("Total number of words in the raw dataset: ", len(all_words_raw))

Total number of words in the pre-processed dataset:  95150
Total number of words in the raw dataset:  208227


# Splitting data into train and test sets that keep the same category distribution (stratified split)

In [None]:
# Stratified 80/20 split of the cleaned descriptions, seeded for repeatability.
X_train, X_test, y_train, y_test = train_test_split(
    df_1['processed_description'],
    df_1['category'],
    test_size=0.2,
    random_state=1,
    stratify=df_1['category'],
)



# Model, making classifications based on short description

## Creating a dataframe to store train and test accuracies of each combination of **vectorizer + (unigrams/bigrams/unigrams&bigrams) + classifier (in this case, MultinomialNB)**

In [None]:
# Empty accumulator: write_df adds one row per evaluated configuration.
results_df = pd.DataFrame(
    columns=[
        'Classifier',
        'By',
        'Preprocessed',
        'Vectorizer',
        'Ngram',
        'TopKFeatures',
        'TrainAccuracy',
        'TestAccuracy',
    ]
)

Creating a function to display and record results to the accuracies dataframe for each combination 

In [None]:
def write_df(dataset, preprocessed, by, clf, vect, ngram=(1, 1), topk=6000, display=True):
    """
    Evaluate one vectorizer + feature-selection + classifier combination and
    record its accuracies as a new row of the global `results_df`.

    Parameters
    ----------
    dataset : sequence of four items
        (X_train, X_test, y_train, y_test), e.g. the output of train_test_split.
    preprocessed : any
        Value stored in the 'Preprocessed' column (the notebook passes 1 for
        cleaned text and 0 for raw text).
    by : str
        Which text field the split was built from, one of
        ['headline', 'description', 'full_text']; stored in the 'By' column.
    clf : callable
        Called with no arguments to build the classifier (e.g. MultinomialNB).
    vect : callable
        Called as vect(ngram_range=ngram) to build the vectorizer
        (e.g. CountVectorizer or TfidfVectorizer).
    ngram : tuple, default (1, 1)
        n-gram range forwarded to the vectorizer.
    topk : int, default 6000
        Number of top features to keep; capped at the vocabulary size.
    display : bool, default True
        When True, print the train and test accuracies.

    Returns
    -------
    None. The result is appended to the global `results_df`.
    """
    global results_df

    X_train, X_test, y_train, y_test = dataset

    # The vocabulary is learned on the training texts only.
    vectorizer = vect(ngram_range=ngram)
    x_train_full = vectorizer.fit_transform(X_train)
    x_val_full = vectorizer.transform(X_test)

    # Select top 'k' of the vectorized features (scored with f_classif on train).
    selector = SelectKBest(f_classif, k=min(topk, x_train_full.shape[1]))
    x_train = selector.fit_transform(x_train_full, y_train).astype('float32')
    x_val = selector.transform(x_val_full).astype('float32')

    # Separate name for the instance so the `clf` argument is not rebound.
    model = clf()
    model.fit(x_train, y_train)
    y_pred_test = model.predict(x_val)
    y_pred_train = model.predict(x_train)
    train_acc, test_acc = accuracy_score(y_train, y_pred_train), accuracy_score(y_test, y_pred_test)
    if display:
        print(f"Train classification accuracy = {train_acc},\n Test classification accuracy = {test_acc}")

    new_row = pd.DataFrame({
        'Classifier': [model.__class__.__name__],
        'By': [by],
        'Preprocessed': [preprocessed],
        'Vectorizer': [vectorizer.__class__.__name__],
        'Ngram': [ngram],
        'TopKFeatures': [topk],
        'TrainAccuracy': [train_acc],
        'TestAccuracy': [test_acc]
    })
    # DataFrame.append was removed in pandas 2.0; pd.concat is the replacement.
    # The empty accumulator is replaced outright to avoid concatenating an
    # empty frame, which newer pandas versions warn about.
    if results_df.empty:
        results_df = new_row
    else:
        results_df = pd.concat([results_df, new_row], ignore_index=True)

In [None]:
# Bundle the cleaned-description split created earlier into one tuple for write_df.
processed_split = (X_train, X_test, y_train, y_test)

# Raw descriptions, split with the same settings (80/20, stratified, seed 1).
raw_dataset_sh_ = train_test_split(
    df['short_description'],
    df['category'],
    test_size=0.2,
    random_state=1,
    stratify=df['category'],
)

## Without over/under-sampling

In [None]:
# Feature counts to sweep: 5000, 6000, ..., 14000.
TOP_KS = np.arange(5, 15) * 1000

### Training on preprocessed data

#### Using CountVectorizer

##### Unigrams only

In [None]:
# Cleaned descriptions, CountVectorizer, unigrams: sweep the number of kept features.
for i in TOP_KS:
    print(f"For top {i} features selected:\n")
    write_df(
        dataset=processed_split,
        preprocessed=1,
        by='description',
        clf=MultinomialNB,
        vect=CountVectorizer,
        ngram=(1, 1),
        topk=i,
    )
    print('\n')

For top 5000 features selected:

Train classification accuracy = 0.453025229957307,
 Test classification accuracy = 0.414453212516492


For top 6000 features selected:

Train classification accuracy = 0.4597341332569921,
 Test classification accuracy = 0.41831171740808043


For top 7000 features selected:

Train classification accuracy = 0.4630512440721425,
 Test classification accuracy = 0.4201289487441189


For top 8000 features selected:

Train classification accuracy = 0.46482493372001843,
 Test classification accuracy = 0.4208259689825994


For top 9000 features selected:

Train classification accuracy = 0.467880658692324,
 Test classification accuracy = 0.42197107366010306


For top 10000 features selected:

Train classification accuracy = 0.472081502595188,
 Test classification accuracy = 0.4242363894351647


For top 11000 features selected:

Train classification accuracy = 0.47398588516448636,
 Test classification accuracy = 0.4246346867142964


For top 12000 features selected:

In [None]:
# Peek at the first rows recorded so far.
results_df.head()

Unnamed: 0,Classifier,By,Preprocessed,Vectorizer,Ngram,TopKFeatures,TrainAccuracy,TestAccuracy
0,MultinomialNB,description,1,CountVectorizer,"(1, 1)",5000,0.453025,0.414453
1,MultinomialNB,description,1,CountVectorizer,"(1, 1)",6000,0.459734,0.418312
2,MultinomialNB,description,1,CountVectorizer,"(1, 1)",7000,0.463051,0.420129
3,MultinomialNB,description,1,CountVectorizer,"(1, 1)",8000,0.464825,0.420826
4,MultinomialNB,description,1,CountVectorizer,"(1, 1)",9000,0.467881,0.421971


##### Unigrams and Bigrams

In [None]:
# Cleaned descriptions, CountVectorizer, unigrams + bigrams.
for i in TOP_KS:
    print(f"For top {i} features selected:\n")
    write_df(
        dataset=processed_split,
        preprocessed=1,
        by='description',
        clf=MultinomialNB,
        vect=CountVectorizer,
        ngram=(1, 2),
        topk=i,
    )
    print('\n')

For top 5000 features selected:

Train classification accuracy = 0.43917178028652865,
 Test classification accuracy = 0.4083044982698962


For top 6000 features selected:

Train classification accuracy = 0.4464470195790443,
 Test classification accuracy = 0.41191406736202735


For top 7000 features selected:

Train classification accuracy = 0.4479966642187675,
 Test classification accuracy = 0.41315874635931393


For top 8000 features selected:

Train classification accuracy = 0.44713782502084864,
 Test classification accuracy = 0.4132832142590426


For top 9000 features selected:

Train classification accuracy = 0.4463910083270061,
 Test classification accuracy = 0.41318363993925966


For top 10000 features selected:

Train classification accuracy = 0.4454325935699083,
 Test classification accuracy = 0.41271066192029077


For top 11000 features selected:

Train classification accuracy = 0.4441381113005813,
 Test classification accuracy = 0.4119389609419731


For top 12000 features sel

#### Using TFIDF vectorizer

##### Unigrams only

In [None]:
# Cleaned descriptions, TfidfVectorizer, unigrams.
for i in TOP_KS:
    print(f"For top {i} features selected:\n")
    write_df(
        dataset=processed_split,
        preprocessed=1,
        by='description',
        clf=MultinomialNB,
        vect=TfidfVectorizer,
        ngram=(1, 1),
        topk=i,
    )
    print('\n')

For top 5000 features selected:

Train classification accuracy = 0.35364882189666547,
 Test classification accuracy = 0.34455204002887657


For top 6000 features selected:

Train classification accuracy = 0.3588454213913195,
 Test classification accuracy = 0.349157352318837


For top 7000 features selected:

Train classification accuracy = 0.3564307140812288,
 Test classification accuracy = 0.34719075950312417


For top 8000 features selected:

Train classification accuracy = 0.35699082660161063,
 Test classification accuracy = 0.34763884394214734


For top 9000 features selected:

Train classification accuracy = 0.3556465565526941,
 Test classification accuracy = 0.3460456548256205


For top 10000 features selected:

Train classification accuracy = 0.36009633935350566,
 Test classification accuracy = 0.3482611834407906


For top 11000 features selected:

Train classification accuracy = 0.3604012895034914,
 Test classification accuracy = 0.34791267332155035


For top 12000 features sel

##### Unigrams and Bigrams

In [None]:
# Cleaned descriptions, TfidfVectorizer, unigrams + bigrams.
for i in TOP_KS:
    print(f"For top {i} features selected:\n")
    write_df(
        dataset=processed_split,
        preprocessed=1,
        by='description',
        clf=MultinomialNB,
        vect=TfidfVectorizer,
        ngram=(1, 2),
        topk=i,
    )
    print('\n')

For top 5000 features selected:

Train classification accuracy = 0.2694639100832701,
 Test classification accuracy = 0.2991461502078614


For top 6000 features selected:

Train classification accuracy = 0.27358384884430115,
 Test classification accuracy = 0.3030544422593413


For top 7000 features selected:

Train classification accuracy = 0.270241844139356,
 Test classification accuracy = 0.2990714694680242


For top 8000 features selected:

Train classification accuracy = 0.26737282334051105,
 Test classification accuracy = 0.29538721963605585


For top 9000 features selected:

Train classification accuracy = 0.264945669085523,
 Test classification accuracy = 0.29210126708321926


For top 10000 features selected:

Train classification accuracy = 0.26276745372848237,
 Test classification accuracy = 0.28946254760897167


For top 11000 features selected:

Train classification accuracy = 0.2607572721275563,
 Test classification accuracy = 0.2870976575141271


For top 12000 features selec

### Result df after running training on processed data

In [None]:
# Number of recorded runs, then the five best rows: sorted by train accuracy
# (descending) with test accuracy as the tie-breaker.
print(len(results_df))
results_df.sort_values(by=['TrainAccuracy', 'TestAccuracy'], ascending=[False, False]).head(5)

40


Unnamed: 0,Classifier,By,Preprocessed,Vectorizer,Ngram,TopKFeatures,TrainAccuracy,TestAccuracy
9,MultinomialNB,description,1,CountVectorizer,"(1, 1)",14000,0.482388,0.426178
8,MultinomialNB,description,1,CountVectorizer,"(1, 1)",13000,0.48207,0.426178
7,MultinomialNB,description,1,CountVectorizer,"(1, 1)",12000,0.479077,0.425581
6,MultinomialNB,description,1,CountVectorizer,"(1, 1)",11000,0.473986,0.424635
5,MultinomialNB,description,1,CountVectorizer,"(1, 1)",10000,0.472082,0.424236


### Training on raw data

#### Using CountVectorizer

##### Unigrams only

In [None]:
# raw_dataset_sh_ holds the train-test split of the raw (uncleaned) descriptions.
# CountVectorizer, unigrams.
for i in TOP_KS:
    print(f"For top {i} features selected:\n")
    write_df(
        dataset=raw_dataset_sh_,
        preprocessed=0,
        by='description',
        clf=MultinomialNB,
        vect=CountVectorizer,
        ngram=(1, 1),
        topk=i,
    )
    print('\n')

For top 5000 features selected:

Train classification accuracy = 0.44793442949428064,
 Test classification accuracy = 0.41198874810186453


For top 6000 features selected:

Train classification accuracy = 0.454935835999054,
 Test classification accuracy = 0.4157476786736701


For top 7000 features selected:

Train classification accuracy = 0.45702069926936434,
 Test classification accuracy = 0.4168429961912823


For top 8000 features selected:

Train classification accuracy = 0.46163851582629045,
 Test classification accuracy = 0.41816235592840606


For top 9000 features selected:

Train classification accuracy = 0.4634433228364098,
 Test classification accuracy = 0.4191829927061811


For top 10000 features selected:

Train classification accuracy = 0.4665426121158562,
 Test classification accuracy = 0.4211495855218939


For top 11000 features selected:

Train classification accuracy = 0.4698472759861092,
 Test classification accuracy = 0.42264320031863784


For top 12000 features sele

##### Unigrams and Bigrams

In [None]:
# Raw descriptions, CountVectorizer, unigrams + bigrams.
for i in TOP_KS:
    print(f"For top {i} features selected:\n")
    write_df(
        dataset=raw_dataset_sh_,
        preprocessed=0,
        by='description',
        clf=MultinomialNB,
        vect=CountVectorizer,
        ngram=(1, 2),
        topk=i,
    )
    print('\n')

For top 5000 features selected:

Train classification accuracy = 0.42051380988536363,
 Test classification accuracy = 0.39227303278484477


For top 6000 features selected:

Train classification accuracy = 0.42929512951046167,
 Test classification accuracy = 0.397177068034154


For top 7000 features selected:

Train classification accuracy = 0.43520120486426606,
 Test classification accuracy = 0.40158323168454857


For top 8000 features selected:

Train classification accuracy = 0.4411197271629678,
 Test classification accuracy = 0.4053172686764083


For top 9000 features selected:

Train classification accuracy = 0.44459242478933547,
 Test classification accuracy = 0.4071593935923925


For top 10000 features selected:

Train classification accuracy = 0.4442688042220037,
 Test classification accuracy = 0.4080555624704389


For top 11000 features selected:

Train classification accuracy = 0.44359044572509676,
 Test classification accuracy = 0.4082547111100047


For top 12000 features sel

##### Bigrams only

In [None]:
# Raw descriptions, CountVectorizer, bigrams only.
for i in TOP_KS:
    print(f"For top {i} features selected:\n")
    write_df(
        dataset=raw_dataset_sh_,
        preprocessed=0,
        by='description',
        clf=MultinomialNB,
        vect=CountVectorizer,
        ngram=(2, 2),
        topk=i,
    )
    print('\n')

For top 5000 features selected:

Train classification accuracy = 0.3305846330018297,
 Test classification accuracy = 0.30614124617261207


For top 6000 features selected:

Train classification accuracy = 0.3301240960406268,
 Test classification accuracy = 0.30522018371462


For top 7000 features selected:

Train classification accuracy = 0.3288296137712998,
 Test classification accuracy = 0.3044484827363023


For top 8000 features selected:

Train classification accuracy = 0.32752890802952417,
 Test classification accuracy = 0.3041497597769535


For top 9000 features selected:

Train classification accuracy = 0.32637134215406827,
 Test classification accuracy = 0.3032784844788529


For top 10000 features selected:

Train classification accuracy = 0.325282234475548,
 Test classification accuracy = 0.30238231560080653


For top 11000 features selected:

Train classification accuracy = 0.32711193537546207,
 Test classification accuracy = 0.3029797615195041


For top 12000 features selecte

#### Using TFIDF vectorizer

##### Unigrams only

In [None]:
# Raw descriptions, TfidfVectorizer, unigrams.
for i in TOP_KS:
    print(f"For top {i} features selected:\n")
    write_df(
        dataset=raw_dataset_sh_,
        preprocessed=0,
        by='description',
        clf=MultinomialNB,
        vect=TfidfVectorizer,
        ngram=(1, 1),
        topk=i,
    )
    print('\n')

For top 5000 features selected:

Train classification accuracy = 0.3539662189915485,
 Test classification accuracy = 0.34577182544621743


For top 6000 features selected:

Train classification accuracy = 0.35713396646793044,
 Test classification accuracy = 0.3477882054218217


For top 7000 features selected:

Train classification accuracy = 0.35519224306394,
 Test classification accuracy = 0.3451494859475741


For top 8000 features selected:

Train classification accuracy = 0.35451388456703303,
 Test classification accuracy = 0.34422842348958205


For top 9000 features selected:

Train classification accuracy = 0.35499931541803065,
 Test classification accuracy = 0.3438052326305046


For top 10000 features selected:

Train classification accuracy = 0.3550739970874149,
 Test classification accuracy = 0.3421124691941948


For top 11000 features selected:

Train classification accuracy = 0.355690120859835,
 Test classification accuracy = 0.3422618306738692


For top 12000 features selecte

##### Unigrams and Bigrams

In [None]:
# Raw descriptions, TfidfVectorizer, unigrams + bigrams.
for i in TOP_KS:
    print(f"For top {i} features selected:\n")
    write_df(
        dataset=raw_dataset_sh_,
        preprocessed=0,
        by='description',
        clf=MultinomialNB,
        vect=TfidfVectorizer,
        ngram=(1, 2),
        topk=i,
    )
    print('\n')

For top 5000 features selected:

Train classification accuracy = 0.2768511718798621,
 Test classification accuracy = 0.28667446665504964


For top 6000 features selected:

Train classification accuracy = 0.2814254241296474,
 Test classification accuracy = 0.29020935500734363


For top 7000 features selected:

Train classification accuracy = 0.28402683561319875,
 Test classification accuracy = 0.2930970102810485


For top 8000 features selected:

Train classification accuracy = 0.28744352198752815,
 Test classification accuracy = 0.29531253889621867


For top 9000 features selected:

Train classification accuracy = 0.2841637520070699,
 Test classification accuracy = 0.2922257349829479


For top 10000 features selected:

Train classification accuracy = 0.2810582392551748,
 Test classification accuracy = 0.2887904209504369


For top 11000 features selected:

Train classification accuracy = 0.2784817216614182,
 Test classification accuracy = 0.28630106295586366


For top 12000 features sel

##### Bigrams only

In [None]:
# Raw descriptions, TfidfVectorizer, bigrams only.
for i in TOP_KS:
    print(f"For top {i} features selected:\n")
    write_df(
        dataset=raw_dataset_sh_,
        preprocessed=0,
        by='description',
        clf=MultinomialNB,
        vect=TfidfVectorizer,
        ngram=(2, 2),
        topk=i,
    )
    print('\n')

### *Results_df* after recording accuracies on raw data

In [None]:
# Five best rows so far: train accuracy descending, test accuracy as tie-breaker.
results_df.sort_values(by=['TrainAccuracy', 'TestAccuracy'], ascending=[False, False]).head(5)

## Using imblearn.RandomOverSampler

In [None]:
from imblearn.over_sampling import RandomOverSampler
# Seeded so the resampling is repeatable.
ros = RandomOverSampler(random_state=0)

# Bag-of-words counts over the cleaned descriptions of the whole dataset.
cv_u = CountVectorizer()
X = cv_u.fit_transform(df_1.processed_description)
y = df_1.category

# NOTE(review): the resampler is fitted on the full X/y, i.e. before any
# train/test split of these matrices. Duplicated rows can therefore end up on
# both sides of a later split - confirm this is intended.
X_resampled, y_resampled = ros.fit_resample(X, y)
from collections import Counter
# Per-class sample counts after resampling.
print(sorted(Counter(y_resampled).items()))

[('ARTS', 32739), ('ARTS & CULTURE', 32739), ('BLACK VOICES', 32739), ('BUSINESS', 32739), ('COLLEGE', 32739), ('COMEDY', 32739), ('CRIME', 32739), ('CULTURE & ARTS', 32739), ('DIVORCE', 32739), ('EDUCATION', 32739), ('ENTERTAINMENT', 32739), ('ENVIRONMENT', 32739), ('FIFTY', 32739), ('FOOD & DRINK', 32739), ('GOOD NEWS', 32739), ('GREEN', 32739), ('HEALTHY LIVING', 32739), ('HOME & LIVING', 32739), ('IMPACT', 32739), ('LATINO VOICES', 32739), ('MEDIA', 32739), ('MONEY', 32739), ('PARENTING', 32739), ('PARENTS', 32739), ('POLITICS', 32739), ('QUEER VOICES', 32739), ('RELIGION', 32739), ('SCIENCE', 32739), ('SPORTS', 32739), ('STYLE', 32739), ('STYLE & BEAUTY', 32739), ('TASTE', 32739), ('TECH', 32739), ('THE WORLDPOST', 32739), ('TRAVEL', 32739), ('WEDDINGS', 32739), ('WEIRD NEWS', 32739), ('WELLNESS', 32739), ('WOMEN', 32739), ('WORLD NEWS', 32739), ('WORLDPOST', 32739)]


In [None]:
# NOTE(review): X_resampled was oversampled before this split, so copies of
# the same row can land in both the train and the test part and inflate the
# score reported below.
# random_state pins the split so the accuracy is reproducible between runs.
X_train_over, X_test_over, y_train_over, y_test_over = train_test_split(
    X_resampled, y_resampled, test_size=0.2, random_state=1
)

classifier_u = MultinomialNB()
classifier_u.fit(X_train_over, y_train_over)
y_pred_over = classifier_u.predict(X_test_over)

accuracy_score(y_test_over, y_pred_over)

0.6288124860314386

Checking our over-sampled classifier on the non-over-sampled test set

In [None]:
# y_pred_by_over = classifier_u.predict(X_test)
# accuracy_score(y_test, y_pred_by_over)

# Causes ValueError: X has 1192532 features, but MultinomialNB is expecting 88702 features as input.

## Using imblearn.under_sampling.ClusterCentroids

In [None]:
# from imblearn.under_sampling import ClusterCentroids

In [None]:
# cc = ClusterCentroids(random_state=0)
# X_under, y_under = cc.fit_resample(X, y)
# print(sorted(Counter(y_under).items()))

**IMPORTANT**

The above cell takes too long to execute, so I did not get any results

## Manual undersampling using pandas sample [although seems inefficient]

In [None]:
# (category, count) pairs for every class; the rarest one sets the
# per-class sample size for the manual undersampling.
els = [*Counter(y).items()]
min_tuple = min(els, key=lambda pair: pair[1])
print(min_tuple)

('EDUCATION', 1004)


In [None]:
# Undersample every category down to the size of the rarest one.
# - The parts are collected in a list and concatenated once: DataFrame.append
#   was removed in pandas 2.0, and growing a frame inside a loop is quadratic.
# - The frame is no longer seeded with df.columns: the sampled rows come from
#   df_1, so those extra columns only ever held NaN.
# - random_state makes the drawn sample (and the accuracy computed from it)
#   reproducible.
sampled_parts = [
    df_1[df_1['category'] == label].sample(min_tuple[1], random_state=1)
    for label, _count in els
]
empty_df = pd.concat(sampled_parts)


empty_df.head()

Unnamed: 0,category,headline,authors,link,short_description,date,processed_description
66018,CRIME,,,,,,could easily googled didn’t
64882,CRIME,,,,,,part plea deal samuel mchenry admitted sexual ...
103858,CRIME,,,,,,
4411,CRIME,,,,,,“i grossly underestimated resources would nece...
11932,CRIME,,,,,,want cops come fk that’s want heard saying aud...


In [None]:
# Per-category row counts of the undersampled frame.
Counter(empty_df.category.values).items()

dict_items([('CRIME', 1004), ('ENTERTAINMENT', 1004), ('WORLD NEWS', 1004), ('IMPACT', 1004), ('POLITICS', 1004), ('WEIRD NEWS', 1004), ('BLACK VOICES', 1004), ('WOMEN', 1004), ('COMEDY', 1004), ('QUEER VOICES', 1004), ('SPORTS', 1004), ('BUSINESS', 1004), ('TRAVEL', 1004), ('MEDIA', 1004), ('TECH', 1004), ('RELIGION', 1004), ('SCIENCE', 1004), ('LATINO VOICES', 1004), ('EDUCATION', 1004), ('COLLEGE', 1004), ('PARENTS', 1004), ('ARTS & CULTURE', 1004), ('STYLE', 1004), ('GREEN', 1004), ('TASTE', 1004), ('HEALTHY LIVING', 1004), ('THE WORLDPOST', 1004), ('GOOD NEWS', 1004), ('WORLDPOST', 1004), ('FIFTY', 1004), ('ARTS', 1004), ('WELLNESS', 1004), ('PARENTING', 1004), ('HOME & LIVING', 1004), ('STYLE & BEAUTY', 1004), ('DIVORCE', 1004), ('WEDDINGS', 1004), ('FOOD & DRINK', 1004), ('MONEY', 1004), ('ENVIRONMENT', 1004), ('CULTURE & ARTS', 1004)])

In [None]:
# Bag-of-words counts over the undersampled, cleaned descriptions.
cv_u = CountVectorizer()
X_u = cv_u.fit_transform(empty_df.processed_description)

# random_state pins the split so the accuracy is reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X_u, empty_df['category'].values, test_size=0.2, random_state=1
)

classifier_u = MultinomialNB()
classifier_u.fit(X_train, y_train)
y_pred = classifier_u.predict(X_test)

accuracy_score(y_test, y_pred)

0.25482813069355037

# Model, making classifications based on headline

In [None]:
# Raw headlines: stratified 80/20 split with a fixed seed.
X_train_h, X_test_h, y_train_h, y_test_h = train_test_split(
    df['headline'],
    df['category'],
    test_size=0.2,
    random_state=1,
    stratify=df['category'],
)
raw_headline_split = (X_train_h, X_test_h, y_train_h, y_test_h)

# Cleaned headlines are stored on `df` and split with the same settings.
df['processed_headline'] = df['headline'].map(lambda text: ' '.join(cleaning_function(text)))
processed_headline_split = train_test_split(
    df['processed_headline'],
    df['category'],
    test_size=0.2,
    random_state=1,
    stratify=df['category'],
)

In [None]:
# Single best row so far: train accuracy descending, test accuracy as tie-breaker.
results_df.sort_values(by=['TrainAccuracy', 'TestAccuracy'], ascending=[False, False]).iloc[0]

Classifier         MultinomialNB
By                   description
Preprocessed                   1
Vectorizer       CountVectorizer
Ngram                     (1, 1)
TopKFeatures               14000
TrainAccuracy           0.482388
TestAccuracy            0.426178
Name: 9, dtype: object

### Without preprocessing

#### Unigrams only

In [None]:
# Raw headlines, CountVectorizer, unigrams.
for i in TOP_KS:
    print(f"For top {i} features selected:\n")
    write_df(
        dataset=raw_headline_split,
        preprocessed=0,
        by='headline',
        clf=MultinomialNB,
        vect=CountVectorizer,
        ngram=(1, 1),
        topk=i,
    )
    print('\n')

For top 5000 features selected:

Train classification accuracy = 0.580569074320708,
 Test classification accuracy = 0.537626646087974


For top 6000 features selected:

Train classification accuracy = 0.5885786833621688,
 Test classification accuracy = 0.5408379179009734


For top 7000 features selected:

Train classification accuracy = 0.5952191284649183,
 Test classification accuracy = 0.544024296134027


For top 8000 features selected:

Train classification accuracy = 0.5996191234861403,
 Test classification accuracy = 0.5452440815513679


For top 9000 features selected:

Train classification accuracy = 0.6030669272227132,
 Test classification accuracy = 0.546911951407732


For top 10000 features selected:

Train classification accuracy = 0.6064649431796966,
 Test classification accuracy = 0.5476338652261582


For top 11000 features selected:

Train classification accuracy = 0.6097135957979114,
 Test classification accuracy = 0.5487042891638246


For top 12000 features selected:

Tr

#### Unigrams and Bigrams

In [None]:
# Raw headlines, CountVectorizer, unigrams + bigrams.
for i in TOP_KS:
    print(f"For top {i} features selected:\n")
    write_df(
        dataset=raw_headline_split,
        preprocessed=0,
        by='headline',
        clf=MultinomialNB,
        vect=CountVectorizer,
        ngram=(1, 2),
        topk=i,
    )
    print('\n')

For top 5000 features selected:

Train classification accuracy = 0.5515365753475809,
 Test classification accuracy = 0.518856886808892


For top 6000 features selected:

Train classification accuracy = 0.5596021956410799,
 Test classification accuracy = 0.5237609220582012


For top 7000 features selected:

Train classification accuracy = 0.5666782838152375,
 Test classification accuracy = 0.5277936820094098


For top 8000 features selected:

Train classification accuracy = 0.5740593221393809,
 Test classification accuracy = 0.5310298474023549


For top 9000 features selected:

Train classification accuracy = 0.5795422013666746,
 Test classification accuracy = 0.5342162256354086


For top 10000 features selected:

Train classification accuracy = 0.5833883073399634,
 Test classification accuracy = 0.5370292001692764


For top 11000 features selected:

Train classification accuracy = 0.5825854793940827,
 Test classification accuracy = 0.5362077120310672


For top 12000 features selected:


### With preprocessing

#### Unigrams

In [None]:
# Cleaned headlines, CountVectorizer, unigrams.
# The 'Preprocessed' flag is 1 here: this split is built from the cleaned
# headlines, so recording 0 would mislabel these rows as raw-text runs.
for i in TOP_KS:
    print(f"For top {i} features selected:\n")
    write_df(processed_headline_split, 1, 'headline', MultinomialNB, CountVectorizer, ngram=(1, 1), topk=i)
    print('\n')

For top 5000 features selected:

Train classification accuracy = 0.5719993527588654,
 Test classification accuracy = 0.5297851684050683


For top 6000 features selected:

Train classification accuracy = 0.5801521016666459,
 Test classification accuracy = 0.5343157999551915


For top 7000 features selected:

Train classification accuracy = 0.5903523730100447,
 Test classification accuracy = 0.5393940902641209


For top 8000 features selected:

Train classification accuracy = 0.5930533600527751,
 Test classification accuracy = 0.5411615344402678


For top 9000 features selected:

Train classification accuracy = 0.5982063952402883,
 Test classification accuracy = 0.543003659356252


For top 10000 features selected:

Train classification accuracy = 0.6030731506951619,
 Test classification accuracy = 0.5459411017898484


For top 11000 features selected:

Train classification accuracy = 0.6064711666521452,
 Test classification accuracy = 0.5469866321475692


For top 12000 features selected:


#### Unigrams and Bigrams

In [None]:
# Cleaned headlines, CountVectorizer, unigrams + bigrams.
# The 'Preprocessed' flag is 1 here: this split is built from the cleaned
# headlines, so recording 0 would mislabel these rows as raw-text runs.
for i in TOP_KS:
    print(f"For top {i} features selected:\n")
    write_df(processed_headline_split, 1, 'headline', MultinomialNB, CountVectorizer, ngram=(1, 2), topk=i)
    print('\n')

For top 5000 features selected:

Train classification accuracy = 0.5476282346498053,
 Test classification accuracy = 0.5163177416544273


For top 6000 features selected:

Train classification accuracy = 0.5583699480962397,
 Test classification accuracy = 0.5227402852804262


For top 7000 features selected:

Train classification accuracy = 0.566709401177481,
 Test classification accuracy = 0.5280426178088671


For top 8000 features selected:

Train classification accuracy = 0.5753102401015671,
 Test classification accuracy = 0.5334196310771452


For top 9000 features selected:

Train classification accuracy = 0.5745758703526219,
 Test classification accuracy = 0.5330462273779593


For top 10000 features selected:

Train classification accuracy = 0.5737543719893952,
 Test classification accuracy = 0.533071120957905


For top 11000 features selected:

Train classification accuracy = 0.5744389539587508,
 Test classification accuracy = 0.5339672898359513


For top 12000 features selected:



### Using oversampling for classification based on headline

#### For non-preprocessed data

In [None]:
# Bag-of-words counts over the raw headlines of the whole dataset.
cv_u = CountVectorizer()
X = cv_u.fit_transform(df['headline'])
y = df['category']

# Seeded random oversampling of the vectorized headlines and their labels.
ros = RandomOverSampler(random_state=0)
X_resampled, y_resampled = ros.fit_resample(X, y)

In [None]:
# NOTE(review): X_resampled was oversampled before this split, so copies of
# the same row can land in both the train and the test part and inflate the
# score reported below.
# random_state pins the split so the accuracy is reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X_resampled, y_resampled, test_size=0.2, random_state=1
)

classifier_u = MultinomialNB()
classifier_u.fit(X_train, y_train)
y_pred = classifier_u.predict(X_test)

accuracy_score(y_test, y_pred)

0.7501005736422558

#### For preprocessed data

In [None]:
ros = RandomOverSampler(random_state=0)

cv_u = CountVectorizer()
# The cleaned headlines live in df['processed_headline'] (added when the
# headline splits were built). `new_df`, which this cell used to read, is not
# defined anywhere in the visible notebook and fails on a fresh kernel.
X = cv_u.fit_transform(df.processed_headline)
y = df.category

X_resampled, y_resampled = ros.fit_resample(X, y)

In [None]:
# NOTE(review): X_resampled was oversampled before this split, so copies of
# the same row can land in both the train and the test part and inflate the
# score reported below.
# random_state pins the split so the accuracy is reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X_resampled, y_resampled, test_size=0.2, random_state=1
)

classifier_u = MultinomialNB()
classifier_u.fit(X_train, y_train)
y_pred = classifier_u.predict(X_test)

accuracy_score(y_test, y_pred)

0.7662929300454444

It might be because some news samples present in the train set are also present in the test set (the oversampling duplicates rows before the split), which drives up the accuracy score.

# Model, making classifications based on full_text: headline + short_description

In [None]:
# Join with a space: without a separator the last word of the description
# fuses with the first word of the headline into one bogus token.
df['full_text'] = df['short_description'] + ' ' + df['headline']

In [None]:
# Raw full text: stratified 80/20 split with a fixed seed.
X_train_f, X_test_f, y_train_f, y_test_f = train_test_split(
    df['full_text'],
    df['category'],
    test_size=0.2,
    random_state=1,
    stratify=df['category'],
)
raw_fulltext_split = (X_train_f, X_test_f, y_train_f, y_test_f)

# Cleaned full text is stored on `df` and split with the same settings.
df['processed_fulltext'] = df['full_text'].map(lambda text: ' '.join(cleaning_function(text)))
processed_fulltext_split = train_test_split(
    df['processed_fulltext'],
    df['category'],
    test_size=0.2,
    random_state=1,
    stratify=df['category'],
)

## Without preprocessing

### CountVectorizer

#### Unigrams

In [None]:
# Raw full text, CountVectorizer, unigrams.
for i in TOP_KS:
    print(f"For top {i} features selected:\n")
    write_df(
        dataset=raw_fulltext_split,
        preprocessed=0,
        by='full_text',
        clf=MultinomialNB,
        vect=CountVectorizer,
        ngram=(1, 1),
        topk=i,
    )
    print('\n')

For top 5000 features selected:

Train classification accuracy = 0.6058363724623791,
 Test classification accuracy = 0.5591098055811407


For top 6000 features selected:

Train classification accuracy = 0.6161860071445464,
 Test classification accuracy = 0.5645366060093102


For top 7000 features selected:

Train classification accuracy = 0.6182024122179212,
 Test classification accuracy = 0.5662791566055114


For top 8000 features selected:

Train classification accuracy = 0.6209531870402409,
 Test classification accuracy = 0.569565109158348


For top 9000 features selected:

Train classification accuracy = 0.6240649232645847,
 Test classification accuracy = 0.5709840432152548


For top 10000 features selected:

Train classification accuracy = 0.6297967413898259,
 Test classification accuracy = 0.5739463792287969


For top 11000 features selected:

Train classification accuracy = 0.6302821722408235,
 Test classification accuracy = 0.5736227626895024


For top 12000 features selected:


#### Unigrams and Bigrams

In [None]:
# Raw full text, CountVectorizer, unigrams + bigrams: one run per feature budget.
for top_k in TOP_KS:
    print(f"For top {top_k} features selected:\n")
    write_df(
        raw_fulltext_split, 0, 'full_text',
        MultinomialNB, CountVectorizer,
        ngram=(1, 2), topk=top_k,
    )
    print('\n')

For top 5000 features selected:

Train classification accuracy = 0.5714330167660348,
 Test classification accuracy = 0.5390455801448807


For top 6000 features selected:

Train classification accuracy = 0.5802827945880683,
 Test classification accuracy = 0.5445470613128874


For top 7000 features selected:

Train classification accuracy = 0.5883484148815673,
 Test classification accuracy = 0.5493017350825222


For top 8000 features selected:

Train classification accuracy = 0.5965260576791427,
 Test classification accuracy = 0.5529113041746534


For top 9000 features selected:

Train classification accuracy = 0.6007953597789423,
 Test classification accuracy = 0.5554006621692266


For top 10000 features selected:

Train classification accuracy = 0.6054069528634197,
 Test classification accuracy = 0.5582634238629858


For top 11000 features selected:

Train classification accuracy = 0.6090974720254914,
 Test classification accuracy = 0.5604291653182644


For top 12000 features selected:

#### Bigrams

In [None]:
# Raw full text, CountVectorizer, bigrams only: one run per feature budget.
for top_k in TOP_KS:
    print(f"For top {top_k} features selected:\n")
    write_df(
        raw_fulltext_split, 0, 'full_text',
        MultinomialNB, CountVectorizer,
        ngram=(2, 2), topk=top_k,
    )
    print('\n')

For top 5000 features selected:

Train classification accuracy = 0.43783373371006085,
 Test classification accuracy = 0.4101964103457718


For top 6000 features selected:

Train classification accuracy = 0.45049850014313986,
 Test classification accuracy = 0.41955639640536707


For top 7000 features selected:

Train classification accuracy = 0.4631072553241807,
 Test classification accuracy = 0.42834383012621047


For top 8000 features selected:

Train classification accuracy = 0.471284898121756,
 Test classification accuracy = 0.43347190759503124


For top 9000 features selected:

Train classification accuracy = 0.4803711678968397,
 Test classification accuracy = 0.43932189888227824


For top 10000 features selected:

Train classification accuracy = 0.487833111362816,
 Test classification accuracy = 0.4442010405516417


For top 11000 features selected:

Train classification accuracy = 0.4869804956373458,
 Test classification accuracy = 0.44367827537278137


For top 12000 features sele

### TfidfVectorizer

#### Unigrams

In [None]:
# Raw full text, TfidfVectorizer, unigrams: one run per feature budget.
for top_k in TOP_KS:
    print(f"For top {top_k} features selected:\n")
    write_df(
        raw_fulltext_split, 0, 'full_text',
        MultinomialNB, TfidfVectorizer,
        ngram=(1, 1), topk=top_k,
    )
    print('\n')

For top 5000 features selected:

Train classification accuracy = 0.48479605680785653,
 Test classification accuracy = 0.47449652734559755


For top 6000 features selected:

Train classification accuracy = 0.4881256145679043,
 Test classification accuracy = 0.47648801374125616


For top 7000 features selected:

Train classification accuracy = 0.4809686212519137,
 Test classification accuracy = 0.4685220681586219


For top 8000 features selected:

Train classification accuracy = 0.47678022429394706,
 Test classification accuracy = 0.46521122202583953


For top 9000 features selected:

Train classification accuracy = 0.4733510909747203,
 Test classification accuracy = 0.4617510144133828


For top 10000 features selected:

Train classification accuracy = 0.47484472436240527,
 Test classification accuracy = 0.4620746309526773


For top 11000 features selected:

Train classification accuracy = 0.46927471652082997,
 Test classification accuracy = 0.45652336262477905


For top 12000 features s

#### Unigrams and Bigrams

In [None]:
# Raw full text, TfidfVectorizer, unigrams + bigrams: one run per feature budget.
for top_k in TOP_KS:
    print(f"For top {top_k} features selected:\n")
    write_df(
        raw_fulltext_split, 0, 'full_text',
        MultinomialNB, TfidfVectorizer,
        ngram=(1, 2), topk=top_k,
    )
    print('\n')

For top 5000 features selected:

Train classification accuracy = 0.36424117200433154,
 Test classification accuracy = 0.38368474770356725


For top 6000 features selected:

Train classification accuracy = 0.3678819033868137,
 Test classification accuracy = 0.3870702745761868


For top 7000 features selected:

Train classification accuracy = 0.3721823228488568,
 Test classification accuracy = 0.39003261058972893


For top 8000 features selected:



KeyboardInterrupt: ignored

## With preprocessing

### CountVectorizer

#### Unigrams

In [None]:
# Preprocessed full text, CountVectorizer, unigrams: one run per feature budget.
for top_k in TOP_KS:
    print(f"For top {top_k} features selected:\n")
    write_df(
        processed_fulltext_split, 1, 'full_text',
        MultinomialNB, CountVectorizer,
        ngram=(1, 1), topk=top_k,
    )
    print('\n')

For top 5000 features selected:

Train classification accuracy = 0.5973102152076772,
 Test classification accuracy = 0.5491025864429564


For top 6000 features selected:

Train classification accuracy = 0.6066329769358111,
 Test classification accuracy = 0.5541808767518857


For top 7000 features selected:

Train classification accuracy = 0.608985449521415,
 Test classification accuracy = 0.5552015135296607


For top 8000 features selected:

Train classification accuracy = 0.6085498064500069,
 Test classification accuracy = 0.5562719374673272


For top 9000 features selected:

Train classification accuracy = 0.6145430104180929,
 Test classification accuracy = 0.5596823579198925


For top 10000 features selected:

Train classification accuracy = 0.6152400393323458,
 Test classification accuracy = 0.5605038460581017


For top 11000 features selected:

Train classification accuracy = 0.6162606888139306,
 Test classification accuracy = 0.5613253341963108


For top 12000 features selected:


#### Unigrams and Bigrams

In [None]:
# Preprocessed full text, CountVectorizer, unigrams + bigrams: one run per
# feature budget.
for top_k in TOP_KS:
    print(f"For top {top_k} features selected:\n")
    write_df(
        processed_fulltext_split, 1, 'full_text',
        MultinomialNB, CountVectorizer,
        ngram=(1, 2), topk=top_k,
    )
    print('\n')

For top 5000 features selected:

Train classification accuracy = 0.573499209618999,
 Test classification accuracy = 0.5373777102885166


For top 6000 features selected:

Train classification accuracy = 0.583058463300183,
 Test classification accuracy = 0.5456174852505539


For top 7000 features selected:

Train classification accuracy = 0.5888649630948084,
 Test classification accuracy = 0.5490279057031192


For top 8000 features selected:

Train classification accuracy = 0.5959659451587608,
 Test classification accuracy = 0.5514923701177467


For top 9000 features selected:

Train classification accuracy = 0.6017786684258348,
 Test classification accuracy = 0.5540813024321027


For top 10000 features selected:

Train classification accuracy = 0.6058737132970712,
 Test classification accuracy = 0.5557740658684125


For top 11000 features selected:

Train classification accuracy = 0.6105662115233816,
 Test classification accuracy = 0.5599312937193498


For top 12000 features selected:



### TfidfVectorizer

#### Unigrams

In [None]:
# Preprocessed full text, TfidfVectorizer, unigrams: one run per feature budget.
for top_k in TOP_KS:
    print(f"For top {top_k} features selected:\n")
    write_df(
        processed_fulltext_split, 1, 'full_text',
        MultinomialNB, TfidfVectorizer,
        ngram=(1, 1), topk=top_k,
    )
    print('\n')

For top 5000 features selected:

Train classification accuracy = 0.4663621314148442,
 Test classification accuracy = 0.46065569689577057


For top 6000 features selected:

Train classification accuracy = 0.4693618451351116,
 Test classification accuracy = 0.46232356675213465


For top 7000 features selected:

Train classification accuracy = 0.4630948083792833,
 Test classification accuracy = 0.4570710213835852


For top 8000 features selected:

Train classification accuracy = 0.4563423407724574,
 Test classification accuracy = 0.4507231584974235


For top 9000 features selected:

Train classification accuracy = 0.456728196064276,
 Test classification accuracy = 0.4503497547982375


For top 10000 features selected:

Train classification accuracy = 0.45183654671960766,
 Test classification accuracy = 0.44624231410719173


For top 11000 features selected:



KeyboardInterrupt: ignored

#### Unigrams and Bigrams

In [None]:
# Preprocessed full text, TfidfVectorizer, unigrams + bigrams: one run per
# feature budget.
for top_k in TOP_KS:
    print(f"For top {top_k} features selected:\n")
    write_df(
        processed_fulltext_split, 1, 'full_text',
        MultinomialNB, TfidfVectorizer,
        ngram=(1, 2), topk=top_k,
    )
    print('\n')

For top 5000 features selected:

Train classification accuracy = 0.33899876775245513,
 Test classification accuracy = 0.3865724029772722


For top 6000 features selected:

Train classification accuracy = 0.34685278998269875,
 Test classification accuracy = 0.39209877772522467


For top 7000 features selected:

Train classification accuracy = 0.34844599892956274,
 Test classification accuracy = 0.3928206915436509


For top 8000 features selected:

Train classification accuracy = 0.3519933782253146,
 Test classification accuracy = 0.39441388066017774


For top 9000 features selected:

Train classification accuracy = 0.35569634433228364,
 Test classification accuracy = 0.396803664334968


For top 10000 features selected:

Train classification accuracy = 0.35768785551586363,
 Test classification accuracy = 0.3975006845734485


For top 11000 features selected:

Train classification accuracy = 0.36062533451164414,
 Test classification accuracy = 0.39832217271165765


For top 12000 features s

KeyboardInterrupt: ignored

# Checking the final results and saving them to a .csv table for later use

In [None]:
# Number of rows collected in results_df so far.
print(results_df.shape[0])

217


Showing top 5 rows by test accuracy

In [None]:
# Five best rows overall, ranked by test accuracy (highest first).
(
    results_df
    .sort_values('TestAccuracy', ascending=False)
    .iloc[:5]
)

Unnamed: 0,Classifier,By,Preprocessed,Vectorizer,Ngram,TopKFeatures,TrainAccuracy,TestAccuracy
146,MultinomialNB,full_text,0,CountVectorizer,"(1, 1)",10000,0.629797,0.573946
147,MultinomialNB,full_text,0,CountVectorizer,"(1, 1)",11000,0.630282,0.573623
148,MultinomialNB,full_text,0,CountVectorizer,"(1, 1)",12000,0.630786,0.573075
149,MultinomialNB,full_text,0,CountVectorizer,"(1, 1)",13000,0.632193,0.572826
150,MultinomialNB,full_text,0,CountVectorizer,"(1, 1)",14000,0.633319,0.572229


In [None]:
# Five best rows among the runs that used exactly 9000 selected features.
(
    results_df
    .loc[results_df['TopKFeatures'].eq(9000)]
    .sort_values('TestAccuracy', ascending=False)
    .iloc[:5]
)

Unnamed: 0,Classifier,By,Preprocessed,Vectorizer,Ngram,TopKFeatures,TrainAccuracy,TestAccuracy
145,MultinomialNB,full_text,0,CountVectorizer,"(1, 1)",9000,0.624065,0.570984
188,MultinomialNB,full_text,1,CountVectorizer,"(1, 1)",9000,0.614543,0.559682
155,MultinomialNB,full_text,0,CountVectorizer,"(1, 2)",9000,0.600795,0.555401
198,MultinomialNB,full_text,1,CountVectorizer,"(1, 2)",9000,0.601779,0.554081
105,MultinomialNB,headline,0,CountVectorizer,"(1, 1)",9000,0.603067,0.546912


In [None]:
# Persist the results table to Drive. The row index is written as the first
# (unnamed) CSV column, which is pandas' default behaviour made explicit here.
results_df.to_csv(
    path_or_buf='/content/drive/MyDrive/SDLC/news_analysis_project/resultsMB.csv',
    index=True,
)