<a href="https://colab.research.google.com/github/romapavelko01/NLP_SDLC_project/blob/classifications/ComplementNB_for_all.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
from google.colab import drive
# Mount Google Drive under /content/drive/ so that the dataset path used
# further down (which lives under /content/drive/MyDrive/) can be read.
drive.mount('/content/drive/')

Mounted at /content/drive/


In [None]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import ComplementNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.feature_selection import SelectKBest, f_classif

In [None]:
# Location of the news-category dataset on the mounted Google Drive.
filename = "/content/drive/MyDrive/SDLC/news_analysis_project/data/final_news_category_dataset.json"
# The file is read with pandas' 'split' JSON orientation.
df = pd.read_json(filename, orient='split')
# Preview the first three rows to check the columns that were loaded.
df.head(3)

Unnamed: 0,category,headline,authors,link,short_description,date
0,CRIME,There Were 2 Mass Shootings In Texas Last Week...,Melissa Jeltsen,https://www.huffingtonpost.com/entry/texas-ama...,She left her husband. He killed their children...,2018-05-26
1,ENTERTAINMENT,Will Smith Joins Diplo And Nicky Jam For The 2...,Andy McDonald,https://www.huffingtonpost.com/entry/will-smit...,Of course it has a song.,2018-05-26
2,ENTERTAINMENT,Hugh Grant Marries For The First Time At Age 57,Ron Dicker,https://www.huffingtonpost.com/entry/hugh-gran...,The actor and his longtime girlfriend Anna Ebe...,2018-05-26


## Text preprocessing

In [None]:
import nltk
# Download the NLTK 'stopwords' corpus so that stopwords.words() below can load it.
nltk.download('stopwords')
from nltk.corpus import stopwords
# English stop words kept as a set; cleaning_function tests token membership against it.
stop_words = set(stopwords.words('english'))

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [None]:
import re
import string


def cleaning_function(sentence):
    """
    Turn a raw sentence into a list of cleaned tokens.

    The text is lower-cased, every run of digits is removed, all ASCII
    punctuation characters (``string.punctuation``) are deleted, and the
    result is split on whitespace. Tokens found in the module-level
    ``stop_words`` collection are discarded.

    Returns the remaining tokens, in their original order, as a list of str.
    """
    lowered = sentence.lower()

    # Drop every run of digits.
    without_digits = re.sub(r'\d+', '', lowered)

    # Delete punctuation characters outright (they are not replaced by spaces).
    punctuation_table = str.maketrans('', '', string.punctuation)
    without_punctuation = without_digits.translate(punctuation_table)

    kept_tokens = []
    for token in without_punctuation.split():
        if token in stop_words:
            continue
        kept_tokens.append(token)
    return kept_tokens

In [None]:
# Cleaned versions of the two text columns: cleaning_function returns a token
# list, which is joined back into one space-separated string per row.
df['processed_description'] = df['short_description'].apply(lambda x: ' '.join(cleaning_function(x)))
df['processed_headline'] = df['headline'].apply(lambda x: ' '.join(cleaning_function(x)))

# Headline and description are joined with an explicit space. Without the
# separator the last word of the headline fuses with the first word of the
# description (e.g. "...Age 57" + "The actor..." -> "57The"), which creates
# bogus tokens in every vectorizer fitted on the full-text columns.
df['full_text'] = df['headline'] + ' ' + df['short_description']
df['processed_full_text'] = df['processed_headline'] + ' ' + df['processed_description']

**ComplementNB is reported to be particularly well suited to imbalanced datasets, which is why an entire notebook is dedicated to this model.**

# Splitting the data into train/test with equal distribution of labels in both sets

In [None]:
# Hold out 20% of the rows as the test set. Stratifying on the label keeps the
# category proportions the same in both parts; the fixed random_state makes the
# split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    df.drop(columns='category'),
    df['category'],
    test_size=0.2,
    stratify=df['category'],
    random_state=1,
)

# Every split below is a (train texts, test texts, train labels, test labels)
# tuple, which is the layout write_df unpacks.

# Cleaned text columns.
processed_split_description = (
    X_train['processed_description'], X_test['processed_description'], y_train, y_test,
)
processed_split_headline = (
    X_train['processed_headline'], X_test['processed_headline'], y_train, y_test,
)
processed_split_full_text = (
    X_train['processed_full_text'], X_test['processed_full_text'], y_train, y_test,
)

# Original, uncleaned text columns.
raw_split_description = (
    X_train['short_description'], X_test['short_description'], y_train, y_test,
)
raw_split_headline = (
    X_train['headline'], X_test['headline'], y_train, y_test,
)
raw_split_full_text = (
    X_train['full_text'], X_test['full_text'], y_train, y_test,
)

# Creating a results table that stores the accuracy of every vectorizer-parameter combination for ComplementNB, and defining a function that records results in it

In [None]:
# One row per experiment; write_df adds rows to this table.
results_df = pd.DataFrame(columns=['Classifier', 'By', 'Preprocessed', 'Vectorizer', 'Ngram', 'TopKFeatures', 'TrainAccuracy', 'TestAccuracy'])

def write_df(dataset, preprocessed, by, clf, vect, ngram=(1, 1), topk=6000, display=True):
    """
    Run one vectorize -> select features -> classify experiment and record it.

    Parameters
    ----------
    dataset : tuple
        (train texts, test texts, train labels, test labels).
    preprocessed : any
        Flag stored in the 'Preprocessed' column (the notebook passes 1 for
        cleaned text and 0 for raw text); it does not affect the computation.
    by : str
        Label stored in the 'By' column, one of
        'headline', 'description', 'full_text'.
    clf : class
        Classifier class; it is instantiated with no arguments.
    vect : class
        Vectorizer class; it is instantiated with ``ngram_range=ngram``.
    ngram : tuple, default (1, 1)
        N-gram range passed to the vectorizer.
    topk : int, default 6000
        Number of best features (by ``f_classif``) to keep; capped at the
        number of features the vectorizer produced.
    display : bool, default True
        When True, print the train and test accuracies.

    Side effects
    ------------
    Rebinds the module-level ``results_df`` to a frame with one extra row
    describing this run. Returns None.
    """
    global results_df

    X_train, X_test, y_train, y_test = dataset

    # The vectorizer is fitted on the training texts only and reused for the test texts.
    vectorizer = vect(ngram_range=ngram)
    x_train_all = vectorizer.fit_transform(X_train)
    x_test_all = vectorizer.transform(X_test)

    # Keep the top 'k' features; k cannot exceed the vocabulary size.
    selector = SelectKBest(f_classif, k=min(topk, x_train_all.shape[1]))
    selector.fit(x_train_all, y_train)
    x_train = selector.transform(x_train_all).astype('float32')
    x_test = selector.transform(x_test_all).astype('float32')

    # Separate name for the instance so the `clf` class argument is not shadowed.
    model = clf()
    model.fit(x_train, y_train)
    train_acc = accuracy_score(y_train, model.predict(x_train))
    test_acc = accuracy_score(y_test, model.predict(x_test))
    if display:
        print(f"Train classification accuracy = {train_acc},\n Test classification accuracy = {test_acc}")

    new_row = pd.DataFrame({
        'Classifier': [model.__class__.__name__],
        'By': [by],
        'Preprocessed': [preprocessed],
        'Vectorizer': [vectorizer.__class__.__name__],
        'Ngram': [ngram],
        'TopKFeatures': [topk],
        'TrainAccuracy': [train_acc],
        'TestAccuracy': [test_acc]
    })
    # DataFrame.append was removed in pandas 2.0, so rows are added with
    # pd.concat. The first row replaces the empty table directly, which avoids
    # concatenating with an empty frame.
    if results_df.empty:
        results_df = new_row
    else:
        results_df = pd.concat([results_df, new_row], ignore_index=True)

In [None]:
# Feature counts tried in every experiment: 5000, 6000, ..., 14000
# (the stop value is exclusive).
TOP_KS = np.arange(start=5000, stop=15000, step=1000)

# Classifying on short description

## With preprocessing

### CountVectorizer

#### Unigrams

In [None]:
# Preprocessed short descriptions, CountVectorizer, unigrams.
for top_k in TOP_KS:
    print(f"For top {top_k} features selected:\n")
    write_df(
        dataset=processed_split_description,
        preprocessed=1,
        by='description',
        clf=ComplementNB,
        vect=CountVectorizer,
        topk=top_k,
    )
    print('\n')

For top 5000 features selected:

Train classification accuracy = 0.4114524340000747,
 Test classification accuracy = 0.3924970750043564


For top 6000 features selected:

Train classification accuracy = 0.4178439402048767,
 Test classification accuracy = 0.39645515421572775


For top 7000 features selected:

Train classification accuracy = 0.4230654335893255,
 Test classification accuracy = 0.39894451221030097


For top 8000 features selected:

Train classification accuracy = 0.4278450604299175,
 Test classification accuracy = 0.3998157875084016


For top 9000 features selected:

Train classification accuracy = 0.43263713421540684,
 Test classification accuracy = 0.4018072739040601


For top 10000 features selected:

Train classification accuracy = 0.43744165494579357,
 Test classification accuracy = 0.40454556769809064


For top 11000 features selected:

Train classification accuracy = 0.4415864875966194,
 Test classification accuracy = 0.4056657787956486


For top 12000 features sele

#### Unigrams and Bigrams

In [None]:
# Preprocessed short descriptions, CountVectorizer, unigrams + bigrams.
for top_k in TOP_KS:
    print(f"For top {top_k} features selected:\n")
    write_df(
        dataset=processed_split_description,
        preprocessed=1,
        by='description',
        clf=ComplementNB,
        vect=CountVectorizer,
        ngram=(1, 2),
        topk=top_k,
    )
    print('\n')

For top 5000 features selected:

Train classification accuracy = 0.40695908689212235,
 Test classification accuracy = 0.3874436782753728


For top 6000 features selected:

Train classification accuracy = 0.41407873937342077,
 Test classification accuracy = 0.3920240969853875


For top 7000 features selected:

Train classification accuracy = 0.41708467656613685,
 Test classification accuracy = 0.3938911154813174


For top 8000 features selected:

Train classification accuracy = 0.41768212992121084,
 Test classification accuracy = 0.39386622190137166


For top 9000 features selected:

Train classification accuracy = 0.4180057504885426,
 Test classification accuracy = 0.39386622190137166


For top 10000 features selected:

Train classification accuracy = 0.41813644340996503,
 Test classification accuracy = 0.39364217968186005


For top 11000 features selected:

Train classification accuracy = 0.4181302199375163,
 Test classification accuracy = 0.3936670732618058


For top 12000 features s

### TfidfVectorizer

#### Unigrams

In [None]:
# Preprocessed short descriptions, TfidfVectorizer, unigrams.
for top_k in TOP_KS:
    print(f"For top {top_k} features selected:\n")
    write_df(
        dataset=processed_split_description,
        preprocessed=1,
        by='description',
        clf=ComplementNB,
        vect=TfidfVectorizer,
        topk=top_k,
    )
    print('\n')

For top 5000 features selected:

Train classification accuracy = 0.41239840181227516,
 Test classification accuracy = 0.39247218142441065


For top 6000 features selected:

Train classification accuracy = 0.42020885973537797,
 Test classification accuracy = 0.3984964277712778


For top 7000 features selected:

Train classification accuracy = 0.42447816183517756,
 Test classification accuracy = 0.39869557641084363


For top 8000 features selected:

Train classification accuracy = 0.43011662787368843,
 Test classification accuracy = 0.40220557118319183


For top 9000 features selected:

Train classification accuracy = 0.4338382643980035,
 Test classification accuracy = 0.4035498245002614


For top 10000 features selected:

Train classification accuracy = 0.44017375935076736,
 Test classification accuracy = 0.40613875681461753


For top 11000 features selected:

Train classification accuracy = 0.4448351402148343,
 Test classification accuracy = 0.40790620099076447


For top 12000 features

#### Unigrams and Bigrams

In [None]:
# Preprocessed short descriptions, TfidfVectorizer, unigrams + bigrams.
for top_k in TOP_KS:
    print(f"For top {top_k} features selected:\n")
    write_df(
        dataset=processed_split_description,
        preprocessed=1,
        by='description',
        clf=ComplementNB,
        vect=TfidfVectorizer,
        ngram=(1, 2),
        topk=top_k,
    )
    print('\n')

For top 5000 features selected:

Train classification accuracy = 0.40610024769420344,
 Test classification accuracy = 0.3863732543377063


For top 6000 features selected:

Train classification accuracy = 0.4130892072540795,
 Test classification accuracy = 0.39053048218864356


For top 7000 features selected:

Train classification accuracy = 0.41386714131016544,
 Test classification accuracy = 0.3904806950287521


For top 8000 features selected:

Train classification accuracy = 0.4141845384050485,
 Test classification accuracy = 0.39082920514799235


For top 9000 features selected:

Train classification accuracy = 0.4142281027121893,
 Test classification accuracy = 0.39120260884717833


For top 10000 features selected:

Train classification accuracy = 0.4150682714927621,
 Test classification accuracy = 0.39122750242712406


For top 11000 features selected:

Train classification accuracy = 0.41532965733560695,
 Test classification accuracy = 0.3914266510666899


For top 12000 features se

## Without preprocessing

### CountVectorizer

#### Unigrams

In [None]:
# Raw short descriptions, CountVectorizer, unigrams.
for top_k in TOP_KS:
    print(f"For top {top_k} features selected:\n")
    write_df(
        dataset=raw_split_description,
        preprocessed=0,
        by='description',
        clf=ComplementNB,
        vect=CountVectorizer,
        topk=top_k,
    )
    print('\n')

For top 5000 features selected:

Train classification accuracy = 0.41409740979076687,
 Test classification accuracy = 0.39645515421572775


For top 6000 features selected:

Train classification accuracy = 0.4198727922231488,
 Test classification accuracy = 0.3995170645490528


For top 7000 features selected:

Train classification accuracy = 0.4245403965596644,
 Test classification accuracy = 0.4004630205869906


For top 8000 features selected:

Train classification accuracy = 0.42956273882575524,
 Test classification accuracy = 0.4027781235219437


For top 9000 features selected:

Train classification accuracy = 0.4334710795235309,
 Test classification accuracy = 0.4040228025192303


For top 10000 features selected:

Train classification accuracy = 0.4378586275998556,
 Test classification accuracy = 0.4056159916357571


For top 11000 features selected:

Train classification accuracy = 0.4419287785812972,
 Test classification accuracy = 0.40743322297179557


For top 12000 features selec

#### Unigrams and bigrams

In [None]:
# Raw short descriptions, CountVectorizer, unigrams + bigrams.
for top_k in TOP_KS:
    print(f"For top {top_k} features selected:\n")
    write_df(
        dataset=raw_split_description,
        preprocessed=0,
        by='description',
        clf=ComplementNB,
        vect=CountVectorizer,
        ngram=(1, 2),
        topk=top_k,
    )
    print('\n')

For top 5000 features selected:

Train classification accuracy = 0.4007605083332296,
 Test classification accuracy = 0.38525304324014836


For top 6000 features selected:

Train classification accuracy = 0.40794239553901496,
 Test classification accuracy = 0.3890866545517911


For top 7000 features selected:

Train classification accuracy = 0.41379868311322987,
 Test classification accuracy = 0.3923228199447363


For top 8000 features selected:

Train classification accuracy = 0.4192379980333827,
 Test classification accuracy = 0.3954594110178985


For top 9000 features selected:

Train classification accuracy = 0.42258622621077657,
 Test classification accuracy = 0.39735132309377413


For top 10000 features selected:

Train classification accuracy = 0.42347618277093885,
 Test classification accuracy = 0.3973264295138284


For top 11000 features selected:

Train classification accuracy = 0.4239180493147957,
 Test classification accuracy = 0.3971023872943168


For top 12000 features sel

### TfidfVectorizer

#### Unigrams

In [None]:
# Raw short descriptions, TfidfVectorizer, unigrams.
for top_k in TOP_KS:
    print(f"For top {top_k} features selected:\n")
    write_df(
        dataset=raw_split_description,
        preprocessed=0,
        by='description',
        clf=ComplementNB,
        vect=TfidfVectorizer,
        topk=top_k,
    )
    print('\n')

For top 5000 features selected:

Train classification accuracy = 0.4191570928915498,
 Test classification accuracy = 0.3995419581289985


For top 6000 features selected:

Train classification accuracy = 0.4258971135534783,
 Test classification accuracy = 0.4042717383186876


For top 7000 features selected:

Train classification accuracy = 0.43065806997672423,
 Test classification accuracy = 0.4059645017549974


For top 8000 features selected:

Train classification accuracy = 0.43534434473058586,
 Test classification accuracy = 0.40748301013168703


For top 9000 features selected:

Train classification accuracy = 0.4396136468303855,
 Test classification accuracy = 0.4090015185083767


For top 10000 features selected:

Train classification accuracy = 0.44410699393833786,
 Test classification accuracy = 0.41004704886609744


For top 11000 features selected:

Train classification accuracy = 0.448843056471789,
 Test classification accuracy = 0.4118144930422444


For top 12000 features selec

#### Unigrams and bigrams

In [None]:
# Raw short descriptions, TfidfVectorizer, unigrams + bigrams.
for top_k in TOP_KS:
    print(f"For top {top_k} features selected:\n")
    write_df(
        dataset=raw_split_description,
        preprocessed=0,
        by='description',
        clf=ComplementNB,
        vect=TfidfVectorizer,
        ngram=(1, 2),
        topk=top_k,
    )
    print('\n')

For top 5000 features selected:

Train classification accuracy = 0.40422698248714856,
 Test classification accuracy = 0.38729431679569837


For top 6000 features selected:

Train classification accuracy = 0.4120623343000461,
 Test classification accuracy = 0.3930696273431082


For top 7000 features selected:

Train classification accuracy = 0.41786883409467146,
 Test classification accuracy = 0.3970028129745339


For top 8000 features selected:

Train classification accuracy = 0.4230031988648386,
 Test classification accuracy = 0.39961663886883575


For top 9000 features selected:

Train classification accuracy = 0.4237002277790916,
 Test classification accuracy = 0.3997660003485101


For top 10000 features selected:

Train classification accuracy = 0.42402384834642337,
 Test classification accuracy = 0.3996664260287272


For top 11000 features selected:

Train classification accuracy = 0.4244968322525236,
 Test classification accuracy = 0.3996415324487815


For top 12000 features sele

# Classifying on headline

## With preprocessing

### CountVectorizer

#### Unigrams

In [None]:
# Preprocessed headlines, CountVectorizer, unigrams.
for top_k in TOP_KS:
    print(f"For top {top_k} features selected:\n")
    write_df(
        dataset=processed_split_headline,
        preprocessed=1,
        by='headline',
        clf=ComplementNB,
        vect=CountVectorizer,
        topk=top_k,
    )
    print('\n')

For top 5000 features selected:

Train classification accuracy = 0.5389464905838862,
 Test classification accuracy = 0.5149237011774663


For top 6000 features selected:

Train classification accuracy = 0.5466262555855665,
 Test classification accuracy = 0.5201513529660701


For top 7000 features selected:

Train classification accuracy = 0.5557934305024832,
 Test classification accuracy = 0.5256279405541311


For top 8000 features selected:

Train classification accuracy = 0.5611580637532517,
 Test classification accuracy = 0.5274202783102238


For top 9000 features selected:

Train classification accuracy = 0.5674002066192853,
 Test classification accuracy = 0.5318513355405641


For top 10000 features selected:

Train classification accuracy = 0.5736050086506267,
 Test classification accuracy = 0.5335192053969281


For top 11000 features selected:

Train classification accuracy = 0.5791065582952665,
 Test classification accuracy = 0.5356849468522068


For top 12000 features selected:

#### Unigrams and bigrams

In [None]:
# Preprocessed headlines, CountVectorizer, unigrams + bigrams.
for top_k in TOP_KS:
    print(f"For top {top_k} features selected:\n")
    write_df(
        dataset=processed_split_headline,
        preprocessed=1,
        by='headline',
        clf=ComplementNB,
        vect=CountVectorizer,
        ngram=(1, 2),
        topk=top_k,
    )
    print('\n')

For top 5000 features selected:

Train classification accuracy = 0.5239541454549981,
 Test classification accuracy = 0.5039705260013443


For top 6000 features selected:

Train classification accuracy = 0.535206183642225,
 Test classification accuracy = 0.5119613651639242


For top 7000 features selected:

Train classification accuracy = 0.5444729341183207,
 Test classification accuracy = 0.5183092280500858


For top 8000 features selected:

Train classification accuracy = 0.5533724997199437,
 Test classification accuracy = 0.523586666998581


For top 9000 features selected:

Train classification accuracy = 0.5551150720055762,
 Test classification accuracy = 0.5234870926787981


For top 10000 features selected:

Train classification accuracy = 0.5559427938412517,
 Test classification accuracy = 0.523586666998581


For top 11000 features selected:

Train classification accuracy = 0.5586126635217386,
 Test classification accuracy = 0.5247815588359762


For top 12000 features selected:

T

### TfidfVectorizer

#### Unigrams

In [None]:
# Preprocessed headlines, TfidfVectorizer, unigrams.
for top_k in TOP_KS:
    print(f"For top {top_k} features selected:\n")
    write_df(
        dataset=processed_split_headline,
        preprocessed=1,
        by='headline',
        clf=ComplementNB,
        vect=TfidfVectorizer,
        topk=top_k,
    )
    print('\n')

For top 5000 features selected:

Train classification accuracy = 0.5403903361919816,
 Test classification accuracy = 0.5166413581937218


For top 6000 features selected:

Train classification accuracy = 0.5480763246661107,
 Test classification accuracy = 0.5227153917004804


For top 7000 features selected:

Train classification accuracy = 0.5568949851259009,
 Test classification accuracy = 0.5266734709118518


For top 8000 features selected:

Train classification accuracy = 0.5628757421490895,
 Test classification accuracy = 0.5287396380473476


For top 9000 features selected:

Train classification accuracy = 0.56842085610087,
 Test classification accuracy = 0.5309551666625177


For top 10000 features selected:

Train classification accuracy = 0.5755031677474763,
 Test classification accuracy = 0.5342411192153543


For top 11000 features selected:

Train classification accuracy = 0.5808740244706937,
 Test classification accuracy = 0.5355853725324239


For top 12000 features selected:



#### Unigrams and bigrams

In [None]:
# Preprocessed headlines, TfidfVectorizer, unigrams + bigrams.
for top_k in TOP_KS:
    print(f"For top {top_k} features selected:\n")
    write_df(
        dataset=processed_split_headline,
        preprocessed=1,
        by='headline',
        clf=ComplementNB,
        vect=TfidfVectorizer,
        ngram=(1, 2),
        topk=top_k,
    )
    print('\n')

For top 5000 features selected:

Train classification accuracy = 0.5236927596121532,
 Test classification accuracy = 0.50399541958129


For top 6000 features selected:

Train classification accuracy = 0.5354488990677239,
 Test classification accuracy = 0.5125090239227303


For top 7000 features selected:

Train classification accuracy = 0.5453504437335855,
 Test classification accuracy = 0.5180105050907371


For top 8000 features selected:

Train classification accuracy = 0.5519411010567457,
 Test classification accuracy = 0.5217943292424884


For top 9000 features selected:

Train classification accuracy = 0.5527874933097671,
 Test classification accuracy = 0.5218690099823255


For top 10000 features selected:

Train classification accuracy = 0.5540197408546073,
 Test classification accuracy = 0.5222424136815116


For top 11000 features selected:

Train classification accuracy = 0.5581832439227792,
 Test classification accuracy = 0.5245575166164647


For top 12000 features selected:



## Without preprocessing

### CountVectorizer

#### Unigrams

In [None]:
# Raw headlines, CountVectorizer, unigrams.
for top_k in TOP_KS:
    print(f"For top {top_k} features selected:\n")
    write_df(
        dataset=raw_split_headline,
        preprocessed=0,
        by='headline',
        clf=ComplementNB,
        vect=CountVectorizer,
        topk=top_k,
    )
    print('\n')

For top 5000 features selected:

Train classification accuracy = 0.549563734581347,
 Test classification accuracy = 0.5273704911503323


For top 6000 features selected:

Train classification accuracy = 0.5572808404177195,
 Test classification accuracy = 0.5318264419606183


For top 7000 features selected:

Train classification accuracy = 0.5644440572061588,
 Test classification accuracy = 0.5353115431530209


For top 8000 features selected:

Train classification accuracy = 0.5705181663160778,
 Test classification accuracy = 0.5383734534863459


For top 9000 features selected:

Train classification accuracy = 0.5754782738576816,
 Test classification accuracy = 0.5411864280202136


For top 10000 features selected:

Train classification accuracy = 0.5807495550217199,
 Test classification accuracy = 0.5428791914565234


For top 11000 features selected:

Train classification accuracy = 0.5857905677051568,
 Test classification accuracy = 0.5450200393318563


For top 12000 features selected:


#### Unigrams and Bigrams

In [None]:
# Raw headlines, CountVectorizer, unigrams + bigrams.
for top_k in TOP_KS:
    print(f"For top {top_k} features selected:\n")
    write_df(
        dataset=raw_split_headline,
        preprocessed=0,
        by='headline',
        clf=ComplementNB,
        vect=CountVectorizer,
        ngram=(1, 2),
        topk=top_k,
    )
    print('\n')

For top 5000 features selected:

Train classification accuracy = 0.5332457898208884,
 Test classification accuracy = 0.5152722112967065


For top 6000 features selected:

Train classification accuracy = 0.5428237139194185,
 Test classification accuracy = 0.5210724154240621


For top 7000 features selected:

Train classification accuracy = 0.5502296461333566,
 Test classification accuracy = 0.5252794304348909


For top 8000 features selected:

Train classification accuracy = 0.5581645735054331,
 Test classification accuracy = 0.530556869383386


For top 9000 features selected:

Train classification accuracy = 0.5648610298602208,
 Test classification accuracy = 0.5341166513156257


For top 10000 features selected:

Train classification accuracy = 0.5697713496222352,
 Test classification accuracy = 0.5375021781882453


For top 11000 features selected:

Train classification accuracy = 0.5711342900884978,
 Test classification accuracy = 0.5373279231286251


For top 12000 features selected:


### TfidfVectorizer

#### Unigrams

In [None]:
# Raw headlines, TfidfVectorizer, unigrams.
for top_k in TOP_KS:
    print(f"For top {top_k} features selected:\n")
    write_df(
        dataset=raw_split_headline,
        preprocessed=0,
        by='headline',
        clf=ComplementNB,
        vect=TfidfVectorizer,
        topk=top_k,
    )
    print('\n')

For top 5000 features selected:

Train classification accuracy = 0.5500802827945881,
 Test classification accuracy = 0.5286151701476189


For top 6000 features selected:

Train classification accuracy = 0.5574177568115906,
 Test classification accuracy = 0.5320504841801299


For top 7000 features selected:

Train classification accuracy = 0.5649108176398103,
 Test classification accuracy = 0.5347638843942147


For top 8000 features selected:

Train classification accuracy = 0.5708791277181016,
 Test classification accuracy = 0.5370043065893306


For top 9000 features selected:

Train classification accuracy = 0.5762624313862162,
 Test classification accuracy = 0.5396181324836324


For top 10000 features selected:

Train classification accuracy = 0.5813034440696531,
 Test classification accuracy = 0.541435363819671


For top 11000 features selected:

Train classification accuracy = 0.5868361110765362,
 Test classification accuracy = 0.5442234447735929


For top 12000 features selected:


#### Unigrams and Bigrams

In [None]:
# Raw headlines, TfidfVectorizer, unigrams + bigrams.
for top_k in TOP_KS:
    print(f"For top {top_k} features selected:\n")
    write_df(
        dataset=raw_split_headline,
        preprocessed=0,
        by='headline',
        clf=ComplementNB,
        vect=TfidfVectorizer,
        ngram=(1, 2),
        topk=top_k,
    )
    print('\n')

For top 5000 features selected:

Train classification accuracy = 0.5337436676167835,
 Test classification accuracy = 0.5151726369769236


For top 6000 features selected:

Train classification accuracy = 0.5422324840367931,
 Test classification accuracy = 0.5204998630853103


For top 7000 features selected:

Train classification accuracy = 0.5507337474017002,
 Test classification accuracy = 0.5263249607926116


For top 8000 features selected:

Train classification accuracy = 0.5577164834891276,
 Test classification accuracy = 0.5313783575215952


For top 9000 features selected:

Train classification accuracy = 0.5641764478908652,
 Test classification accuracy = 0.5355355853725324


For top 10000 features selected:

Train classification accuracy = 0.5677673914937579,
 Test classification accuracy = 0.5376515396679197


For top 11000 features selected:

Train classification accuracy = 0.5688129348651374,
 Test classification accuracy = 0.5374026038684623


For top 12000 features selected:

# Classifying on full text (headline + short description)

## With preprocessing

### CountVectorizer

#### Unigrams

In [None]:
# Preprocessed full text (headline + description), CountVectorizer, unigrams.
for top_k in TOP_KS:
    print(f"For top {top_k} features selected:\n")
    write_df(
        dataset=processed_split_full_text,
        preprocessed=1,
        by='full_text',
        clf=ComplementNB,
        vect=CountVectorizer,
        topk=top_k,
    )
    print('\n')

For top 5000 features selected:

Train classification accuracy = 0.5555693854943304,
 Test classification accuracy = 0.5393691966841752


For top 6000 features selected:

Train classification accuracy = 0.5633113852204976,
 Test classification accuracy = 0.54429812551343


For top 7000 features selected:

Train classification accuracy = 0.5664853561693283,
 Test classification accuracy = 0.5449702521719648


For top 8000 features selected:

Train classification accuracy = 0.5683772917937292,
 Test classification accuracy = 0.545144507231585


For top 9000 features selected:

Train classification accuracy = 0.5723105263812998,
 Test classification accuracy = 0.5465136541286002


For top 10000 features selected:

Train classification accuracy = 0.5746567754944549,
 Test classification accuracy = 0.5468621642478405


For top 11000 features selected:

Train classification accuracy = 0.577183505308622,
 Test classification accuracy = 0.5473102486868636


For top 12000 features selected:

Tr

#### Unigrams and bigrams

In [None]:
# Preprocessed full text (headline + description), CountVectorizer,
# unigrams + bigrams. ngram=(1, 2) must be passed explicitly: without it
# write_df falls back to its (1, 1) default and this cell merely repeats the
# unigram experiment under a "Unigrams and bigrams" heading.
for i in (TOP_KS):
    print(f"For top {i} features selected:\n")
    write_df(processed_split_full_text, 1, 'full_text', ComplementNB, CountVectorizer, ngram=(1, 2), topk=i)
    print('\n')

For top 5000 features selected:

Train classification accuracy = 0.5555693854943304,
 Test classification accuracy = 0.5393691966841752


For top 6000 features selected:

Train classification accuracy = 0.5633113852204976,
 Test classification accuracy = 0.54429812551343


For top 7000 features selected:

Train classification accuracy = 0.5664853561693283,
 Test classification accuracy = 0.5449702521719648


For top 8000 features selected:

Train classification accuracy = 0.5683772917937292,
 Test classification accuracy = 0.545144507231585


For top 9000 features selected:

Train classification accuracy = 0.5723105263812998,
 Test classification accuracy = 0.5465136541286002


For top 10000 features selected:

Train classification accuracy = 0.5746567754944549,
 Test classification accuracy = 0.5468621642478405


For top 11000 features selected:

Train classification accuracy = 0.577183505308622,
 Test classification accuracy = 0.5473102486868636


For top 12000 features selected:

Tr

### TfidfVectorizer

#### Unigrams

In [None]:
# Preprocessed full text (headline + description), TfidfVectorizer, unigrams.
for top_k in TOP_KS:
    print(f"For top {top_k} features selected:\n")
    write_df(
        dataset=processed_split_full_text,
        preprocessed=1,
        by='full_text',
        clf=ComplementNB,
        vect=TfidfVectorizer,
        topk=top_k,
    )
    print('\n')

For top 5000 features selected:

Train classification accuracy = 0.5590483065931467,
 Test classification accuracy = 0.541236215180105


For top 6000 features selected:

Train classification accuracy = 0.5668587645162495,
 Test classification accuracy = 0.5460406761096314


For top 7000 features selected:

Train classification accuracy = 0.569597092393672,
 Test classification accuracy = 0.5465883348684374


For top 8000 features selected:

Train classification accuracy = 0.5710409380017675,
 Test classification accuracy = 0.5465883348684374


For top 9000 features selected:

Train classification accuracy = 0.5750861950934143,
 Test classification accuracy = 0.5482313111448558


For top 10000 features selected:

Train classification accuracy = 0.5769968011351614,
 Test classification accuracy = 0.5484055662044759


For top 11000 features selected:

Train classification accuracy = 0.5794364023350469,
 Test classification accuracy = 0.5486793955838789


For top 12000 features selected:



#### Unigrams and bigrams

In [None]:
# Preprocessed full text (headline + description), TfidfVectorizer, unigrams + bigrams.
for top_k in TOP_KS:
    print(f"For top {top_k} features selected:\n")
    write_df(
        dataset=processed_split_full_text,
        preprocessed=1,
        by='full_text',
        clf=ComplementNB,
        vect=TfidfVectorizer,
        ngram=(1, 2),
        topk=top_k,
    )
    print('\n')

For top 5000 features selected:

Train classification accuracy = 0.5482879227293661,
 Test classification accuracy = 0.5352617559931294


For top 6000 features selected:

Train classification accuracy = 0.5570194545748746,
 Test classification accuracy = 0.5416842996191282


For top 7000 features selected:

Train classification accuracy = 0.5628632952041921,
 Test classification accuracy = 0.5453436558711509


For top 8000 features selected:

Train classification accuracy = 0.5690432033457388,
 Test classification accuracy = 0.5477832267058326


For top 9000 features selected:

Train classification accuracy = 0.5741775681159059,
 Test classification accuracy = 0.5510940728386149


For top 10000 features selected:

Train classification accuracy = 0.5791190052401638,
 Test classification accuracy = 0.553409175773568


For top 11000 features selected:

Train classification accuracy = 0.5838986320807558,
 Test classification accuracy = 0.556844489806079


For top 12000 features selected:



## Without preprocessing

### CountVectorizer

#### Unigrams

In [None]:
# Raw full text (headline + description), CountVectorizer, unigrams.
for top_k in TOP_KS:
    print(f"For top {top_k} features selected:\n")
    write_df(
        dataset=raw_split_full_text,
        preprocessed=0,
        by='full_text',
        clf=ComplementNB,
        vect=CountVectorizer,
        topk=top_k,
    )
    print('\n')

For top 5000 features selected:

Train classification accuracy = 0.5576418018197433,
 Test classification accuracy = 0.5433272758955465


For top 6000 features selected:

Train classification accuracy = 0.5648921472224643,
 Test classification accuracy = 0.5474596101665381


For top 7000 features selected:

Train classification accuracy = 0.5679292017774238,
 Test classification accuracy = 0.5481317368250728


For top 8000 features selected:

Train classification accuracy = 0.5702007692211947,
 Test classification accuracy = 0.548579821264096


For top 9000 features selected:

Train classification accuracy = 0.5733871871149226,
 Test classification accuracy = 0.549973861741057


For top 10000 features selected:

Train classification accuracy = 0.5763931243076387,
 Test classification accuracy = 0.5511438599985063


For top 11000 features selected:

Train classification accuracy = 0.5806375325176435,
 Test classification accuracy = 0.5529610913345449


For top 12000 features selected:



#### Unigrams and bigrams

In [None]:
# Raw full text (headline + description), CountVectorizer, unigrams + bigrams.
for top_k in TOP_KS:
    print(f"For top {top_k} features selected:\n")
    write_df(
        dataset=raw_split_full_text,
        preprocessed=0,
        by='full_text',
        clf=ComplementNB,
        vect=CountVectorizer,
        ngram=(1, 2),
        topk=top_k,
    )
    print('\n')

For top 5000 features selected:

Train classification accuracy = 0.5456927347182634,
 Test classification accuracy = 0.5342162256354086


For top 6000 features selected:

Train classification accuracy = 0.5516859386863494,
 Test classification accuracy = 0.5384232406462374


For top 7000 features selected:

Train classification accuracy = 0.5576044609850512,
 Test classification accuracy = 0.5430783400960892


For top 8000 features selected:

Train classification accuracy = 0.5636101118980347,
 Test classification accuracy = 0.5470364193074606


For top 9000 features selected:

Train classification accuracy = 0.5685391020773951,
 Test classification accuracy = 0.5495755644619252


For top 10000 features selected:

Train classification accuracy = 0.5727959572322974,
 Test classification accuracy = 0.5515421572776381


For top 11000 features selected:

Train classification accuracy = 0.5768474377963929,
 Test classification accuracy = 0.5533344950337308


For top 12000 features selected:

### TfidfVectorizer

#### Unigrams

In [None]:
# Raw (un-preprocessed) split + TfidfVectorizer; no `ngram` argument is passed,
# so write_df's own default applies (this section is titled "Unigrams").
# One write_df run per feature budget listed in TOP_KS.
# NOTE(review): the positional 0 is presumably write_df's "no preprocessing"
# flag -- confirm against its definition.
for i in TOP_KS:
    print(f"For top {i} features selected:\n")
    write_df(
        raw_split_full_text,
        0,
        'full_text',
        ComplementNB,
        TfidfVectorizer,
        topk=i,
    )
    # Blank separator between consecutive runs in the cell output.
    print('\n')

For top 5000 features selected:

Train classification accuracy = 0.5659625844836385,
 Test classification accuracy = 0.548579821264096


For top 6000 features selected:

Train classification accuracy = 0.5728146276496434,
 Test classification accuracy = 0.5535087500933509


For top 7000 features selected:

Train classification accuracy = 0.5748621500852615,
 Test classification accuracy = 0.5538572602125912


For top 8000 features selected:

Train classification accuracy = 0.5772955278126983,
 Test classification accuracy = 0.5542057703318314


For top 9000 features selected:

Train classification accuracy = 0.58036992320235,
 Test classification accuracy = 0.5549525777302033


For top 10000 features selected:

Train classification accuracy = 0.5831518153869133,
 Test classification accuracy = 0.555176619949715


For top 11000 features selected:

Train classification accuracy = 0.5877136206918012,
 Test classification accuracy = 0.5575415100445595


For top 12000 features selected:

Tr

#### Unigrams and bigrams

In [None]:
# Raw (un-preprocessed) split + TfidfVectorizer with ngram=(1, 2), i.e. the
# unigram-plus-bigram configuration of this section.
# One write_df run per feature budget listed in TOP_KS.
# NOTE(review): the positional 0 is presumably write_df's "no preprocessing"
# flag -- confirm against its definition.
for i in TOP_KS:
    print(f"For top {i} features selected:\n")
    write_df(
        raw_split_full_text,
        0,
        'full_text',
        ComplementNB,
        TfidfVectorizer,
        ngram=(1, 2),
        topk=i,
    )
    # Blank separator between consecutive runs in the cell output.
    print('\n')

For top 5000 features selected:

Train classification accuracy = 0.5518975367496048,
 Test classification accuracy = 0.5409374922207563


For top 6000 features selected:

Train classification accuracy = 0.5580649979462541,
 Test classification accuracy = 0.5462896119090886


For top 7000 features selected:

Train classification accuracy = 0.5636723466225215,
 Test classification accuracy = 0.5491274800229021


For top 8000 features selected:

Train classification accuracy = 0.569877148653863,
 Test classification accuracy = 0.5529859849144906


For top 9000 features selected:

Train classification accuracy = 0.5738850649108176,
 Test classification accuracy = 0.555176619949715


For top 10000 features selected:

Train classification accuracy = 0.5777125004667605,
 Test classification accuracy = 0.5575664036245053


For top 11000 features selected:

Train classification accuracy = 0.5817826514482021,
 Test classification accuracy = 0.559532996440218


For top 12000 features selected:

T

# Saving the results table to a .csv file and checking the top 5 test accuracies

In [None]:
# Persist the full results table to Drive before inspecting it.
results_df.to_csv('/content/drive/MyDrive/SDLC/news_analysis_project/resultsCNB.csv')

# Show the five rows with the highest TestAccuracy.
# A plain print(DataFrame) elides the middle columns ("..."), hiding the
# configuration columns that tell the top runs apart; to_string() renders
# every column of the (5-row) frame.
print(
    results_df
    .sort_values(by='TestAccuracy', ascending=False)
    .head(5)
    .to_string()
)

       Classifier         By  ... TrainAccuracy TestAccuracy
239  ComplementNB  full_text  ...      0.591883     0.563068
238  ComplementNB  full_text  ...      0.588797     0.562197
237  ComplementNB  full_text  ...      0.585392     0.560678
236  ComplementNB  full_text  ...      0.581783     0.559533
229  ComplementNB  full_text  ...      0.595182     0.558836

[5 rows x 8 columns]
