<a href="https://colab.research.google.com/github/romapavelko01/NLP_SDLC_project/blob/classifications/checking_results_ML_tuned_at_diff_combos.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import os
import numpy as np

In [2]:
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import MultinomialNB, ComplementNB
from sklearn.linear_model import PassiveAggressiveClassifier, SGDClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

In [3]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
os.chdir('/content/drive/MyDrive/SDLC/news_analysis_project')

# Dataframe for results, with combinations of vectorizers/classifiers/tuned hyperparameter values

# Reading .csv file with best combinations of parameters

## With best combinations received after classification on top 5 most frequent categories

In [5]:
tuned_5 = pd.read_csv('results_for_top_5categories/TunedResultsforTop5categories.csv')
tuned_5 = tuned_5.iloc[:, 1:-1]
tuned_5

Unnamed: 0,Classifier,By,Preprocessed,Vectorizer,Ngram,TopKFeatures,TrainAccuracy,TestAccuracy
0,LinearSVC,full_text,0,TfidfVectorizer,"(1, 1)",7500,,0.905931
1,MultinomialNB,full_text,0,CountVectorizer,"(1, 1)",7500,,0.8862
2,ComplementNB,full_text,0,TfidfVectorizer,"(1, 1)",7500,,0.89003
3,XGBClassifier,full_text,0,CountVectorizer,"(1, 2)",4500,,0.818593
4,PassiveAggressiveClassifier,full_text,0,TfidfVectorizer,"(1, 2)",7500,,0.899721
5,SGDClassifier,full_text,0,CountVectorizer,"(1, 1)",7500,,0.904828
6,RandomForestClassifier,headline,1,TfidfVectorizer,"(1, 1)",7500,,0.820624


## With best combinations received after classification on all categories (full dataset)

In [6]:
maxes_df = pd.DataFrame(columns=['Classifier', 'By', 'Preprocessed', 
                                 'Vectorizer', 'Ngram', 'TopKFeatures', 
                                 'TrainAccuracy', 'TestAccuracy'])
for filename in os.listdir(os.getcwd()):
    if '.csv' in filename and '5' not in filename:
        try:
          best = pd.read_csv(filename).sort_values(by='TestAccuracy', ascending=False).iloc[:1]
          maxes_df = maxes_df.append(best, ignore_index=True)
        except:
          print(filename)

maxes_df = maxes_df.iloc[:, :-1]
maxes_df.sort_values(by='TestAccuracy', ascending=False)

Unnamed: 0,Classifier,By,Preprocessed,Vectorizer,Ngram,TopKFeatures,TrainAccuracy,TestAccuracy
2,LinearSVC,full_text,1,TfidfVectorizer,"(1, 1)",12000,0.687725,0.58587
0,MultinomialNB,full_text,0,CountVectorizer,"(1, 1)",10000,0.629797,0.573946
1,ComplementNB,full_text,0,TfidfVectorizer,"(1, 2)",14000,0.591883,0.563068
6,SGDClassifier,full_text,1,CountVectorizer,"(1, 1)",12000,0.634987,0.561524
3,PassiveAggressiveClassifier,full_text,1,TfidfVectorizer,"(1, 1)",12000,0.696077,0.557591
5,RandomForestClassifier,full_text,1,TfidfVectorizer,"(1, 1)",12000,0.998786,0.512957
4,XGBClassifier,full_text,1,TfidfVectorizer,"(1, 1)",9000,0.485923,0.457469


# Reading news dataset and cleaning it

In [7]:
filename = "data/final_news_category_dataset.json"
df = pd.read_json(filename, orient='split')
df.head(3)

Unnamed: 0,category,headline,authors,link,short_description,date
0,CRIME,There Were 2 Mass Shootings In Texas Last Week...,Melissa Jeltsen,https://www.huffingtonpost.com/entry/texas-ama...,She left her husband. He killed their children...,2018-05-26
1,ENTERTAINMENT,Will Smith Joins Diplo And Nicky Jam For The 2...,Andy McDonald,https://www.huffingtonpost.com/entry/will-smit...,Of course it has a song.,2018-05-26
2,ENTERTAINMENT,Hugh Grant Marries For The First Time At Age 57,Ron Dicker,https://www.huffingtonpost.com/entry/hugh-gran...,The actor and his longtime girlfriend Anna Ebe...,2018-05-26


In [8]:
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [9]:
import re
import string


def cleaning_function(sentence):
    """
    Function needed to perform data preprocessing: removing punctuation symbols,
    stop_words and other random things in order to obtain clean text
    """
    # the following line removes numbers from text
    result = re.sub(r'\d+', '', sentence.lower())

    # the following line removes any punctuation from the text
    result = result.translate(str.maketrans('','',string.punctuation))
    return [word for word in result.split() if not word in stop_words]

In [10]:
df['processed_description'] = df['short_description'].apply(lambda x: ' '.join(cleaning_function(x)))
df['processed_headline'] = df['headline'].apply(lambda x: ' '.join(cleaning_function(x)))
df['full_text'] = df['headline'] + df['short_description']
df['processed_full_text'] = df['processed_headline'] + df['processed_description']

In [11]:
X_train, X_test, y_train, y_test = train_test_split(df.loc[:, df.columns != 'category'], df['category'], 
                                                    test_size=0.2, stratify=df.category,
                                                    random_state=1)

# Testing models

## Helper functions 

In [12]:
tuned_df_all = pd.DataFrame(columns=['Classifier', 'By', 'Preprocessed', 
                                 'Vectorizer', 'Ngram', 'TopKFeatures',
                                 'Best_parameters', 'TestAccuracy'])

In [13]:
tuned_df_all_with_all = pd.DataFrame(columns=['Classifier', 'By', 'Preprocessed', 
                                 'Vectorizer', 'Ngram', 'TopKFeatures',
                                 'Best_parameters', 'TestAccuracy'])

In [14]:
def row_map_clf(clf_name, top5=True):
    res_df = tuned_5 if top5 else maxes_df
    return res_df[tuned_5.Classifier == clf_name]


def test_model(clf_name, params, top5=True, print_results=True):
    global tuned_df_all
    global tuned_df_all_with_all

    to_change = row_map_clf(clf_name, top5)
    row = to_change.iloc[0]
    by, prepr, vect, ngram, topk = row[1:-2]
    what_to_take = ("processed_" if prepr else "") + by

    X_train_this, X_test_this, y_train_this, y_test_this = (X_train[what_to_take], 
                                                            X_test[what_to_take], 
                                                            y_train, 
                                                            y_test)

    vect = eval(f"{vect}()")
    vect = vect.__class__(ngram_range=eval(ngram))
    x_train_ = vect.fit_transform(X_train_this)

    # Vectorize validation texts.
    # current_params = dict(zip(params_names, item))
    clf = eval(f"{clf_name}()")
    clf.set_params(**best_params)
    
    x_val = vect.transform(X_test_this)
    # Select top 'k' of the vectorized features.
    selector = SelectKBest(f_classif, k=min(topk, x_train_.shape[1]))
    selector.fit(x_train_, y_train_this)
    x_train = selector.transform(x_train_).astype('float32')
    x_val = selector.transform(x_val).astype('float32')   

    clf.fit(x_train, y_train_this)
    y_pred_test = clf.predict(x_val)
    test_acc = accuracy_score(y_test_this, y_pred_test)
    if print_results:
        print("Test accuracy on the tuned model on ALL categories: ", test_acc)
    to_change["TestAccuracy"] = test_acc
    to_change["Best_parameters"] = str(params)
    to_change = to_change[[i for i in to_change.columns if i != 'TrainAccuracy']]
    if top5:
        tuned_df_all = tuned_df_all.append(to_change)
    else:
        tuned_df_all_with_all = tuned_df_all_with_all.append(to_change)
    

## Testing models at best parameters, selected by classification on top 5 most frequent categories, and with the best combinations received in the same way

### Multinomial NB

In [15]:
best_params = {'alpha': 1.0, 'fit_prior': True}
test_model("MultinomialNB", best_params)

Test accuracy on the tuned model on ALL categories:  0.5626695875133803


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


### LinearSVC

In [16]:
best_params = {'dual': True, 'max_iter': 4000, 'C': 1}
test_model("LinearSVC", best_params)

Test accuracy on the tuned model on ALL categories:  0.6023499539468771


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


### ComplementNB

In [17]:
best_params = {'alpha': 1.0, 'fit_prior': True, 'norm': False}
test_model("ComplementNB", best_params)

Test accuracy on the tuned model on ALL categories:  0.553782579472754


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


### XGBClassifier

In [18]:
best_params = {'learning_rate': 0.15, 'max_depth': 4, 'n_estimators': 150, 'n_jobs': -1}
test_model("XGBClassifier", best_params)

Test accuracy on the tuned model on ALL categories:  0.5397923875432526


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


### RandomForestClassifier

In [19]:
best_params = {'n_jobs': -1, 'max_depth': None}
test_model("RandomForestClassifier", best_params)

Test accuracy on the tuned model on ALL categories:  0.4828607702073635


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


### PassiveAggressiveClassifier

In [20]:
best_params = {'max_iter': 2500, 'C': 0.5, 'n_jobs': -1}
test_model("PassiveAggressiveClassifier", best_params)

Test accuracy on the tuned model on ALL categories:  0.5795972218764781


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


### SGDClassifier

In [21]:
best_params = {'max_iter': 2500, 'n_jobs': -1, 'learning_rate': 'adaptive', 'eta0': 0.1}
test_model("SGDClassifier", best_params)

Test accuracy on the tuned model on ALL categories:  0.5887082721366159


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


## Checking final tuned_df_all and saving it to .csv file

In [22]:
tuned_df_all

Unnamed: 0,Classifier,By,Preprocessed,Vectorizer,Ngram,TopKFeatures,Best_parameters,TestAccuracy
1,MultinomialNB,full_text,0,CountVectorizer,"(1, 1)",7500,"{'alpha': 1.0, 'fit_prior': True}",0.56267
0,LinearSVC,full_text,0,TfidfVectorizer,"(1, 1)",7500,"{'dual': True, 'max_iter': 4000, 'C': 1}",0.60235
2,ComplementNB,full_text,0,TfidfVectorizer,"(1, 1)",7500,"{'alpha': 1.0, 'fit_prior': True, 'norm': False}",0.553783
3,XGBClassifier,full_text,0,CountVectorizer,"(1, 2)",4500,"{'learning_rate': 0.15, 'max_depth': 4, 'n_est...",0.539792
6,RandomForestClassifier,headline,1,TfidfVectorizer,"(1, 1)",7500,"{'n_jobs': -1, 'max_depth': None}",0.482861
4,PassiveAggressiveClassifier,full_text,0,TfidfVectorizer,"(1, 2)",7500,"{'max_iter': 2500, 'C': 0.5, 'n_jobs': -1}",0.579597
5,SGDClassifier,full_text,0,CountVectorizer,"(1, 1)",7500,"{'max_iter': 2500, 'n_jobs': -1, 'learning_rat...",0.588708


In [23]:
tuned_df_all.to_csv("results_after_tuning/TunedResultsforAllWithTop5Combos.csv")

## Testing models at best parameters, but with the best combinations after classifications on all categories

### MultinomialNB

In [24]:
best_params = {'alpha': 1.0, 'fit_prior': True}
test_model("MultinomialNB", best_params, top5=False)

Test accuracy on the tuned model on ALL categories:  0.3876179333349929


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


### LinearSVC

In [25]:
best_params = {'dual': True, 'max_iter': 4000, 'C': 1}
test_model("LinearSVC", best_params, top5=False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


Test accuracy on the tuned model on ALL categories:  0.581364666052625


### ComplementNB

In [26]:
best_params = {'alpha': 1.0, 'fit_prior': True, 'norm': False}
test_model("ComplementNB", best_params, top5=False)

Test accuracy on the tuned model on ALL categories:  0.551367902218018


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


### XGBClassifier

In [27]:
best_params = {'learning_rate': 0.15, 'max_depth': 4, 'n_estimators': 150, 'n_jobs': -1}
test_model("XGBClassifier", best_params, top5=False)

Test accuracy on the tuned model on ALL categories:  0.5200268850663414


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


### RandomForestClassifier

In [28]:
best_params = {'n_jobs': -1, 'max_depth': None}
test_model("RandomForestClassifier", best_params, top5=False)

Test accuracy on the tuned model on ALL categories:  0.49737372731572527


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


### PassiveAggressiveClassifier

In [29]:
best_params = {'max_iter': 2500, 'C': 0.5, 'n_jobs': -1}
test_model("PassiveAggressiveClassifier", best_params, top5=False)

Test accuracy on the tuned model on ALL categories:  0.5636653307112096


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


### SGDClassifier

In [30]:
best_params = {'max_iter': 2500, 'n_jobs': -1, 'learning_rate': 'adaptive', 'eta0': 0.1}
test_model("SGDClassifier", best_params, top5=False)

Test accuracy on the tuned model on ALL categories:  0.5397177068034154


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


## Checking final *tuned_df_all_wih_all* and saving it to .csv file

In [31]:
tuned_df_all_with_all

Unnamed: 0,Classifier,By,Preprocessed,Vectorizer,Ngram,TopKFeatures,Best_parameters,TestAccuracy
1,ComplementNB,full_text,0,TfidfVectorizer,"(1, 2)",14000,"{'alpha': 1.0, 'fit_prior': True}",0.387618
0,MultinomialNB,full_text,0,CountVectorizer,"(1, 1)",10000,"{'dual': True, 'max_iter': 4000, 'C': 1}",0.581365
2,LinearSVC,full_text,1,TfidfVectorizer,"(1, 1)",12000,"{'alpha': 1.0, 'fit_prior': True, 'norm': False}",0.551368
3,PassiveAggressiveClassifier,full_text,1,TfidfVectorizer,"(1, 1)",12000,"{'learning_rate': 0.15, 'max_depth': 4, 'n_est...",0.520027
6,SGDClassifier,full_text,1,CountVectorizer,"(1, 1)",12000,"{'n_jobs': -1, 'max_depth': None}",0.497374
4,XGBClassifier,full_text,1,TfidfVectorizer,"(1, 1)",9000,"{'max_iter': 2500, 'C': 0.5, 'n_jobs': -1}",0.563665
5,RandomForestClassifier,full_text,1,TfidfVectorizer,"(1, 1)",12000,"{'max_iter': 2500, 'n_jobs': -1, 'learning_rat...",0.539718


In [32]:
tuned_df_all.to_csv("results_after_tuning/TunedResultsforAllWithAllCombos.csv")