<a href="https://colab.research.google.com/github/romapavelko01/NLP_SDLC_project/blob/classifications/tuning_models_for_top5.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import os
import numpy as np

Importing sklearn models, which were used for news classification

In [2]:
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import MultinomialNB, ComplementNB
from sklearn.linear_model import PassiveAggressiveClassifier, SGDClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split


In [3]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
os.chdir('/content/drive/MyDrive/SDLC/news_analysis_project')

# Creating a dataframe with best combinations by test accuracy for each model type

In [9]:
maxes_df = pd.DataFrame(columns=['Classifier', 'By', 'Preprocessed', 
                                 'Vectorizer', 'Ngram', 'TopKFeatures', 
                                 'TrainAccuracy', 'TestAccuracy'])
fold_top5 = 'results_for_top_5categories'
for filename in os.listdir(fold_top5):
    if '.csv' in filename:
        best = pd.read_csv(fold_top5 + '/' +filename).sort_values(by='TestAccuracy', ascending=False).iloc[:1]
        maxes_df = maxes_df.append(best, ignore_index=True)

maxes_df.sort_values(by='TestAccuracy', ascending=False).head(5)

Unnamed: 0.1,Classifier,By,Preprocessed,Vectorizer,Ngram,TopKFeatures,TrainAccuracy,TestAccuracy,Unnamed: 0
0,LinearSVC,full_text,0,TfidfVectorizer,"(1, 1)",7500,0.946625,0.905931,0.0
2,SGDClassifier,full_text,0,CountVectorizer,"(1, 1)",7500,0.938632,0.901578,0.0
1,PassiveAggressiveClassifier,full_text,0,TfidfVectorizer,"(1, 2)",7500,0.949382,0.896936,0.0
6,ComplementNB,full_text,0,TfidfVectorizer,"(1, 1)",7500,0.903842,0.89003,0.0
5,MultinomialNB,full_text,0,CountVectorizer,"(1, 1)",7500,0.904277,0.8862,0.0


# Preparation steps

## Creating a dataframe where accuracies of tuned models will be stored 

In [10]:
tuned_df = pd.DataFrame(columns=maxes_df.columns)

## Reading the final news dataset, performing data preprocessing and splitting it into final train/test on which performance will be measured

In [11]:
filename = "data/final_news_category_dataset.json"
df = pd.read_json(filename, orient='split')
df.head(3)

Unnamed: 0,category,headline,authors,link,short_description,date
0,CRIME,There Were 2 Mass Shootings In Texas Last Week...,Melissa Jeltsen,https://www.huffingtonpost.com/entry/texas-ama...,She left her husband. He killed their children...,2018-05-26
1,ENTERTAINMENT,Will Smith Joins Diplo And Nicky Jam For The 2...,Andy McDonald,https://www.huffingtonpost.com/entry/will-smit...,Of course it has a song.,2018-05-26
2,ENTERTAINMENT,Hugh Grant Marries For The First Time At Age 57,Ron Dicker,https://www.huffingtonpost.com/entry/hugh-gran...,The actor and his longtime girlfriend Anna Ebe...,2018-05-26


In [13]:
top5_df = df[df.category.isin(df.category.value_counts().index[:5])]
print("Number of elements in the the dataset with top 5 most frequent categories: ", len(top5_df))
print(top5_df.category.value_counts(normalize=True))

Number of elements in the the dataset with top 5 most frequent categories:  86160
POLITICS          0.379979
WELLNESS          0.206906
ENTERTAINMENT     0.186374
TRAVEL            0.114752
STYLE & BEAUTY    0.111989
Name: category, dtype: float64


In [14]:
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [15]:
import re
import string


def cleaning_function(sentence):
    """
    Function needed to perform data preprocessing: removing punctuation symbols,
    stop_words and other random things in order to obtain clean text
    """
    # the following line removes numbers from text
    result = re.sub(r'\d+', '', sentence.lower())

    # the following line removes any punctuation from the text
    result = result.translate(str.maketrans('','',string.punctuation))
    return [word for word in result.split() if not word in stop_words]

In [None]:
top5_df['processed_description'] = top5_df['short_description'].apply(lambda x: ' '.join(cleaning_function(x)))
top5_df['processed_headline'] = top5_df['headline'].apply(lambda x: ' '.join(cleaning_function(x)))
top5_df['full_text'] = top5_df['headline'] + top5_df['short_description']
top5_df['processed_full_text'] = top5_df['processed_headline'] + top5_df['processed_description']

## Splitting into train/test data sets

In [17]:
X_train, X_test, y_train, y_test = train_test_split(top5_df.loc[:, top5_df.columns != 'category'], top5_df['category'], 
                                                    test_size=0.2, stratify=top5_df.category,
                                                    random_state=1)


In [18]:
def map_datarow_to_comb(classifier_name):
    """
    Function for mapping row
    """
    cl_row =  maxes_df[maxes_df.Classifier == classifier_name][['Preprocessed', 
                                                                'Vectorizer', 
                                                                'Ngram', 
                                                                'TopKFeatures', 'By']].values[0]
                                          
    return [int(cl_row[0]), eval(f'{classifier_name}()'), eval(f"{cl_row[1]}(ngram_range={cl_row[2]})"), int(cl_row[3]), cl_row[4]] 

## Generating customised folds to feed to GridSearchCV, for reasons that each X fold must be separately fit by vectorizers

In [19]:
from sklearn.model_selection import KFold
n_folds = 2
train_test_splits = []
for train_index, test_index in KFold(n_splits=n_folds).split(X_train):
    train_test_splits.append([X_train.iloc[train_index], X_train.iloc[test_index], y_train.iloc[train_index], y_train.iloc[test_index]])

# Tuning process

## Creating a function to test a model

In [20]:
def test_model(data, model, vect, topk, params):
    X_train_this, X_test_this, y_train_this, y_test_this = data
    x_train_ = vect.fit_transform(X_train_this)

    # Vectorize validation texts.
    # current_params = dict(zip(params_names, item))
    clf = model.__class__(**params)
    x_val = vect.transform(X_test_this)
    # Select top 'k' of the vectorized features.
    selector = SelectKBest(f_classif, k=min(topk, x_train_.shape[1]))
    selector.fit(x_train_, y_train_this)
    x_train = selector.transform(x_train_).astype('float32')
    x_val = selector.transform(x_val).astype('float32')   

    clf.fit(x_train, y_train_this)
    y_pred_test = clf.predict(x_val)
    test_acc = accuracy_score(y_test_this, y_pred_test)
    return test_acc

## Creating a function to which hyperparameters dict will be fed, the model type and corresponding best-accuracy combination (from *maxes_df* dataframe) will be specified

In [21]:
import itertools


def tune_model(params_to_tune, model_name, print_results=True):
    """
    Function to tune hyperparameters for any ML model type considered before,
    Parameters:
    -----------
    params_to_tune: dict, !SHOULD CONTAIN DEFAULT PARAMETERS VALUES!
    model_name: str, specifies what model is passed
    print_results: bool, compare results before and after tuning
    """
    global tuned_df
    global train_test_splits
    pr, model, vect, topk, by = map_datarow_to_comb(model_name)
    data_to_use = ('processed_' if pr else '') + by
    print("Ok")
    
    best_params = None
    best_accuracy = 0
    params_names = list(params_to_tune.keys())
    a = []
    for key in params_to_tune:
        a.append(params_to_tune[key])
    all = itertools.product(*a)
    for item in all:
        history = []   
        for fold in train_test_splits:
            X_train_curr, X_test_curr, y_train_curr, y_test_curr = fold
            x_train_ = vect.fit_transform(X_train_curr[data_to_use])

            # Vectorize validation texts.
            current_params = dict(zip(params_names, item))
            clf = model.__class__(**current_params)
            x_val = vect.transform(X_test_curr[data_to_use])
            # Select top 'k' of the vectorized features.
            selector = SelectKBest(f_classif, k=min(topk, x_train_.shape[1]))
            selector.fit(x_train_, y_train_curr)
            x_train = selector.transform(x_train_).astype('float32')
            x_val = selector.transform(x_val).astype('float32')   

            clf.fit(x_train, y_train_curr)
            y_pred_test = clf.predict(x_val)
            test_acc = accuracy_score(y_test_curr, y_pred_test)
            history.append(test_acc)
        if np.mean(history) > best_accuracy:
            best_accuracy = np.mean(history)
            best_params = current_params

    print("OK")
    data = (X_train[data_to_use], X_test[data_to_use], y_train, y_test)
    print("OK")
    tuned_test_acc = test_model(data, model, vect, topk, best_params)
    to_change = maxes_df[maxes_df['Classifier'] == model_name].copy()
    to_change.reset_index(inplace=True, drop=True)
    to_change.loc[0, 'TrainAccuracy'] = None
    to_change.loc[0, 'TestAccuracy'] = tuned_test_acc
    tuned_df = tuned_df.append(to_change)
    print("OK")
    print("Best model parameters: ", best_params)
    if print_results:
        print("Initial test accuracy: ", maxes_df[maxes_df['Classifier'] == 'LinearSVC']['TestAccuracy'])
        print("Test accuracy after tuning: ", tuned_test_acc)


## Tuning LinearSVC

In [22]:
clf = LinearSVC()
clf.get_params()

{'C': 1.0,
 'class_weight': None,
 'dual': True,
 'fit_intercept': True,
 'intercept_scaling': 1,
 'loss': 'squared_hinge',
 'max_iter': 1000,
 'multi_class': 'ovr',
 'penalty': 'l2',
 'random_state': None,
 'tol': 0.0001,
 'verbose': 0}

In [23]:
tuned_parameters = {
    "dual": [True, False],
    "max_iter": [4000],
    "C": [1, 10]
}

tune_model(tuned_parameters, clf.__class__.__name__)


Ok
OK
OK
OK
Best model parameters:  {'dual': True, 'max_iter': 4000, 'C': 1}
Initial test accuracy:  0    0.905931
Name: TestAccuracy, dtype: float64
Test accuracy after tuning:  0.905930826369545


## Tuning MultinomialNB

In [24]:
clf = MultinomialNB()
clf.get_params()

{'alpha': 1.0, 'class_prior': None, 'fit_prior': True}

In [25]:
tuned_parameters = {
    "alpha": [0.25, 0.5, 1.0, 5.0],
    "fit_prior": [True, False],
    "alpha": [1.0]
}

tune_model(tuned_parameters, clf.__class__.__name__)


Ok
OK
OK
OK
Best model parameters:  {'alpha': 1.0, 'fit_prior': True}
Initial test accuracy:  0    0.905931
Name: TestAccuracy, dtype: float64
Test accuracy after tuning:  0.8862000928505107


## Tuning ComplementNB

In [26]:
clf = ComplementNB()
clf.get_params()

{'alpha': 1.0, 'class_prior': None, 'fit_prior': True, 'norm': False}

In [27]:
tuned_parameters['norm'] = [True, False]

tune_model(tuned_parameters, clf.__class__.__name__)

Ok
OK
OK
OK
Best model parameters:  {'alpha': 1.0, 'fit_prior': True, 'norm': False}
Initial test accuracy:  0    0.905931
Name: TestAccuracy, dtype: float64
Test accuracy after tuning:  0.8900301764159703


## Tuning XGBClassifier

In [28]:
clf = XGBClassifier()
clf.get_params()

{'base_score': 0.5,
 'booster': 'gbtree',
 'colsample_bylevel': 1,
 'colsample_bynode': 1,
 'colsample_bytree': 1,
 'gamma': 0,
 'learning_rate': 0.1,
 'max_delta_step': 0,
 'max_depth': 3,
 'min_child_weight': 1,
 'missing': None,
 'n_estimators': 100,
 'n_jobs': 1,
 'nthread': None,
 'objective': 'binary:logistic',
 'random_state': 0,
 'reg_alpha': 0,
 'reg_lambda': 1,
 'scale_pos_weight': 1,
 'seed': None,
 'silent': None,
 'subsample': 1,
 'verbosity': 1}

In [29]:
tuned_parameters = {
    "learning_rate": [0.05, 0.1, 0.15],
    "max_depth": [2, 3, 4],
    "n_estimators": [100, 150],
    "n_jobs": [-1]
}

tune_model(tuned_parameters, clf.__class__.__name__)

Ok
OK
OK
OK
Best model parameters:  {'learning_rate': 0.15, 'max_depth': 4, 'n_estimators': 150, 'n_jobs': -1}
Initial test accuracy:  0    0.905931
Name: TestAccuracy, dtype: float64
Test accuracy after tuning:  0.8185933147632312


## Tuning RandomForestClassifier

In [42]:
clf = RandomForestClassifier()
clf.get_params()

{'bootstrap': True,
 'ccp_alpha': 0.0,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': None,
 'max_features': 'auto',
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 100,
 'n_jobs': None,
 'oob_score': False,
 'random_state': None,
 'verbose': 0,
 'warm_start': False}

In [43]:
tuned_parameters = {
    "n_jobs": [-1],
    "max_depth": [None, 2, 3, 4, 5]
}

tune_model(tuned_parameters, clf.__class__.__name__)

Ok
OK
OK
OK
Best model parameters:  {'n_jobs': -1, 'max_depth': None}
Initial test accuracy:  0    0.905931
Name: TestAccuracy, dtype: float64
Test accuracy after tuning:  0.8206244196843082


## Tuning PassiveAggressiveClassifier

In [31]:
clf = PassiveAggressiveClassifier()
clf.get_params()

{'C': 1.0,
 'average': False,
 'class_weight': None,
 'early_stopping': False,
 'fit_intercept': True,
 'loss': 'hinge',
 'max_iter': 1000,
 'n_iter_no_change': 5,
 'n_jobs': None,
 'random_state': None,
 'shuffle': True,
 'tol': 0.001,
 'validation_fraction': 0.1,
 'verbose': 0,
 'warm_start': False}

In [32]:
tuned_parameters = {
    "max_iter": [2500],
    "C": [0.5, 1.0, 5.0],
    "n_jobs": [-1]
}

tune_model(tuned_parameters, clf.__class__.__name__)


Ok
OK
OK
OK
Best model parameters:  {'max_iter': 2500, 'C': 0.5, 'n_jobs': -1}
Initial test accuracy:  0    0.905931
Name: TestAccuracy, dtype: float64
Test accuracy after tuning:  0.8997214484679665


## Tuning SGDClassifier

In [33]:
clf = SGDClassifier()
clf.get_params()

{'alpha': 0.0001,
 'average': False,
 'class_weight': None,
 'early_stopping': False,
 'epsilon': 0.1,
 'eta0': 0.0,
 'fit_intercept': True,
 'l1_ratio': 0.15,
 'learning_rate': 'optimal',
 'loss': 'hinge',
 'max_iter': 1000,
 'n_iter_no_change': 5,
 'n_jobs': None,
 'penalty': 'l2',
 'power_t': 0.5,
 'random_state': None,
 'shuffle': True,
 'tol': 0.001,
 'validation_fraction': 0.1,
 'verbose': 0,
 'warm_start': False}

In [36]:
tuned_parameters = {
    "max_iter": [2500, 4000],
    "n_jobs": [-1],
    "learning_rate": ["optimal", "adaptive"],
    "eta0": [0.1, 0.3]
}
tune_model(tuned_parameters, clf.__class__.__name__)

Ok
OK
OK
OK
Best model parameters:  {'max_iter': 2500, 'n_jobs': -1, 'learning_rate': 'adaptive', 'eta0': 0.1}
Initial test accuracy:  0    0.905931
Name: TestAccuracy, dtype: float64
Test accuracy after tuning:  0.9048282265552461


# Checking the tuned_df and saving the dataframe to a .csv file

In [37]:
tuned_df.sort_values(by=['TestAccuracy'], ascending=False).head(5)


Unnamed: 0.1,Classifier,By,Preprocessed,Vectorizer,Ngram,TopKFeatures,TrainAccuracy,TestAccuracy,Unnamed: 0
0,LinearSVC,full_text,0,TfidfVectorizer,"(1, 1)",7500,,0.905931,0.0
0,SGDClassifier,full_text,0,CountVectorizer,"(1, 1)",7500,,0.904828,0.0
0,PassiveAggressiveClassifier,full_text,0,TfidfVectorizer,"(1, 2)",7500,,0.899721,0.0
0,ComplementNB,full_text,0,TfidfVectorizer,"(1, 1)",7500,,0.89003,0.0
0,MultinomialNB,full_text,0,CountVectorizer,"(1, 1)",7500,,0.8862,0.0


In [44]:
tuned_df.to_csv("results_for_top_5categories/TunedResultsforTop5categories.csv")