Importing modules

In [97]:
import pandas as pd
import os
import numpy as np

In [21]:
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

In [3]:
# Google Colab only: mount Google Drive at /content/drive so the project
# folder stored there is reachable from this runtime.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
# Work from the project folder: create_dfs() locates its results directories
# through os.getcwd() and a relative path, so this must run first.
os.chdir('/content/drive/MyDrive/SDLC/news_analysis_project')

Some helper functions

In [5]:
def map_grams(tup):
    """
    Translate a vectorizer ``ngram_range`` into a human-readable legend label.

    Parameters:
    tup: tuple such as (1, 2), or its text form "(1, 2)" - which is what the
         value looks like after it has been written to and read back from a
         .csv file.
    Returns:
    str - "Unigrams" for (1, 1), "Bigrams" for (2, 2) and
    "Unigrams and Bigrams" for (1, 2) or any value that is not recognised.
    """
    import ast  # local import keeps this helper self-contained

    fallback = "Unigrams and Bigrams"
    labels = {
        (1, 1): "Unigrams",
        (2, 2): "Bigrams",
        (1, 2): "Unigrams and Bigrams",
    }

    # Values loaded from .csv are strings; comparing them to a tuple directly
    # is always False, so parse them first.
    if isinstance(tup, str):
        try:
            tup = ast.literal_eval(tup.strip())
        except (ValueError, SyntaxError):
            return fallback

    try:
        # tuple() also accepts lists; TypeError covers None/NaN and
        # unhashable contents.
        return labels.get(tuple(tup), fallback)
    except TypeError:
        return fallback

def map_prepr(bin_val):
    """
    Turn the truthiness of a flag into a label.

    Parameters:
    bin_val: any value; only its truthiness is used.
    Returns:
    str - "Preprocessed" when bin_val is truthy, otherwise "Raw".
    """
    if bin_val:
        return "Preprocessed"
    return "Raw"


# Collecting data 

I analyzed the performance of the models both on the full dataset (all categories) and on a smaller one (only the 5 most frequent categories), so the results are collected separately for each of them.

## Helper function 

This function iterates over the .csv files with results and creates 2 dataframes: one with a single data row per classifier (the top 1 by test accuracy) and the other with up to 50 data rows per classifier (the top 50 by test accuracy).

In [92]:
def create_dfs(tuned=False, top5=False):
    """
    Collect classification results stored as .csv files into dataframes.

    Parameters:
    tuned: bool, identifies on what results data should be collected, whether on tuned or not;
    top5: bool, identifies on what results data should be collected, whether top 5 or all categories
          (ignored when tuned is True)
    Returns:
    tuned=False: tuple - (pd.DataFrame, pd.DataFrame) - (one_row_per_file_df, (up to 50)_rows_per_file_df),
                 each file contributing its rows with the highest TestAccuracy;
                 both frames are empty when no usable .csv file is found.
    tuned=True:  pd.DataFrame - contents of the first .csv file (by file name)
                 found in "results_after_tuning".
    Raises:
    FileNotFoundError - tuned=True and "results_after_tuning" holds no .csv file
                        (os.listdir raises it as well when a directory is missing).
    """
    columns_to_keep = ["Classifier", "By", "Preprocessed", "Vectorizer",
                       "Ngram", "TopKFeatures", "TrainAccuracy", "TestAccuracy"]

    def _csv_paths(folder):
        # List only real .csv files, sorted because os.listdir order is arbitrary.
        return [os.path.join(folder, name)
                for name in sorted(os.listdir(folder))
                if name.lower().endswith(".csv")]

    if tuned:
        results_dir = "results_after_tuning"
        csv_paths = _csv_paths(results_dir)
        if not csv_paths:
            raise FileNotFoundError(f"No .csv files found in '{results_dir}'")
        # Only the first file is returned, so only that one is read.
        return pd.read_csv(csv_paths[0])

    # os.path.join supplies the separator for both folder names.
    results_dir = os.path.join(
        os.getcwd(),
        "results_for_top_5categories" if top5 else "results_for_all")

    best_rows, top_rows = [], []
    for path in _csv_paths(results_dir):
        current_df = (
            pd.read_csv(path)
            # Saved index column; tolerate files written without it.
            .drop(columns=["Unnamed: 0"], errors="ignore")
            .sort_values(by="TestAccuracy", ascending=False)
            .head(50)
        )
        if current_df.empty:
            continue  # nothing to rank in this file
        top_rows.append(current_df)
        best_rows.append(current_df.head(1))

    if not top_rows:
        empty = pd.DataFrame(columns=columns_to_keep)
        return empty, empty.copy()

    # Concatenate once instead of growing a frame inside the loop
    # (DataFrame.append no longer exists in pandas 2.x).
    maxes_df = pd.concat(best_rows, ignore_index=True)
    df_50 = pd.concat(top_rows, ignore_index=True)

    # reindex keeps the fixed column layout; a column missing from every file
    # shows up as NaN instead of raising.
    return (maxes_df.reindex(columns=columns_to_keep),
            df_50.reindex(columns=columns_to_keep))



## Dataframes for all categories

In [7]:
# Default arguments: untuned results for all categories. The first frame holds
# the best row of each results file, the second its top rows (up to 50).
maxes_all, top_50_all = create_dfs()

In [8]:
maxes_all.sort_values(by='TestAccuracy', ascending=False).head()

Unnamed: 0,Classifier,By,Preprocessed,Vectorizer,Ngram,TopKFeatures,TrainAccuracy,TestAccuracy
2,LinearSVC,full_text,1,TfidfVectorizer,"(1, 1)",12000,0.687725,0.58587
0,MultinomialNB,full_text,0,CountVectorizer,"(1, 1)",10000,0.629797,0.573946
1,ComplementNB,full_text,0,TfidfVectorizer,"(1, 2)",14000,0.591883,0.563068
6,SGDClassifier,full_text,1,CountVectorizer,"(1, 1)",12000,0.634987,0.561524
3,PassiveAggressiveClassifier,full_text,1,TfidfVectorizer,"(1, 1)",12000,0.696077,0.557591


## Dataframes for top 5 categories

In [9]:
# Same collection as above, but reading the results obtained on the top 5 categories.
maxes_5, top_50_5 = create_dfs(top5=True)

In [10]:
# Drop rows that contain missing values, then rank by test accuracy (best first).
maxes_5 = maxes_5.dropna().sort_values(by='TestAccuracy', ascending=False)
maxes_5

Unnamed: 0,Classifier,By,Preprocessed,Vectorizer,Ngram,TopKFeatures,TrainAccuracy,TestAccuracy
0,LinearSVC,full_text,0,TfidfVectorizer,"(1, 1)",7500,0.946625,0.905931
2,SGDClassifier,full_text,0,CountVectorizer,"(1, 1)",7500,0.938632,0.901578
1,PassiveAggressiveClassifier,full_text,0,TfidfVectorizer,"(1, 2)",7500,0.949382,0.896936
6,ComplementNB,full_text,0,TfidfVectorizer,"(1, 1)",7500,0.903842,0.89003
5,MultinomialNB,full_text,0,CountVectorizer,"(1, 1)",7500,0.904277,0.8862
4,RandomForestClassifier,headline,1,TfidfVectorizer,"(1, 1)",7500,0.994922,0.820044
3,XGBClassifier,full_text,0,CountVectorizer,"(1, 2)",4500,0.75016,0.745532


# Bar plot representing best combination of vectorizer, ngram, topfeatures etc. for each ML model type

## Helper function

In [96]:
def plot_unique_clf(data):
    """
    Show a bar chart of test accuracy per classifier, highest bar first.

    Parameters:
    data: pd.DataFrame exposing 'Classifier' and 'TestAccuracy' columns;
          it is sorted into a new frame, the argument itself is not modified.
    Returns:
    None - the figure is displayed with fig.show().
    """
    ranked = data.sort_values(by='TestAccuracy', ascending=False)

    bars = go.Bar(
        x=ranked.Classifier.values,
        y=ranked.TestAccuracy.values,
        showlegend=False)
    fig = go.Figure(bars)
    fig.update_layout(
        title="The best performance (test accuracy) for each ML model type")

    # Quantiles 0 and 1 are the smallest and the largest accuracy; the axis
    # starts 0.05 below the smallest one and ends at the largest.
    lowest, highest = ranked.TestAccuracy.quantile([0, 1]).values
    fig.update_yaxes(range=[lowest - 0.05, highest])

    fig.show()

## Building barplot for results after classification on top 5 categories

In [12]:
plot_unique_clf(maxes_5)  # peak_performance_top_5.png

## Building barplot for results after classification on all categories

In [13]:
plot_unique_clf(maxes_all)  # peak_performance_all.png

# Scatter plots

## Accuracy vs. the type of text data used for classification, the vectorizer and the ngram_range

### Helper function to build complex figure

In [98]:
def build_all(dataframe):
    """
    Show a grid of scatter plots (one row per classifier, one column per
    vectorizer) of test accuracy against the number of top features, so the
    effect of vectorizer / ngram_range / topKfeatures / text source on each
    classifier can be compared.

    Parameters:
    dataframe: pd.DataFrame with 'Preprocessed', 'By', 'Ngram', 'TopKFeatures',
               'TestAccuracy', 'Classifier' and 'Vectorizer' columns.
               Note: two helper columns, 'FullBy' and 'NgramRange', are
               written into this frame.
    Returns:
    None - the figure is displayed with big_fig.show().
    """
    # Legend label: preprocessing state followed by the text field used.
    source_label = dataframe["Preprocessed"].apply(map_prepr) + " " + dataframe["By"]
    dataframe["FullBy"] = source_label
    dataframe["NgramRange"] = dataframe["Ngram"].apply(map_grams)

    scatter_options = dict(
        x="TopKFeatures", y="TestAccuracy",
        color="FullBy", symbol="NgramRange",
        facet_row="Classifier", facet_col="Vectorizer",
        facet_row_spacing=0.05)
    big_fig = px.scatter(dataframe, **scatter_options)

    # Fixed canvas size so the facet grid stays readable.
    big_fig.update_layout(autosize=False, width=1100, height=1450)
    big_fig.show()

### Performance of classifiers trained on data with all categories

In [99]:
build_all(top_50_all)  # performance_for_all.png

### Performance after training on data with top 5 categories

In [100]:
build_all(top_50_5)  # performance_for_top_5.png

# Exploring the performance of tuned models (using the parameters fitted to the models trained only on the top 5 most frequent categories)




## Collecting dataframes

In [93]:
# With tuned=True create_dfs returns a single DataFrame rather than a pair.
tuned_df_all = create_dfs(tuned=True)

In [94]:
tuned_df_all

Unnamed: 0.1,Unnamed: 0,Classifier,By,Preprocessed,Vectorizer,Ngram,TopKFeatures,Best_parameters,TestAccuracy
0,1,MultinomialNB,full_text,0,CountVectorizer,"(1, 1)",7500,"{'alpha': 1.0, 'fit_prior': True}",0.56267
1,0,LinearSVC,full_text,0,TfidfVectorizer,"(1, 1)",7500,"{'dual': True, 'max_iter': 4000, 'C': 1}",0.60235
2,2,ComplementNB,full_text,0,TfidfVectorizer,"(1, 1)",7500,"{'alpha': 1.0, 'fit_prior': True, 'norm': False}",0.553783
3,3,XGBClassifier,full_text,0,CountVectorizer,"(1, 2)",4500,"{'learning_rate': 0.15, 'max_depth': 4, 'n_est...",0.539792
4,6,RandomForestClassifier,headline,1,TfidfVectorizer,"(1, 1)",7500,"{'n_jobs': -1, 'max_depth': None}",0.482861
5,4,PassiveAggressiveClassifier,full_text,0,TfidfVectorizer,"(1, 2)",7500,"{'max_iter': 2500, 'C': 0.5, 'n_jobs': -1}",0.579597
6,5,SGDClassifier,full_text,0,CountVectorizer,"(1, 1)",7500,"{'max_iter': 2500, 'n_jobs': -1, 'learning_rat...",0.588708


## Checking the peak performance of tuned classifiers

### On all categories

In [95]:
plot_unique_clf(tuned_df_all)  # max_tuned_performance_all