<a href="https://colab.research.google.com/github/romapavelko01/NLP_SDLC_project/blob/classifications/RandomForest_for_all.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from google.colab import drive
drive.mount('/content/drive', force_remount=False)

Mounted at /content/drive


In [31]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

In [21]:
filename = "/content/drive/MyDrive/SDLC/news_analysis_project/data/final_news_category_dataset.json"
df = pd.read_json(filename, orient='split')
df.head()

Unnamed: 0,category,headline,authors,link,short_description,date
0,CRIME,There Were 2 Mass Shootings In Texas Last Week...,Melissa Jeltsen,https://www.huffingtonpost.com/entry/texas-ama...,She left her husband. He killed their children...,2018-05-26
1,ENTERTAINMENT,Will Smith Joins Diplo And Nicky Jam For The 2...,Andy McDonald,https://www.huffingtonpost.com/entry/will-smit...,Of course it has a song.,2018-05-26
2,ENTERTAINMENT,Hugh Grant Marries For The First Time At Age 57,Ron Dicker,https://www.huffingtonpost.com/entry/hugh-gran...,The actor and his longtime girlfriend Anna Ebe...,2018-05-26
3,ENTERTAINMENT,Jim Carrey Blasts 'Castrato' Adam Schiff And D...,Ron Dicker,https://www.huffingtonpost.com/entry/jim-carre...,The actor gives Dems an ass-kicking for not fi...,2018-05-26
4,ENTERTAINMENT,Julianna Margulies Uses Donald Trump Poop Bags...,Ron Dicker,https://www.huffingtonpost.com/entry/julianna-...,"The ""Dietland"" actress said using the bags is ...",2018-05-26


Cutting the dataframe to only have rows from the top 10 most common categories

In [22]:
# df = df[df.category.isin(df['category'].value_counts().index[:10])]
# print(df.category.value_counts())
# df

# Text preprocessing

In [12]:
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [1]:
import re
import string


def cleaning_function(sentence):
    """
    Function needed to perform data preprocessing: removing punctuation symbols,
    stop_words and other random things in order to obtain clean text
    """
    # the following line removes numbers from text
    result = re.sub(r'\d+', '', sentence.lower())

    # the following line removes any punctuation from the text
    result = result.translate(str.maketrans('','',string.punctuation))
    return [word for word in result.split() if not word in stop_words]

In [23]:
df['processed_description'] = df['short_description'].apply(lambda x: ' '.join(cleaning_function(x)))
df['processed_headline'] = df['headline'].apply(lambda x: ' '.join(cleaning_function(x)))
df['full_text'] = df['headline'] + df['short_description']
df['processed_full_text'] = df['processed_headline'] + df['processed_description']

# Splitting the data into train/test with equal distribution of labels in both sets

In [24]:
X_train, X_test, y_train, y_test = train_test_split(df.loc[:, df.columns != 'category'], df['category'], 
                                                    test_size=0.2, stratify=df.category,
                                                    random_state=1)

processed_split_description = (X_train['processed_description'], X_test['processed_description'], y_train, y_test)
processed_split_headline = (X_train['processed_headline'], X_test['processed_headline'], y_train, y_test)
processed_split_full_text = (X_train['processed_full_text'], X_test['processed_full_text'], y_train, y_test)

raw_split_description = (X_train['short_description'], X_test['short_description'], y_train, y_test)
raw_split_headline = (X_train['headline'], X_test['headline'], y_train, y_test)
raw_split_full_text = (X_train['full_text'], X_test['full_text'], y_train, y_test)

# Creating a results table to store accuracies of combinations of vectorizer parameters for ComplementNB and defining a function to record data to the table

In [32]:
results_df = pd.DataFrame(columns=['Classifier', 'By', 'Preprocessed', 'Vectorizer', 'Ngram', 'TopKFeatures', 'TrainAccuracy', 'TestAccuracy'])

def write_df(dataset, preprocessed, by, clf, vect, ngram=(1, 1), topk=6000, display=True):
    """
    Function to calculate accuracies for the given dataset split, 
    whether it is raw or preprocessed; 
    by - in ['headline', 'descirption', 'full_text']
    classifier, 
    vectorizer, ngram parameter, and how many top features to consider;
    record calculated accuracies to the results_df;
    and, once display is set to True, print classification accuracies
    """
    global results_df

    X_train, X_test, y_train, y_test = dataset

    vectorizer = vect(ngram_range=ngram)
    x_train_ = vectorizer.fit_transform(X_train)

    # Vectorize validation texts.
    x_val = vectorizer.transform(X_test)
    # Select top 'k' of the vectorized features.
    selector = SelectKBest(f_classif, k=min(topk, x_train_.shape[1]))
    selector.fit(x_train_, y_train)
    x_train = selector.transform(x_train_).astype('float32')
    x_val = selector.transform(x_val).astype('float32')   

    clf = clf(n_jobs=-1)
    clf.fit(x_train, y_train)
    y_pred_test = clf.predict(x_val)
    y_pred_train = clf.predict(x_train)
    train_acc, test_acc = accuracy_score(y_train, y_pred_train), accuracy_score(y_test, y_pred_test)
    if display:
        print(f"Train classification accuracy = {train_acc},\n Test classification accuracy = {test_acc}")

    results_df = results_df.append(pd.DataFrame({
        'Classifier': [clf.__class__.__name__],
        'By': [by],
        'Preprocessed': [preprocessed],
        'Vectorizer': [vectorizer.__class__.__name__],
        'Ngram': [ngram],
        'TopKFeatures': [topk],
        'TrainAccuracy': [train_acc],
        'TestAccuracy': [test_acc]
    }), ignore_index=True)

In [30]:
TOP_KS = np.arange(9000, 12001, 1500)
TOP_KS

array([ 9000, 10500, 12000])

# Classifying on full text

## CountVectorizer on Unigrams

In [33]:
for i in (TOP_KS):
    print(f"For top {i} features selected:\n")
    write_df(processed_split_full_text, 1, 'full_text', RandomForestClassifier, CountVectorizer, topk=i)
    print('\n')

For top 9000 features selected:

Train classification accuracy = 0.9969193811378997,
 Test classification accuracy = 0.49199671404744716


For top 10500 features selected:

Train classification accuracy = 0.9971869904531933,
 Test classification accuracy = 0.49386373254337707


For top 12000 features selected:

Train classification accuracy = 0.9977782203358185,
 Test classification accuracy = 0.4959298996788728




## TfidfVectorizer on Unigrams

In [34]:
for i in (TOP_KS):
    print(f"For top {i} features selected:\n")
    write_df(processed_split_full_text, 1, 'full_text', RandomForestClassifier, TfidfVectorizer, topk=i)
    print('\n')

For top 9000 features selected:

Train classification accuracy = 0.9984876961949689,
 Test classification accuracy = 0.5094222200094596


For top 10500 features selected:

Train classification accuracy = 0.9985499309194559,
 Test classification accuracy = 0.5107166861666377


For top 12000 features selected:

Train classification accuracy = 0.998786422872506,
 Test classification accuracy = 0.5129571083617535




# Showing top 5 results by test accuracies and saving the results to csv file

In [36]:
results_df.sort_values(by='TestAccuracy', ascending=False).head(5)

Unnamed: 0,Classifier,By,Preprocessed,Vectorizer,Ngram,TopKFeatures,TrainAccuracy,TestAccuracy
5,RandomForestClassifier,full_text,1,TfidfVectorizer,"(1, 1)",12000,0.998786,0.512957
4,RandomForestClassifier,full_text,1,TfidfVectorizer,"(1, 1)",10500,0.99855,0.510717
3,RandomForestClassifier,full_text,1,TfidfVectorizer,"(1, 1)",9000,0.998488,0.509422
2,RandomForestClassifier,full_text,1,CountVectorizer,"(1, 1)",12000,0.997778,0.49593
1,RandomForestClassifier,full_text,1,CountVectorizer,"(1, 1)",10500,0.997187,0.493864


In [37]:
results_df.to_csv('/content/drive/MyDrive/SDLC/news_analysis_project/resultsRandomForest.csv')