In [1]:
# Load libraries
import numpy as np
import pandas as pd
from sklearn.naive_bayes import MultinomialNB, BernoulliNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report,  roc_auc_score
from sklearn.model_selection import KFold, cross_val_score, train_test_split
import collections

In [2]:
# Import text data
raw_training = pd.read_csv("training.csv")
raw_testing = pd.read_csv("test.csv")

# testing y
y_valid = raw_testing["topic"]

# Create bag of words
count = CountVectorizer()
bag_of_words = count.fit_transform(raw_training["article_words"])

# Create feature matrix
X = bag_of_words

# Create bag of words
y = raw_training["topic"]

#transform testing data
X_valid = count.transform(raw_testing["article_words"])

#######################Resampling Dataset#######################


# Reducing the effect imbalnced by deleting some irrelevant class
# "Irrelevant" classe has 4734 samples in the training data, try to reduce it into 2000
irrelevant = raw_training[raw_training["topic"] == "IRRELEVANT"]
remove_n = 2734
drop_indices = np.random.choice(irrelevant.index, remove_n, replace=False)
irrelevant = irrelevant.drop(drop_indices)

reduce_training =  pd.concat([raw_training[raw_training["topic"] != "IRRELEVANT"], irrelevant],ignore_index=True)
reduce_bag_of_words = count.fit_transform(reduce_training["article_words"])
R_X = reduce_bag_of_words
R_y = reduce_training["topic"]
#transform testing data
R_X_valid = count.transform(raw_testing["article_words"])

# Icreasing the minor classes
# Increasing 
topic_class = raw_training[raw_training["topic"] != "IRRELEVANT"]
increase_training = pd.concat([topic_class, topic_class, topic_class, raw_training[raw_training["topic"] == "IRRELEVANT"]], ignore_index=True)
increase_bag_of_words = count.fit_transform(increase_training["article_words"])
I_X = increase_bag_of_words
I_y = increase_training["topic"]
#transform testing data
I_X_valid = count.transform(raw_testing["article_words"])

# Considering stop words
count2 = CountVectorizer(stop_words='english')
sw_bag_of_words = count.fit_transform(increase_training["article_words"])
s_X = sw_bag_of_words
s_y = increase_training["topic"]
#transform testing data
s_X_valid = count.transform(raw_testing["article_words"])

In [3]:
# Using specific model "method", return specifc score "score" by cross validation

def Model_Score (X, y, validation_X, method):
   
    clf = method
    clf.fit(X, y)
    
    y_predicted = clf.predict(validation_X)
    accuracy = accuracy_score(y_valid, y_predicted)
    
    precision = precision_score(y_valid, y_predicted, average = "macro")
    recall = recall_score(y_valid, y_predicted, average = "macro")
    f1 = f1_score(y_valid, y_predicted, average = "macro")
    
    return accuracy, precision, recall, f1

In [4]:
raw_accuracy, raw_precision, raw_recall, raw_f1 = Model_Score(X, y, X_valid, RandomForestClassifier())

# reduce irrelevant samples
R_accuracy, R_precision, R_recall, R_f1 = Model_Score(R_X, R_y, R_X_valid, RandomForestClassifier())

# Icreasing the minor classes
I_accuracy, I_precision, I_recall, I_f1 = Model_Score(I_X, I_y, I_X_valid, RandomForestClassifier())

# Try considering stop words
s_accuracy, s_precision, s_recall, s_f1 = Model_Score(s_X, s_y, s_X_valid, RandomForestClassifier())

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [5]:
print("====================================================================================")

print("Random Forest using raw data:\naccuracy:  " + str(raw_accuracy) +
     "\nprecision: " + str(raw_precision) + "\nrecall:    " + str(raw_recall) + "\nf1:        " +
     str(raw_f1))
print("====================================================================================")

print("Random Forest with irrelevant classes:\naccuracy:  " + str(R_accuracy) +
     "\nprecision: " + str(R_precision) + "\nrecall:    " + str(R_recall) + "\nf1:        " +
     str(R_f1))
print("====================================================================================")

print("Random Forest with increasing minor classes:\naccuracy:  " + str(I_accuracy) +
     "\nprecision: " + str(I_precision) + "\nrecall:    " + str(I_recall) + "\nf1:        " +
     str(I_f1))
print("====================================================================================")

print("Random Forest using minorcalsses with stop words:\naccuracy:  " + str(s_accuracy) +
     "\nprecision: " + str(s_precision) + "\nrecall:    " + str(s_recall) + "\nf1:        " +
     str(s_f1))

Random Forest using raw data:
accuracy:  0.724
precision: 0.34539439285202
recall:    0.25453862499057006
f1:        0.25223932541715227
Random Forest with irrelevant classes:
accuracy:  0.734
precision: 0.46186893620303626
recall:    0.3267942869487492
f1:        0.3416428021345668
Random Forest with increasing minor classes:
accuracy:  0.724
precision: 0.5188252959752111
recall:    0.33501817595067024
f1:        0.36678533344653635
Random Forest using minorcalsses with stop words:
accuracy:  0.74
precision: 0.5801975390106834
recall:    0.34417857123806783
f1:        0.3842118357212598
