In [1]:
# Load libraries
import numpy as np
import pandas as pd
from sklearn.naive_bayes import MultinomialNB, BernoulliNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report,  roc_auc_score
from sklearn.model_selection import KFold, cross_val_score, train_test_split
import collections

In [2]:
# Import text data
raw_training = pd.read_csv("training.csv")
raw_testing = pd.read_csv("test.csv")

# Create bag of words
count = CountVectorizer()
bag_of_words = count.fit_transform(raw_training["article_words"])

# Create feature matrix
X = bag_of_words

# Create bag of words
y = raw_training["topic"]

#######################Resampling Dataset#######################


# Reducing the effect imbalnced by deleting some irrelevant class
# "Irrelevant" classe has 4734 samples in the training data, try to reduce it into 2000
irrelevant = raw_training[raw_training["topic"] == "IRRELEVANT"]
remove_n = 2734
drop_indices = np.random.choice(irrelevant.index, remove_n, replace=False)
irrelevant = irrelevant.drop(drop_indices)

reduce_training =  pd.concat([raw_training[raw_training["topic"] != "IRRELEVANT"], irrelevant],ignore_index=True)
reduce_bag_of_words = count.fit_transform(reduce_training["article_words"])
R_X = reduce_bag_of_words
R_y = reduce_training["topic"]

# Icreasing the minor classes
# Increasing 
topic_class = raw_training[raw_training["topic"] != "IRRELEVANT"]
increase_training = pd.concat([topic_class, topic_class, topic_class, raw_training[raw_training["topic"] == "IRRELEVANT"]], ignore_index=True)
increase_bag_of_words = count.fit_transform(increase_training["article_words"])
I_X = increase_bag_of_words
I_y = increase_training["topic"]

# Considering stop words
count2 = CountVectorizer(stop_words='english')
sw_bag_of_words = count.fit_transform(increase_training["article_words"])
s_X = sw_bag_of_words
s_y = increase_training["topic"]

In [15]:
# Using specific model "method", return specifc score "score" by cross validation

def Model_Score (X, y, method):
    X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.20, random_state=42)
    clf = method
    clf.fit(X_train, y_train)
    y_predicted = clf.predict(X_valid)
    accuracy = accuracy_score(y_valid, y_predicted)
    precision = precision_score(y_valid, y_predicted, average = "macro")
    recall = recall_score(y_valid, y_predicted, average = "macro")
    f1 = f1_score(y_valid, y_predicted, average = "macro")
    
    return accuracy, precision, recall, f1

In [16]:
raw_accuracy, raw_precision, raw_recall, raw_f1 = Model_Score(X, y, DecisionTreeClassifier())

# reduce irrelevant samples
R_accuracy, R_precision, R_recall, R_f1 = Model_Score(R_X, R_y, DecisionTreeClassifier())

# Icreasing the minor classes
I_accuracy, I_precision, I_recall, I_f1 = Model_Score(I_X, I_y, DecisionTreeClassifier())

# Try considering stop words
s_accuracy, s_precision, s_recall, s_f1 = Model_Score(s_X, s_y, DecisionTreeClassifier())

In [None]:
X_train, X_valid, y_train, y_valid = train_test_split(s_X,s_y, test_size=0.20, random_state=42)
clf = DecisionTreeClassifier()

clf = clf.fit(X_train,y_train)

y_predicted = clf.predict(X_valid)

#print("Accuracy:",accuracy_score(y_valid, y_predicted))
#print("Precision:",precision_score(y_valid, y_predicted, average = "macro"))
#print("Recall:",recall_score(y_valid, y_predicted, average = "macro"))
#print("f1:",f1_score(y_valid, y_predicted, average = "macro"))

In [17]:
print("====================================================================================")

print("Stop words,\naccuracy:  " + str(raw_accuracy) +
     "\nprecision: " + str(raw_precision) + "\nrecall:    " + str(raw_recall) + "\nf1:        " +
     str(raw_f1))
print("====================================================================================")

print("Irrelevant classes\naccuracy:  " + str(I_accuracy) +
     "\nprecision: " + str(I_precision) + "\nrecall:    " + str(I_recall) + "\nf1:        " +
     str(I_f1))
print("====================================================================================")

print("Decision Tree increasing minor classes:\naccuracy:  " + str(s_accuracy) +
     "\nprecision: " + str(s_precision) + "\nrecall:    " + str(s_recall) + "\nf1:        " +
     str(s_f1))

print("====================================================================================")

print("Decision Tree stop words\naccuracy:  " + str(R_accuracy) +
     "\nprecision: " + str(R_precision) + "\nrecall:    " + str(R_recall) + "\nf1:        " +
     str(R_f1))

Stop words,
accuracy:  0.6873684210526316
precision: 0.551549234607873
recall:    0.4741841228128939
f1:        0.5024722898748023
Stop words,
accuracy:  0.9088521145258734
precision: 0.8858775280000011
recall:    0.9457035831387229
f1:        0.9123884822667834
Stop words,
accuracy:  0.9075387444181771
precision: 0.8826604461879402
recall:    0.9465426648465818
f1:        0.9108393864401922
Stop words,
accuracy:  0.6410635155096012
precision: 0.5087227465873748
recall:    0.5046252003025178
f1:        0.5032617862469646


In [None]:
len(y_valid)