In [1]:
# Load libraries
import numpy as np
import pandas as pd
from sklearn.naive_bayes import MultinomialNB, BernoulliNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report,  roc_auc_score
from sklearn.model_selection import KFold, cross_val_score, train_test_split
import collections

In [2]:
# Import text data
raw_training = pd.read_csv("training.csv")
raw_testing = pd.read_csv("test.csv")

# Create bag of words
#count = CountVectorizer()
#X_valid = count.fit_transform(raw_testing["article_words"])

# testing y
y_valid = raw_testing["topic"]

# Create bag of words
count = CountVectorizer()
bag_of_words = count.fit_transform(raw_training["article_words"])
X = bag_of_words
#y training set
y = raw_training["topic"]

#transform testing data
X_valid = count.transform(raw_testing["article_words"])

#######################Resampling Dataset#######################


# Reducing the effect imbalnced by deleting some irrelevant class
# "Irrelevant" classe has 4734 samples in the training data, try to reduce it into 2000
irrelevant = raw_training[raw_training["topic"] == "IRRELEVANT"]
remove_n = 2734
drop_indices = np.random.choice(irrelevant.index, remove_n, replace=False)
irrelevant = irrelevant.drop(drop_indices)

reduce_training =  pd.concat([raw_training[raw_training["topic"] != "IRRELEVANT"], irrelevant],ignore_index=True)
reduce_bag_of_words = count.fit_transform(reduce_training["article_words"])
R_X = reduce_bag_of_words
R_y = reduce_training["topic"]
#transform testing data
R_X_valid = count.transform(raw_testing["article_words"])

# Icreasing the minor classes
# Increasing 
topic_class = raw_training[raw_training["topic"] != "IRRELEVANT"]
increase_training = pd.concat([topic_class, topic_class, topic_class, raw_training[raw_training["topic"] == "IRRELEVANT"]], ignore_index=True)
increase_bag_of_words = count.fit_transform(increase_training["article_words"])
I_X = increase_bag_of_words
I_y = increase_training["topic"]
#transform testing data
I_X_valid = count.transform(raw_testing["article_words"])

# Considering stop words
count2 = CountVectorizer(stop_words='english')
sw_bag_of_words = count.fit_transform(increase_training["article_words"])
s_X = sw_bag_of_words
s_y = increase_training["topic"]
#transform testing data
s_X_valid = count.transform(raw_testing["article_words"])

In [3]:
# Using specific model "method", return specifc score "score" by cross validation

def Model_Score (X, y, validation_X, method):
   
    clf = method
    clf.fit(X, y)
    y_predicted = clf.predict(validation_X)
    
    accuracy = accuracy_score(y_valid, y_predicted)
    
    precision = precision_score(y_valid, y_predicted, average = "macro")
    recall = recall_score(y_valid, y_predicted, average = "macro")
    f1 = f1_score(y_valid, y_predicted, average = "macro")
    
    return accuracy, precision, recall, f1, clf

In [4]:
#raw data
raw_accuracy, raw_precision, raw_recall, raw_f1, raw_clf = Model_Score(X, y, X_valid, DecisionTreeClassifier())

# reduce irrelevant samples
R_accuracy, R_precision, R_recall, R_f1, R_clf = Model_Score(R_X, R_y, R_X_valid, DecisionTreeClassifier())

# Icreasing the minor classes
I_accuracy, I_precision, I_recall, I_f1, I_clf = Model_Score(I_X, I_y, I_X_valid, DecisionTreeClassifier())

# Try considering stop words
s_accuracy, s_precision, s_recall, s_f1, s_clf = Model_Score(s_X, s_y, I_X_valid, DecisionTreeClassifier())

In [5]:
print("====================================================================================")

print("Decsision Tree using raw data:\naccuracy:  " + str(raw_accuracy) +
     "\nprecision: " + str(raw_precision) + "\nrecall:    " + str(raw_recall) + "\nf1:        " +
     str(raw_f1))
print("====================================================================================")

print("Decision Tree irrelevant classes:\naccuracy:  " + str(R_accuracy) +
     "\nprecision: " + str(R_precision) + "\nrecall:    " + str(R_recall) + "\nf1:        " +
     str(R_f1))
print("====================================================================================")

print("Decision Tree increasing minor classes:\naccuracy:  " + str(I_accuracy) +
     "\nprecision: " + str(I_precision) + "\nrecall:    " + str(I_recall) + "\nf1:        " +
     str(I_f1))
print("====================================================================================")

print("Decision Tree using minor classes with stop words:\naccuracy:  " + str(s_accuracy) +
     "\nprecision: " + str(s_precision) + "\nrecall:    " + str(s_recall) + "\nf1:        " +
     str(s_f1))
print("====================================================================================")

Decsision Tree using raw data:
accuracy:  0.724
precision: 0.5422977915506378
recall:    0.5555469797174604
f1:        0.5253106096020288
Decision Tree irrelevant classes:
accuracy:  0.656
precision: 0.44052385756425905
recall:    0.5227159758624289
f1:        0.45697979934275923
Decision Tree increasing minor classes:
accuracy:  0.678
precision: 0.4538188280956876
recall:    0.4951380522604779
f1:        0.44482902224883564
Decision Tree using minor classes with stop words:
accuracy:  0.702
precision: 0.44342772894684257
recall:    0.45354999691384135
f1:        0.43752893804241677


In [7]:
import operator

def TopTen (Model, X_test, y_test):
    
    predict_y = Model.predict(X_test)    #get the prediction
    proba_y = Model.predict_proba(X_test)  #get the probablity of each class
    recommendation = dict()
    
    for c in Model.classes_:
        recommendation[c] = dict()
        
    for i in range(len(proba_y)):
        pred = predict_y[i]         # get the predict for this sample
        proba = np.max(proba_y[i])  # get how much probability for this prediction
        recommendation[pred][i + 9501] = proba  #save article number and probability into dict

    for key in recommendation:
        d = recommendation[key]
        sorted_d = dict(sorted(d.items(), key=operator.itemgetter(1),reverse=True))
        if (key == "IRRELEVANT"):   # skip irrelevant class
            continue
        print(key)
        count = 0
        for k in sorted_d:
            print("[" + str(count+1)+ "]  " + str(k) + "  " + str(sorted_d[k]))
            count += 1
            if count == 10:         # 10 articles been recommended
                break
TopTen(raw_clf, X, y)

ARTS CULTURE ENTERTAINMENT
[1]  9655  1.0
[2]  9696  1.0
[3]  9799  1.0
[4]  10014  1.0
[5]  10063  1.0
[6]  10197  1.0
[7]  10212  1.0
[8]  10316  1.0
[9]  10533  1.0
[10]  10598  1.0
BIOGRAPHIES PERSONALITIES PEOPLE
[1]  9510  1.0
[2]  9526  1.0
[3]  9550  1.0
[4]  9682  1.0
[5]  9722  1.0
[6]  9780  1.0
[7]  9790  1.0
[8]  9813  1.0
[9]  9861  1.0
[10]  9863  1.0
DEFENCE
[1]  9542  1.0
[2]  9637  1.0
[3]  9660  1.0
[4]  9684  1.0
[5]  9734  1.0
[6]  9776  1.0
[7]  9784  1.0
[8]  9798  1.0
[9]  9810  1.0
[10]  9828  1.0
DOMESTIC MARKETS
[1]  9536  1.0
[2]  9569  1.0
[3]  9630  1.0
[4]  9642  1.0
[5]  9745  1.0
[6]  9753  1.0
[7]  9935  1.0
[8]  9957  1.0
[9]  9959  1.0
[10]  9969  1.0
FOREX MARKETS
[1]  9501  1.0
[2]  9504  1.0
[3]  9511  1.0
[4]  9514  1.0
[5]  9521  1.0
[6]  9540  1.0
[7]  9564  1.0
[8]  9577  1.0
[9]  9596  1.0
[10]  9623  1.0
HEALTH
[1]  9571  1.0
[2]  9634  1.0
[3]  9644  1.0
[4]  9650  1.0
[5]  9681  1.0
[6]  9683  1.0
[7]  9691  1.0
[8]  9709  1.0
[9]  9714  1