In [1]:
# Load libraries
import numpy as np
import pandas as pd
from sklearn.naive_bayes import MultinomialNB, BernoulliNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report,  roc_auc_score
from sklearn.model_selection import KFold, cross_val_score, train_test_split
import collections

In [8]:
# Import text data
raw_training = pd.read_csv("training.csv")
raw_testing = pd.read_csv("test.csv")

# testing y
y_valid = raw_testing["topic"]

# Create bag of words
count = CountVectorizer()
bag_of_words = count.fit_transform(raw_training["article_words"])

# Create feature matrix
X = bag_of_words

# Create bag of words
y = raw_training["topic"]

#transform testing data
X_valid = count.transform(raw_testing["article_words"])

#######################Resampling Dataset#######################


# Reducing the effect imbalnced by deleting some irrelevant class
# "Irrelevant" classe has 4734 samples in the training data, try to reduce it into 2000
irrelevant = raw_training[raw_training["topic"] == "IRRELEVANT"]
remove_n = 2734
drop_indices = np.random.choice(irrelevant.index, remove_n, replace=False)
irrelevant = irrelevant.drop(drop_indices)

reduce_training =  pd.concat([raw_training[raw_training["topic"] != "IRRELEVANT"], irrelevant],ignore_index=True)
reduce_bag_of_words = count.fit_transform(reduce_training["article_words"])
R_X = reduce_bag_of_words
R_y = reduce_training["topic"]
#transform testing data
R_X_valid = count.transform(raw_testing["article_words"])

# Icreasing the minor classes
# Increasing 
topic_class = raw_training[raw_training["topic"] != "IRRELEVANT"]
increase_training = pd.concat([topic_class, topic_class, topic_class, raw_training[raw_training["topic"] == "IRRELEVANT"]], ignore_index=True)
increase_bag_of_words = count.fit_transform(increase_training["article_words"])
I_X = increase_bag_of_words
I_y = increase_training["topic"]
#transform testing data
I_X_valid = count.transform(raw_testing["article_words"])

# Considering stop words
count2 = CountVectorizer(stop_words='english')
sw_bag_of_words = count.fit_transform(increase_training["article_words"])
s_X = sw_bag_of_words
s_y = increase_training["topic"]
#transform testing data
s_X_valid = count.transform(raw_testing["article_words"])

In [3]:
# Using specific model "method", return specifc score "score" by cross validation

def Model_Score(X_train, y_train, val_X, val_y, method):
   
    clf = method
    clf.fit(X_train, y_train)
    y_predicted = clf.predict(val_X)
    
    accuracy = accuracy_score(val_y, y_predicted)
    precision = precision_score(val_y, y_predicted, average = "macro")
    recall = recall_score(val_y, y_predicted, average = "macro")
    f1 = f1_score(val_y, y_predicted, average = "macro")
    
    return accuracy, precision, recall, f1


def Model_Score_CrossVal (X, y, method, k=5):
    
    clf = method    
    accuracy_scores = cross_val_score(clf, X, y, cv=k, scoring="accuracy")
    precision_scores = cross_val_score(clf, X, y, cv=k, scoring="precision_macro")
    recall_scores = cross_val_score(clf, X, y, cv=k, scoring="recall_macro")
    f1_scores = cross_val_score(clf, X, y, cv=k, scoring="f1_macro")
    
    return np.mean(accuracy_scores), np.mean(precision_scores), np.mean(recall_scores), np.mean(f1_scores)


# RandomForestClassifier

In [None]:
raw_accuracy, raw_precision, raw_recall, raw_f1 = Model_Score_CrossVal(X, y, RandomForestClassifier(), 10)

# reduce irrelevant samples
R_accuracy, R_precision, R_recall, R_f1 = Model_Score_CrossVal(R_X, R_y, RandomForestClassifier(), 10)

# Icreasing the minor classes
I_accuracy, I_precision, I_recall, I_f1 = Model_Score_CrossVal(I_X, I_y, RandomForestClassifier(), 10)

# Try considering stop words
s_accuracy, s_precision, s_recall, s_f1, = Model_Score_CrossVal(s_X, s_y, RandomForestClassifier(), 10)

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [5]:
print("====================================================================================")

print("Random Forest using raw data:\naccuracy:  " + str(raw_accuracy) +
     "\nprecision: " + str(raw_precision) + "\nrecall:    " + str(raw_recall) + "\nf1:        " +
     str(raw_f1))
print("====================================================================================")

print("Random Forest with irrelevant classes:\naccuracy:  " + str(R_accuracy) +
     "\nprecision: " + str(R_precision) + "\nrecall:    " + str(R_recall) + "\nf1:        " +
     str(R_f1))
print("====================================================================================")

print("Random Forest with increasing minor classes:\naccuracy:  " + str(I_accuracy) +
     "\nprecision: " + str(I_precision) + "\nrecall:    " + str(I_recall) + "\nf1:        " +
     str(I_f1))
print("====================================================================================")

print("Random Forest using minorcalsses with stop words:\naccuracy:  " + str(s_accuracy) +
     "\nprecision: " + str(s_precision) + "\nrecall:    " + str(s_recall) + "\nf1:        " +
     str(s_f1))

Random Forest using raw data:
accuracy:  0.72
precision: 0.42857675689250896
recall:    0.28033322512727543
f1:        0.29119956092431337
Random Forest with irrelevant classes:
accuracy:  0.73
precision: 0.49438858851797013
recall:    0.34153979125375006
f1:        0.3611474085077601
Random Forest with increasing minor classes:
accuracy:  0.74
precision: 0.5310282796324548
recall:    0.32428979296828264
f1:        0.3506697594087641
Random Forest using minorcalsses with stop words:
accuracy:  0.706
precision: 0.5013522578672962
recall:    0.29626161405337603
f1:        0.318101543232486


# SVC (Support Vector Classifier)

In [6]:
raw_accuracy, raw_precision, raw_recall, raw_f1 = Model_Score(X, y, X_valid, y_valid, SVC())

# reduce irrelevant samples
R_accuracy, R_precision, R_recall, R_f1 = Model_Score(R_X, R_y, R_X_valid, y_valid, SVC())

# Icreasing the minor classes
I_accuracy, I_precision, I_recall, I_f1 = Model_Score(I_X, I_y, I_X_valid, y_valid, SVC())

# Try considering stop words
s_accuracy, s_precision, s_recall, s_f1 = Model_Score(s_X, s_y, s_X_valid, y_valid, SVC())

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [7]:
print("====================================================================================")

print("SVM using raw data:\naccuracy:  " + str(raw_accuracy) +
     "\nprecision: " + str(raw_precision) + "\nrecall:    " + str(raw_recall) + "\nf1:        " +
     str(raw_f1))
print("====================================================================================")

print("SVM with irrelevant classes:\naccuracy:  " + str(R_accuracy) +
     "\nprecision: " + str(R_precision) + "\nrecall:    " + str(R_recall) + "\nf1:        " +
     str(R_f1))
print("====================================================================================")

print("SVM with increasing minor classes:\naccuracy:  " + str(I_accuracy) +
     "\nprecision: " + str(I_precision) + "\nrecall:    " + str(I_recall) + "\nf1:        " +
     str(I_f1))
print("====================================================================================")

print("SVM using minorcalsses with stop words:\naccuracy:  " + str(s_accuracy) +
     "\nprecision: " + str(s_precision) + "\nrecall:    " + str(s_recall) + "\nf1:        " +
     str(s_f1))

SVM using raw data:
accuracy:  0.746
precision: 0.47611676793583707
recall:    0.34186073270741457
f1:        0.3722400682974134
SVM with irrelevant classes:
accuracy:  0.754
precision: 0.6417058449316514
recall:    0.45924237852155475
f1:        0.4903006661209375
SVM with increasing minor classes:
accuracy:  0.762
precision: 0.6238407684248873
recall:    0.5092912712763972
f1:        0.5398158154713258
SVM using minorcalsses with stop words:
accuracy:  0.762
precision: 0.6238407684248873
recall:    0.5092912712763972
f1:        0.5398158154713258


# Selecting Top 10

In [15]:
# top ten recommendation function for a given model and test cases

import operator

def TopTen (Model, X_test, y_test):
    predict_y = Model.predict(X_test)    #get the prediction
    proba_y = Model.predict_proba(X_test)  #get the probablity of each class
    recommendation = dict()
    
    for c in Model.classes_:
        recommendation[c] = dict()
        
    for i in range(len(proba_y)):
        pred = predict_y[i]         # get the predict for this sample
        proba = np.max(proba_y[i])  # get how much probability for this prediction
        recommendation[pred][i + 9501] = proba  #save article number and probability into dict

    for key in recommendation:
        d = recommendation[key]
        sorted_d = dict(sorted(d.items(), key=operator.itemgetter(1),reverse=True))
        if (key == "IRRELEVANT"):   # skip irrelevant class
            continue
        print(key)
        count = 0
        for k in sorted_d:
            print("[" + str(count+1)+ "]  " + str(k) + "  " + str(sorted_d[k]))
            count += 1
            if count == 10:         # 10 articles been recommended
                break

In [16]:
clf = SVC(probability=True)
clf.fit(s_X, s_y) # chose the best performing SVC

TopTen(clf, X_valid, y_valid)

ARTS CULTURE ENTERTAINMENT
[1]  9703  0.8458798619729686
[2]  9789  0.8326215876740362
BIOGRAPHIES PERSONALITIES PEOPLE
[1]  9940  0.8716047418374151
[2]  9878  0.7549891388804013
[3]  9533  0.603109119524491
[4]  9933  0.5745796844147691
[5]  9526  0.46306355195764637
[6]  9988  0.4066193766747538
DEFENCE
[1]  9559  0.9985324969133195
[2]  9773  0.9703636516648217
[3]  9770  0.82113598086435
[4]  9670  0.8129892855209125
[5]  9576  0.755234346188909
[6]  9616  0.5277449828718751
DOMESTIC MARKETS
[1]  9994  0.9831742915339339
[2]  9989  0.9791124038992998
[3]  9640  0.5690126118867946
FOREX MARKETS
[1]  9588  0.9597499473742918
[2]  9682  0.9597499473742918
[3]  9798  0.9303403341826001
[4]  9986  0.9190169003131262
[5]  9530  0.9066419213826219
[6]  9565  0.8888635230218048
[7]  9786  0.8822681552128284
[8]  9977  0.8762509668339398
[9]  9718  0.8704086457115645
[10]  9772  0.8603337338998021
HEALTH
[1]  9810  0.951778575704747
[2]  9873  0.9514885638289249
[3]  9937  0.91361417828312