In [1]:
import glob
import os
import numpy as np
import pandas as pd
import seaborn as sns
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import chi2
import pickle

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
import matplotlib.pyplot as plt
from sklearn import svm
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import ShuffleSplit
from sklearn.metrics import confusion_matrix
from sklearn.naive_bayes import MultinomialNB
import random

In [3]:
# Locations of the artefacts written by the dataset-creation and model-training
# notebooks.
# NOTE(review): these are absolute, machine-specific paths; consider making them
# configurable so the notebook runs on other machines.
folder_path = "D:/Projects/News Article classifier/01 Dataset Creation/Pickles//"
models_folder = "D://Projects//News Article classifier//02 Model Training//Models//"


def load_pickle(folder, filename):
    """Return the object unpickled from the file at ``folder + filename``.

    Parameters
    ----------
    folder : str
        Directory prefix; it is concatenated with ``filename`` as-is, so it
        must already end with a path separator.
    filename : str
        Name of the pickle file inside ``folder``.

    Returns
    -------
    object
        Whatever object was stored in the pickle file.

    Raises
    ------
    OSError
        If the file cannot be opened.
    """
    # NOTE(review): pickle.load can execute arbitrary code; only load files
    # that were produced by a trusted source.
    with open(folder + filename, "rb") as file:
        return pickle.load(file)


# Artefacts from the dataset-creation step (loaded in the original order).
features_test = load_pickle(folder_path, "features_test.pickle")
features_train = load_pickle(folder_path, "features_train.pickle")
labels_test = load_pickle(folder_path, "labels_test.pickle")
labels_train = load_pickle(folder_path, "labels_train.pickle")
tfidf = load_pickle(folder_path, "tfidf.pickle")
X_test = load_pickle(folder_path, "X_test.pickle")
X_train = load_pickle(folder_path, "X_train.pickle")
y_test = load_pickle(folder_path, "y_test.pickle")
y_train = load_pickle(folder_path, "y_train.pickle")
data = load_pickle(folder_path, "data.pickle")

# Model saved by the model-training step as "best_rfc.pickle".
model = load_pickle(models_folder, "best_rfc.pickle")


In [4]:
# Run the loaded model on the test features to get one prediction per test row.
prediction = model.predict(features_test)

In [5]:
# Extract the "Content" column of `data` for the rows in X_test's index, so the
# article text is in a readable format when we analyse the misclassified articles.
content = data.loc[list(X_test.index), "Content"]

In [6]:
# Assemble one table for the test articles: text, true code and predicted code.
frame = dict(Content=content, Category_code=y_test, Prediction=prediction)
data_test = pd.DataFrame(data=frame)
data_test.head()

Unnamed: 0,Content,Category_code,Prediction
114,Glaxo aims high after profit fall\n\nGlaxoSmit...,0,0
602,Actor Foxx sees Globe nominations\n\nUS actor ...,1,1
963,Child access laws shake-up\n\nParents who refu...,2,2
1884,Microsoft releases patches\n\nMicrosoft has wa...,4,4
1080,Blair rejects Iraq advice calls\n\nTony Blair ...,2,2


In [7]:
# Lookup tables between category names and their numeric codes.
category_codes = {
    "Business": 0,
    "Entertainment": 1,
    "Politics": 2,
    "Sport": 3,
    "Tech": 4,
}

# Keys are the codes rendered as strings, because the code columns are cast to
# str before the replacement below.
category_names = {
    "0": "Business",
    "1": "Entertainment",
    "2": "Politics",
    "3": "Sport",
    "4": "Tech",
}

# Add readable label columns for both the true and the predicted category code.
data_test = data_test.assign(
    Category=data_test["Category_code"].astype("str").replace(category_names),
    Category_predicted=data_test["Prediction"].astype("str").replace(category_names),
)

In [8]:
# Keep only the rows whose predicted label differs from the true label.
condition = data_test["Category"].ne(data_test["Category_predicted"])
df_misclassified = data_test.loc[condition, ["Content", "Category", "Category_predicted"]]

In [9]:
# Preview the first three misclassified articles.
df_misclassified.head(3)

Unnamed: 0,Content,Category,Category_predicted
1289,Parliament's record of scandal\n\nIn a locked ...,Politics,Entertainment
2070,Humanoid robot learns how to run\n\nCar-maker ...,Tech,Business
1936,Portable PlayStation ready to go\n\nSony's Pla...,Tech,Entertainment


Let's inspect 3 randomly selected misclassified cases.

In [10]:
def output_article(row_article):
    """Print the actual label, predicted label and text of one article.

    Parameters
    ----------
    row_article : mapping-like
        Object supporting lookup of the keys ``'Category'``,
        ``'Category_predicted'`` and ``'Content'`` (for example a row
        selected from ``df_misclassified``).

    Returns
    -------
    None
        The report is written to standard output.

    Raises
    ------
    KeyError
        If one of the three keys is missing from ``row_article``.
    """
    # Each value is wrapped in a one-element tuple: with a bare operand the %
    # operator would unpack a tuple value as several format arguments and fail.
    print('Actual Category: %s' % (row_article['Category'],))
    print('Predicted Category: %s' % (row_article['Category_predicted'],))
    print('-------------------------------------------')
    print('Text: ')
    print('%s' % (row_article['Content'],))

In [18]:
# Seed the global RNG so the same three articles are drawn on every run.
random.seed(6)
# Draw three distinct index labels from the misclassified articles.
list_samples = random.sample(list(df_misclassified.index), 3)
list_samples

[2043, 1936, 767]

In [19]:
# Show the first sampled misclassified article.
output_article(df_misclassified.loc[list_samples[0]])

Actual Category: Tech
Predicted Category: Entertainment
-------------------------------------------
Text: 
Disney backs Sony DVD technology

A next generation DVD technology backed by Sony has received a major boost.

Film giant Disney says it will produce its future DVDs using Sony's Blu-ray Disc technology, but has not ruled out a rival format developed by Toshiba. The two competing DVD formats, Blu-ray developed by Sony and others, and Toshiba's HD-DVD, have been courting top film studios for several months. The next generation of DVDs promise very high quality pictures and sound, as well as a lot of data. Both technologies use a blue laser to write information. It has a shorter wavelength so more data can be stored. Disney is the latest studio to announce which technology it is backing in a format battle which mirrors the 1980s Betamax versus VHS war. Sony lost out to JVC in that fight.

The current battle for Hollywood's hearts and minds is a crucial one because high-definition fi

In [20]:
# Show the second sampled misclassified article.
output_article(df_misclassified.loc[list_samples[1]])

Actual Category: Tech
Predicted Category: Entertainment
-------------------------------------------
Text: 
Portable PlayStation ready to go

Sony's PlayStation Portable (PSP) will go on sale in Japan on 12 December.

The long-awaited handheld game playing gadget will cost about 19,800 yen (145 euros) when it hits the shelves. At launch 21 games will be available for the PSP, including Need for Speed, Ridge Racer, Metal Gear Acid and Vampire Chronicle. Sony has not yet announced when the PSP will be available in Europe and the US, but analysts expect it to debut in those territories in early 2005.

Fifa 2005 is back at the top of the UK games charts, a week after losing it to rival Pro Evolution Soccer 4. Konami's Pro Evo dropped only one place to two, while the only new entry in the top 10 was another football title, LMA Manager 2005, in at number seven. Tony Hawk's Underground 2 held its own at three, while Star Wars Battlefront inched up to four places to four. There was good news fo

In [21]:
# Show the third sampled misclassified article.
output_article(df_misclassified.loc[list_samples[2]])

Actual Category: Entertainment
Predicted Category: Business
-------------------------------------------
Text: 
Row threatens Hendrix museum plan

Proposals to open a museum dedicated to Jimi Hendrix are flailing because of a row over the home of his late father.

The run-down house in Seattle has already been moved wholesale once and local authorities are now demanding it be moved to another site. Hendrix supporters hoped to turn the home into a museum for the guitarist. "The mayor is going to go down as the mayor who destroyed Jimi Hendrix's house," said Ray Rae Marshall of the James Marshall Hendrix Foundation. The foundation moved the building, in which Al Hendrix lived between 1953 and 1956, when the land it was built on was to be developed for housing in 2002. Now the City of Seattle wants its new plot to be used for development, giving a deadline of 22 February for the home to be moved. Mr Goldman said the authority had promised the house could remain on its new site and be turne

We can see that in all three cases the category is not 100% clear, since these articles contain concepts from both categories. Errors like these will always occur, and we do not expect to be 100% accurate on such articles.