In this notebook, we're going to train model for classifying BBC article category. [The dataset was downloaded from this page](https://www.kaggle.com/c/learn-ai-bbc/overview).
Some of the code in this notebook were taken from kaggle notebook.

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import csv
from keras.utils.np_utils import to_categorical
from keras.preprocessing.text import text_to_word_sequence
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet

In [11]:
# for nlp task
import nltk
nltk.download('wordnet')
nltk.download('stopwords')

from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Roisyah\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Roisyah\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [3]:
# read dataset

with open("./dataset/BBC News Train.csv", 'r') as csvfile:
    print(f"First line (header) looks like this:\n\n{csvfile.readline()}")
    print(f"Each data point looks like this:\n\n{csvfile.readline()}")  

First line (header) looks like this:

ArticleId,Text,Category

Each data point looks like this:




In [4]:
df_train = pd.read_csv('dataset/BBC News Train.csv')
df_train.head()

Unnamed: 0,ArticleId,Text,Category
0,1833,worldcom ex-boss launches defence lawyers defe...,business
1,154,german business confidence slides german busin...,business
2,1101,bbc poll indicates economic gloom citizens in ...,business
3,1976,lifestyle governs mobile choice faster bett...,tech
4,917,enron bosses in $168m payout eighteen former e...,business


In [5]:
df_train['Category'].unique()

array(['business', 'tech', 'politics', 'sport', 'entertainment'],
      dtype=object)

In [6]:
df_train['Category'].value_counts()

sport            346
business         336
politics         274
entertainment    273
tech             261
Name: Category, dtype: int64

In [5]:
# encode the Category
df_train.loc[df_train['Category'] == 'sport', 'Label'] = 0
df_train.loc[df_train['Category'] == 'business', 'Label'] = 1
df_train.loc[df_train['Category'] == 'politics', 'Label'] = 2
df_train.loc[df_train['Category'] == 'entertainment', 'Label'] = 3
df_train.loc[df_train['Category'] == 'tech', 'Label'] = 4

labels = to_categorical(df_train['Label'], num_classes=5)
df_train['Label'] = df_train['Label'].astype(int)
print(df_train['Label'][:10])

0    1
1    1
2    1
3    4
4    1
5    2
6    0
7    3
8    1
9    3
Name: Label, dtype: int32


In [6]:
df_train.head()

Unnamed: 0,ArticleId,Text,Category,Label
0,1833,worldcom ex-boss launches defence lawyers defe...,business,1
1,154,german business confidence slides german busin...,business,1
2,1101,bbc poll indicates economic gloom citizens in ...,business,1
3,1976,lifestyle governs mobile choice faster bett...,tech,4
4,917,enron bosses in $168m payout eighteen former e...,business,1


In [7]:
df_test = pd.read_csv('dataset/BBC News Test.csv')
df_test.head()

Unnamed: 0,ArticleId,Text
0,1018,qpr keeper day heads for preston queens park r...
1,1319,software watching while you work software that...
2,1138,d arcy injury adds to ireland woe gordon d arc...
3,459,india s reliance family feud heats up the ongo...
4,1020,boro suffer morrison injury blow middlesbrough...


# TEXT PREPROCESSING

1. Tokenizing the text
2. Coverting the text to lowercase
3. Lemmatization (Stemming) of the text
4. Removing punctuation from the text [SKIP FOR NOW]
5. Removing stopwords from the text

In [None]:
punc = '!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n'

In [12]:
tokenizer = nltk.RegexpTokenizer(r"\w+")
stopwords = stopwords.words('english')

In [13]:
def message_to_token_list(s):
    
    tokens = tokenizer.tokenize(s)
    lowercased_tokens = [t.lower() for t in tokens]
    lemmatizer = WordNetLemmatizer()
    lemmatized_tokens = [lemmatizer.lemmatize(t) for t in lowercased_tokens]
    useful_tokens = [t for t in lemmatized_tokens if t not in stopwords]
    
    return useful_tokens

In [14]:
# test above function
txt = "last week  mci agreed to a buyout by verizon communications in a deal valued at $6.75bn."
message_to_token_list(txt)

['last',
 'week',
 'mci',
 'agreed',
 'buyout',
 'verizon',
 'communication',
 'deal',
 'valued',
 '6',
 '75bn']

In [15]:
df_train['Text Processed'] = df_train['Text'].apply(message_to_token_list) # apply above function to our dataset
df_train.head()

Unnamed: 0,ArticleId,Text,Category,Label,Text Processed
0,1833,worldcom ex-boss launches defence lawyers defe...,business,1,"[worldcom, ex, bos, launch, defence, lawyer, d..."
1,154,german business confidence slides german busin...,business,1,"[german, business, confidence, slide, german, ..."
2,1101,bbc poll indicates economic gloom citizens in ...,business,1,"[bbc, poll, indicates, economic, gloom, citize..."
3,1976,lifestyle governs mobile choice faster bett...,tech,4,"[lifestyle, governs, mobile, choice, faster, b..."
4,917,enron bosses in $168m payout eighteen former e...,business,1,"[enron, boss, 168m, payout, eighteen, former, ..."


In [17]:
 #join words into sentence
df_final = df_train.copy()
df_final['Text Processed Join'] = df_final['Text Processed'].str.join(' ')
df_final = df_final.drop(columns=['ArticleId', 'Text', 'Category', 'Text Processed'], axis=1)
df_final.head()

Unnamed: 0,Label,Text Processed Join
0,1,worldcom ex bos launch defence lawyer defendin...
1,1,german business confidence slide german busine...
2,1,bbc poll indicates economic gloom citizen majo...
3,4,lifestyle governs mobile choice faster better ...
4,1,enron boss 168m payout eighteen former enron d...


In [18]:
texts = df_final.iloc[:,1]
labels = df_final.iloc[:,0]

training_portion = .8
train_size = int(len(df_final['Text Processed Join']) * training_portion)

# split df_final into train and validation set
train_texts = texts[:train_size]
train_labels = labels[:train_size]
val_texts = texts[train_size:]
val_labels = labels[train_size:]

print(train_texts.shape, train_labels.shape, val_texts.shape, val_labels.shape)

(1192,) (1192,) (298,) (298,)


# Modeling

In [19]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

### Naive Bayes

In [46]:
nb = Pipeline([('tfidf', TfidfVectorizer()),
               ('clf', MultinomialNB()),
              ])

nb.fit(train_texts, train_labels)

valid_predict_nb = nb.predict(val_texts)

train_accuracy = nb.score(train_texts, train_labels)*100
valid_accuracy_nb = accuracy_score(valid_predict_nb, val_labels)*100

print("Naive Bayes Train Accuracy Score : {}% ".format(train_accuracy))
print("Naive Bayes Validation Accuracy Score  : {}% ".format(valid_accuracy_nb))
print()
print(classification_report(valid_predict_nb, val_labels))

Naive Bayes Train Accuracy Score : 99.16107382550335% 
Naive Bayes Validation Accuracy Score  : 96.64429530201343% 

              precision    recall  f1-score   support

           0       1.00      0.98      0.99        61
           1       0.97      0.97      0.97        68
           2       0.93      0.95      0.94        58
           3       0.94      1.00      0.97        60
           4       1.00      0.92      0.96        51

    accuracy                           0.97       298
   macro avg       0.97      0.96      0.97       298
weighted avg       0.97      0.97      0.97       298



### Decision Tree

In [48]:
dt = Pipeline([('tfidf', TfidfVectorizer()),
                ('dt', DecisionTreeClassifier()),
               ])

dt.fit(train_texts, train_labels)

valid_predict_dt = dt.predict(val_texts)

train_accuracy = nb.score(train_texts, train_labels)*100
valid_accuracy_dt = accuracy_score(valid_predict_dt, val_labels)*100

print("Decision Tree Train Accuracy Score : {}% ".format(train_accuracy))
print("Decision Tree Validation Accuracy Score  : {}% ".format(valid_accuracy_dt))
print()
print(classification_report(valid_predict_dt, val_labels))

Decision Tree Train Accuracy Score : 99.16107382550335% 
Decision Tree Validation Accuracy Score  : 81.54362416107382% 

              precision    recall  f1-score   support

           0       0.87      0.87      0.87        60
           1       0.79      0.82      0.81        66
           2       0.81      0.76      0.79        63
           3       0.80      0.82      0.81        62
           4       0.81      0.81      0.81        47

    accuracy                           0.82       298
   macro avg       0.82      0.82      0.82       298
weighted avg       0.82      0.82      0.82       298



### Random Forest Classifier

In [49]:
rfc = Pipeline([('tfidf', TfidfVectorizer()),
                ('rfc', RandomForestClassifier(n_estimators=100)),
               ])

rfc.fit(train_texts, train_labels)

valid_predict_rfc = rfc.predict(val_texts)

train_accuracy = nb.score(train_texts, train_labels)*100
valid_accuracy_rfc = accuracy_score(valid_predict_rfc, val_labels)*100

print("Decision Tree Train Accuracy Score : {}% ".format(train_accuracy))
print("Decision Tree Validation Accuracy Score  : {}% ".format(valid_accuracy_rfc))
print()
print(classification_report(valid_predict_rfc, val_labels))

Decision Tree Train Accuracy Score : 99.16107382550335% 
Decision Tree Validation Accuracy Score  : 95.9731543624161% 

              precision    recall  f1-score   support

           0       0.98      0.97      0.98        61
           1       0.96      0.93      0.94        70
           2       0.95      0.93      0.94        60
           3       0.92      1.00      0.96        59
           4       1.00      0.98      0.99        48

    accuracy                           0.96       298
   macro avg       0.96      0.96      0.96       298
weighted avg       0.96      0.96      0.96       298



## Model Prediction

In [24]:
# preprocess my text df
df_test['Text Processed'] = df_test['Text'].apply(message_to_token_list)
df_test['Text Processed Join'] = df_test['Text Processed'].str.join(' ')
df_test.head()

Unnamed: 0,ArticleId,Text,Text Processed,Text Processed Join
0,1018,qpr keeper day heads for preston queens park r...,"[qpr, keeper, day, head, preston, queen, park,...",qpr keeper day head preston queen park ranger ...
1,1319,software watching while you work software that...,"[software, watching, work, software, monitor, ...",software watching work software monitor every ...
2,1138,d arcy injury adds to ireland woe gordon d arc...,"[arcy, injury, add, ireland, woe, gordon, arcy...",arcy injury add ireland woe gordon arcy ha rul...
3,459,india s reliance family feud heats up the ongo...,"[india, reliance, family, feud, heat, ongoing,...",india reliance family feud heat ongoing public...
4,1020,boro suffer morrison injury blow middlesbrough...,"[boro, suffer, morrison, injury, blow, middles...",boro suffer morrison injury blow middlesbrough...


In [25]:
test_texts = df_test.iloc[:,3] 
test_predict = nb.predict(test_texts)
test_predict

array([0, 4, 0, 1, 0, 0, 2, 2, 3, 1, 1, 4, 2, 4, 3, 0, 2, 4, 3, 2, 1, 2,
       0, 1, 2, 0, 1, 0, 0, 1, 2, 4, 1, 1, 0, 0, 0, 1, 3, 1, 4, 2, 3, 4,
       0, 4, 3, 1, 2, 1, 2, 1, 1, 1, 4, 2, 4, 3, 0, 4, 0, 3, 4, 2, 1, 3,
       0, 4, 0, 0, 4, 0, 1, 2, 4, 0, 4, 4, 4, 3, 2, 0, 3, 3, 1, 3, 1, 3,
       1, 4, 1, 2, 0, 4, 0, 0, 0, 0, 0, 0, 2, 0, 2, 3, 1, 0, 2, 0, 2, 3,
       0, 1, 3, 0, 2, 0, 2, 0, 2, 1, 3, 1, 3, 3, 4, 0, 1, 3, 1, 3, 1, 2,
       2, 4, 1, 1, 2, 4, 3, 0, 1, 4, 0, 3, 2, 0, 0, 3, 3, 4, 1, 4, 2, 3,
       0, 0, 0, 0, 3, 4, 1, 4, 1, 4, 1, 4, 3, 4, 4, 2, 1, 2, 1, 1, 3, 2,
       4, 1, 1, 4, 0, 2, 0, 2, 4, 4, 2, 1, 2, 4, 2, 1, 3, 0, 4, 4, 1, 4,
       2, 1, 0, 2, 1, 3, 1, 1, 0, 4, 1, 0, 3, 3, 0, 3, 0, 4, 2, 3, 0, 3,
       0, 3, 2, 1, 4, 3, 1, 2, 1, 4, 1, 0, 2, 2, 2, 2, 0, 1, 1, 2, 0, 2,
       1, 0, 4, 1, 2, 1, 2, 1, 1, 0, 4, 2, 3, 4, 3, 4, 0, 0, 4, 0, 0, 0,
       3, 0, 2, 4, 1, 0, 1, 0, 1, 0, 3, 1, 1, 3, 2, 1, 0, 0, 4, 0, 0, 3,
       1, 0, 4, 2, 3, 1, 1, 2, 0, 3, 2, 1, 0, 0, 4,

In [27]:
df_test['Label'] = pd.DataFrame(test_predict)
df_test.head()

Unnamed: 0,ArticleId,Text,Text Processed,Text Processed Join,Label
0,1018,qpr keeper day heads for preston queens park r...,"[qpr, keeper, day, head, preston, queen, park,...",qpr keeper day head preston queen park ranger ...,0
1,1319,software watching while you work software that...,"[software, watching, work, software, monitor, ...",software watching work software monitor every ...,4
2,1138,d arcy injury adds to ireland woe gordon d arc...,"[arcy, injury, add, ireland, woe, gordon, arcy...",arcy injury add ireland woe gordon arcy ha rul...,0
3,459,india s reliance family feud heats up the ongo...,"[india, reliance, family, feud, heat, ongoing,...",india reliance family feud heat ongoing public...,1
4,1020,boro suffer morrison injury blow middlesbrough...,"[boro, suffer, morrison, injury, blow, middles...",boro suffer morrison injury blow middlesbrough...,0


# Submission

In [40]:
texts_id = df_test.iloc[:,0]
test_predict_num = df_test.iloc[:,4]

submission_num = pd.DataFrame(list(zip(texts_id, test_predict_num)),
               columns =['ArticleId', 'Label'])
submission_num.to_csv('submission_num.csv', index=False)
submission_num.head(20)

Unnamed: 0,ArticleId,Label
0,1018,0
1,1319,4
2,1138,0
3,459,1
4,1020,0
5,51,0
6,2025,2
7,1479,2
8,27,3
9,397,1


In [36]:
def label_to_category(label):
    if label == 0:
        return 'sport'
    elif label == 1:
        return 'business'
    elif label == 2:
        return 'politics'
    elif label == 3:
        return 'entertainment'
    else:
        return 'tech'

In [37]:
df_test['Category'] = df_test['Label'].apply(label_to_category)
print(df_test['Category'][:10])

0            sport
1             tech
2            sport
3         business
4            sport
5            sport
6         politics
7         politics
8    entertainment
9         business
Name: Category, dtype: object


In [38]:
df_test.head()

Unnamed: 0,ArticleId,Text,Text Processed,Text Processed Join,Label,Category
0,1018,qpr keeper day heads for preston queens park r...,"[qpr, keeper, day, head, preston, queen, park,...",qpr keeper day head preston queen park ranger ...,0,sport
1,1319,software watching while you work software that...,"[software, watching, work, software, monitor, ...",software watching work software monitor every ...,4,tech
2,1138,d arcy injury adds to ireland woe gordon d arc...,"[arcy, injury, add, ireland, woe, gordon, arcy...",arcy injury add ireland woe gordon arcy ha rul...,0,sport
3,459,india s reliance family feud heats up the ongo...,"[india, reliance, family, feud, heat, ongoing,...",india reliance family feud heat ongoing public...,1,business
4,1020,boro suffer morrison injury blow middlesbrough...,"[boro, suffer, morrison, injury, blow, middles...",boro suffer morrison injury blow middlesbrough...,0,sport


In [41]:
texts_id = df_test.iloc[:,0]
test_predict_cat = df_test.iloc[:,5]

submission_cat = pd.DataFrame(list(zip(texts_id, test_predict_cat)),
               columns =['ArticleId', 'Category'])
submission_cat.to_csv('submission_cat.csv', index=False)
submission_cat.head(20)

Unnamed: 0,ArticleId,Category
0,1018,sport
1,1319,tech
2,1138,sport
3,459,business
4,1020,sport
5,51,sport
6,2025,politics
7,1479,politics
8,27,entertainment
9,397,business
