In [7]:
import pandas as pd
import os

data_folder = "BBC Dataset"
classes = ["business","entertainment","politics","sport","tech"]
x = []
y = []

for i in classes:
    files = os.listdir(i)
    for text_file in files:
        file_path = i + "/" +text_file
        with open(file_path) as f:
            data = f.readlines()
        data = ' '.join(data)
        x.append(data)
        y.append(i)
   
data = {'content': x, 'class': y}       
df = pd.DataFrame(data)
df.to_csv('bbc_news_data_5_classes.csv', index=False)

In [8]:
df_news = pd.read_csv("bbc_news_data_5_classes.csv")

In [9]:
df_news

Unnamed: 0,content,class
0,Ad sales boost Time Warner profit\n \n Quarter...,business
1,Dollar gains on Greenspan speech\n \n The doll...,business
2,Yukos unit buyer faces loan claim\n \n The own...,business
3,High fuel prices hit BA's profits\n \n British...,business
4,Pernod takeover talk lifts Domecq\n \n Shares ...,business
5,Japan narrowly escapes recession\n \n Japan's ...,business
6,Jobs growth still slow in the US\n \n The US c...,business
7,"India calls for fair trade rules\n \n India, w...",business
8,Ethiopia's crop production up 24%\n \n Ethiopi...,business
9,Court rejects $280bn tobacco case\n \n A US go...,business


In [11]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
import re
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from textblob import Word
import numpy as np
from sklearn.metrics import accuracy_score, confusion_matrix


def clean_str(string):
    """
    Tokenization/string cleaning for datasets.
    Original taken from https://github.com/yoonkim/CNN_sentence/blob/master/process_data.py
    """
    string = re.sub(r"\'s", "", string)
    string = re.sub(r"\'ve", "", string)
    string = re.sub(r"n\'t", "", string)
    string = re.sub(r"\'re", "", string)
    string = re.sub(r"\'d", "", string)
    string = re.sub(r"\'ll", "", string)
    string = re.sub(r",", "", string)
    string = re.sub(r"!", " ! ", string)
    string = re.sub(r"\(", "", string)
    string = re.sub(r"\)", "", string)
    string = re.sub(r"\?", "", string)
    string = re.sub(r"'", "", string)
    string = re.sub(r"[^A-Za-z0-9(),!?\'\`]", " ", string)
    string = re.sub(r"[0-9]\w+|[0-9]","", string)
    string = re.sub(r"\s{2,}", " ", string)
    return string.strip().lower()

data = pd.read_csv("bbc_news_data_5_classes.csv")
x = data['content'].tolist()
y = data['class'].tolist()

for index,value in enumerate(x):
    x[index] = ' '.join([Word(word).lemmatize() for word in clean_str(value).split()])

vect = TfidfVectorizer(stop_words='english',min_df=2)
X = vect.fit_transform(x)
Y = np.array(y)

In [12]:
X.shape

(2225, 14788)

In [13]:
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.20, random_state=42)

In [14]:
print(X_train.shape)
print(X_test.shape)

(1780, 14788)
(445, 14788)


In [15]:
model = RandomForestClassifier(n_estimators=400, max_depth=150,n_jobs=1)
model.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=150, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=400, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [16]:
prediction = model.predict(X_test)

In [17]:
prediction

array(['business', 'business', 'sport', 'business', 'politics', 'sport',
       'sport', 'politics', 'sport', 'politics', 'business',
       'entertainment', 'sport', 'business', 'entertainment', 'business',
       'tech', 'entertainment', 'business', 'entertainment', 'business',
       'sport', 'business', 'entertainment', 'business', 'sport',
       'entertainment', 'sport', 'politics', 'sport', 'sport', 'politics',
       'tech', 'entertainment', 'business', 'business', 'business',
       'sport', 'tech', 'sport', 'tech', 'politics', 'business', 'sport',
       'tech', 'tech', 'tech', 'business', 'business', 'tech',
       'entertainment', 'business', 'politics', 'business',
       'entertainment', 'politics', 'politics', 'politics', 'business',
       'business', 'business', 'business', 'entertainment', 'business',
       'sport', 'entertainment', 'business', 'business', 'politics',
       'sport', 'politics', 'politics', 'tech', 'sport', 'business',
       'sport', 'sport', 'enter

In [18]:
matrix = confusion_matrix(y_test,prediction)

In [19]:
matrix

array([[113,   0,   2,   0,   0],
       [  2,  67,   2,   1,   0],
       [  1,   0,  73,   1,   1],
       [  1,   0,   0, 101,   0],
       [  2,   1,   0,   1,  76]], dtype=int64)

In [23]:
acc = accuracy_score(y_test,prediction)

In [24]:
print("Accuracy-", acc)

Accuracy- 0.9662921348314607
