In [1]:
import re
import pickle
import pandas as pd
import numpy as np
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.model_selection import GridSearchCV

import warnings

warnings.filterwarnings("ignore")

In [2]:
# English stopwords
stop_words = set(stopwords.words('english'))

### Data Collection

In [3]:
# https://www.kaggle.com/yufengdev/bbc-fulltext-and-category?select=bbc-text.csv
bbc_df = pd.read_csv("data/bbc-text.csv")
bbc_df.head(2)

Unnamed: 0,category,text
0,tech,tv future in the hands of viewers with home th...
1,business,worldcom boss left books alone former worldc...


### Data Exploration

In [4]:
# size of the dataset
bbc_df.shape

(2225, 2)

In [5]:
# No of records for all type of category
bbc_df['category'].value_counts()

sport            511
business         510
politics         417
tech             401
entertainment    386
Name: category, dtype: int64

In [6]:
# Check if any null record is present in dataset
bbc_df.isna().sum()

category    0
text        0
dtype: int64

In [7]:
# Check if any duplicate record is present 
bbc_df.duplicated().sum()

99

In [8]:
# Check duplicate records
bbc_df[bbc_df.duplicated()].sort_values('text')

Unnamed: 0,category,text
1755,tech,2d metal slug offers retro fun like some drill...
930,tech,apple attacked over sources row civil libertie...
1586,tech,apple ipod family expands market apple has exp...
1181,tech,apple unveils low-cost mac mini apple has un...
1325,tech,ask jeeves joins web log market ask jeeves has...
...,...,...
1992,tech,us peer-to-peer pirates convicted the first co...
2041,tech,virus poses as christmas e-mail security firms...
1774,tech,warning over tsunami aid website net users are...
956,tech,web radio takes spanish rap global spin the ra...


### Data Engineering

In [9]:
# Remove duplicate records
bbc_unique_df = bbc_df.drop_duplicates()
print(bbc_unique_df.shape)
bbc_unique_df.head(2)

(2126, 2)


Unnamed: 0,category,text
0,tech,tv future in the hands of viewers with home th...
1,business,worldcom boss left books alone former worldc...


In [10]:
# Index correction
bbc_unique_df.reset_index(inplace=True, drop=True)

In [11]:
# Each category value
bbc_unique_df.value_counts("category")

category
sport            504
business         503
politics         403
entertainment    369
tech             347
dtype: int64

In [12]:
bbc_unique_df.head()

Unnamed: 0,category,text
0,tech,tv future in the hands of viewers with home th...
1,business,worldcom boss left books alone former worldc...
2,sport,tigers wary of farrell gamble leicester say ...
3,sport,yeading face newcastle in fa cup premiership s...
4,entertainment,ocean s twelve raids box office ocean s twelve...


In [13]:
# Punctuation removal Using regex
def punctuation_removal(test_str):
    clean_text = re.sub(r'[^\w\s]', ' ', test_str)
    clean_text = [i for i in clean_text.split() if len(i) > 0] # List format dataset
    clean_text = " ".join(clean_text).strip() # String dataset
    
    return  clean_text


# Remove stopwords
def stopwords_removal(sent):
    word_tokens = word_tokenize(sent)  
    filtered_sentence = [w for w in word_tokens if not w in stop_words] # List format
    filtered_sentence = " ".join(filtered_sentence) # String format
    
    return filtered_sentence


In [14]:
# Data Cleaning
all_clean_text = []

for idx in range(len(bbc_unique_df)):
    text = bbc_unique_df.loc[idx]['text']
    clean_text = punctuation_removal(text)
    clean_text = stopwords_removal(clean_text)
    
    all_clean_text.append(clean_text)
    


In [15]:
bbc_unique_df['clean_text'] = all_clean_text
bbc_unique_df.head(2)

Unnamed: 0,category,text,clean_text
0,tech,tv future in the hands of viewers with home th...,tv future hands viewers home theatre systems p...
1,business,worldcom boss left books alone former worldc...,worldcom boss left books alone former worldcom...


### Data Preparation

In [16]:
X = list(bbc_unique_df['clean_text'])
y = list(bbc_unique_df['category'])


In [17]:
#Split data in train and test sets.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25)


### Model Training and Evaluation

In [18]:
#Use pipeline to carry out steps in sequence with a single object
#SVM's rbf kernel gives highest accuracy in this classification problem.
text_clf = Pipeline([('vect', CountVectorizer()), ('tfidf', TfidfTransformer()), ('clf', SVC(kernel='rbf'))])


In [19]:
#train model
text_clf.fit(X_train, y_train)

Pipeline(steps=[('vect', CountVectorizer()), ('tfidf', TfidfTransformer()),
                ('clf', SVC())])

In [100]:
# save the model to disk
filename = 'model/svm_model.sav'
pickle.dump(text_clf, open(filename, 'wb'))


In [101]:
# load the model from disk
filename = 'model/svm_model.sav'
loaded_model = pickle.load(open(filename, 'rb'))


In [21]:
# Confusion Matrix
y_pred = text_clf.predict(X_test)
confusion_matrix(y_test, y_pred)

array([[124,   1,   1,   0,   0],
       [  0,  88,   1,   0,   0],
       [  3,   0,  92,   0,   0],
       [  1,   0,   0, 117,   0],
       [  2,   2,   0,   0, 100]], dtype=int64)

In [23]:
124+88+92+117+100

521

In [137]:
# Classification Matrix
print(f1_score(y_test, y_pred, average='macro'))
print(f1_score(y_test, y_pred, average='weighted'))
print(f1_score(y_test, y_pred, average='macro'))

0.9890301388022038
0.9887333275066503
0.9890301388022038


In [139]:
print(classification_report(y_test, y_pred))

               precision    recall  f1-score   support

     business       0.98      0.98      0.98       126
entertainment       1.00      0.99      0.99        93
     politics       0.98      0.98      0.98       101
        sport       0.99      1.00      1.00       128
         tech       1.00      0.99      0.99        84

     accuracy                           0.99       532
    macro avg       0.99      0.99      0.99       532
 weighted avg       0.99      0.99      0.99       532



In [140]:
accuracy_score(y_test, y_pred)

0.9887218045112782

### Model Hyperparameter Tuning

In [112]:
cv_tfidf = Pipeline([('count', CountVectorizer()), ('tfid', TfidfTransformer())])


In [115]:
X_tfidf = cv_tfidf.fit_transform(all_clean_text).toarray()
X_tfidf.shape

(2126, 29279)

In [121]:
y = np.array(bbc_unique_df['category'])
y.shape

(2126,)

In [122]:
X_train, X_test, y_train, y_test = train_test_split(X_tfidf, y, test_size=0.25)

In [124]:
# defining parameter range
param_grid = {'C': [0.1, 1, 10, 100],
            'gamma': [1, 0.1, 0.01, 0.001],
            'kernel': ['rbf', 'linear']}

grid = GridSearchCV(SVC(), param_grid, refit = True, verbose = 3)

# fitting the model for grid search
grid.fit(X_train, y_train)


Fitting 5 folds for each of 32 candidates, totalling 160 fits
[CV] C=0.1, gamma=1, kernel=rbf ......................................


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV] .......... C=0.1, gamma=1, kernel=rbf, score=0.241, total= 1.5min
[CV] C=0.1, gamma=1, kernel=rbf ......................................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:  1.5min remaining:    0.0s


[CV] .......... C=0.1, gamma=1, kernel=rbf, score=0.238, total= 1.5min
[CV] C=0.1, gamma=1, kernel=rbf ......................................


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:  2.9min remaining:    0.0s


[CV] .......... C=0.1, gamma=1, kernel=rbf, score=0.238, total= 1.5min
[CV] C=0.1, gamma=1, kernel=rbf ......................................
[CV] .......... C=0.1, gamma=1, kernel=rbf, score=0.238, total= 1.5min
[CV] C=0.1, gamma=1, kernel=rbf ......................................
[CV] .......... C=0.1, gamma=1, kernel=rbf, score=0.239, total= 1.5min
[CV] C=0.1, gamma=1, kernel=linear ...................................
[CV] ....... C=0.1, gamma=1, kernel=linear, score=0.765, total= 1.4min
[CV] C=0.1, gamma=1, kernel=linear ...................................
[CV] ....... C=0.1, gamma=1, kernel=linear, score=0.768, total= 1.4min
[CV] C=0.1, gamma=1, kernel=linear ...................................
[CV] ....... C=0.1, gamma=1, kernel=linear, score=0.715, total= 1.4min
[CV] C=0.1, gamma=1, kernel=linear ...................................
[CV] ....... C=0.1, gamma=1, kernel=linear, score=0.771, total= 1.4min
[CV] C=0.1, gamma=1, kernel=linear ...................................
[CV] .

[CV] ......... C=1, gamma=0.01, kernel=rbf, score=0.238, total= 1.4min
[CV] C=1, gamma=0.01, kernel=rbf .....................................
[CV] ......... C=1, gamma=0.01, kernel=rbf, score=0.238, total= 1.4min
[CV] C=1, gamma=0.01, kernel=rbf .....................................
[CV] ......... C=1, gamma=0.01, kernel=rbf, score=0.238, total= 1.4min
[CV] C=1, gamma=0.01, kernel=rbf .....................................
[CV] ......... C=1, gamma=0.01, kernel=rbf, score=0.238, total= 1.4min
[CV] C=1, gamma=0.01, kernel=rbf .....................................
[CV] ......... C=1, gamma=0.01, kernel=rbf, score=0.236, total= 1.4min
[CV] C=1, gamma=0.01, kernel=linear ..................................
[CV] ...... C=1, gamma=0.01, kernel=linear, score=0.966, total= 1.3min
[CV] C=1, gamma=0.01, kernel=linear ..................................
[CV] ...... C=1, gamma=0.01, kernel=linear, score=0.981, total= 1.3min
[CV] C=1, gamma=0.01, kernel=linear ..................................
[CV] .

[CV] .... C=10, gamma=0.001, kernel=linear, score=0.981, total= 1.3min
[CV] C=10, gamma=0.001, kernel=linear ................................
[CV] .... C=10, gamma=0.001, kernel=linear, score=0.965, total= 1.3min
[CV] C=100, gamma=1, kernel=rbf ......................................
[CV] .......... C=100, gamma=1, kernel=rbf, score=0.966, total= 1.5min
[CV] C=100, gamma=1, kernel=rbf ......................................
[CV] .......... C=100, gamma=1, kernel=rbf, score=0.978, total= 1.5min
[CV] C=100, gamma=1, kernel=rbf ......................................
[CV] .......... C=100, gamma=1, kernel=rbf, score=0.966, total= 1.5min
[CV] C=100, gamma=1, kernel=rbf ......................................
[CV] .......... C=100, gamma=1, kernel=rbf, score=0.981, total= 1.5min
[CV] C=100, gamma=1, kernel=rbf ......................................
[CV] .......... C=100, gamma=1, kernel=rbf, score=0.959, total= 1.5min
[CV] C=100, gamma=1, kernel=linear ...................................
[CV] .

[Parallel(n_jobs=1)]: Done 160 out of 160 | elapsed: 219.7min finished


GridSearchCV(estimator=SVC(),
             param_grid={'C': [0.1, 1, 10, 100], 'gamma': [1, 0.1, 0.01, 0.001],
                         'kernel': ['rbf', 'linear']},
             verbose=3)

In [125]:
# print best parameter after tuning
print(grid.best_params_)
  
# print how our model looks after hyper-parameter tuning
print(grid.best_estimator_)

{'C': 1, 'gamma': 1, 'kernel': 'linear'}
SVC(C=1, gamma=1, kernel='linear')


In [142]:
final_text_clf = Pipeline([('vect', CountVectorizer()), 
                           ('tfidf', TfidfTransformer()), 
                           ('clf', SVC(kernel='linear', C=1, gamma=1))])

final_text_clf.fit(X_train, y_train)

Pipeline(steps=[('vect', CountVectorizer()), ('tfidf', TfidfTransformer()),
                ('clf', SVC(C=1, gamma=1, kernel='linear'))])

In [143]:
# Confusion Matrix
y_pred = final_text_clf.predict(X_test)
confusion_matrix(y_test, y_pred)

array([[120,   0,   4,   0,   2],
       [  1,  91,   1,   0,   0],
       [  2,   0,  99,   0,   0],
       [  0,   0,   0, 128,   0],
       [  1,   0,   0,   1,  82]], dtype=int64)

In [144]:
# Classification Matrix
print(f1_score(y_test, y_pred, average='macro'))
print(f1_score(y_test, y_pred, average='weighted'))
print(f1_score(y_test, y_pred, average='macro'))

0.9774567037852024
0.9774479238200555
0.9774567037852024


In [145]:
print(classification_report(y_test, y_pred))

               precision    recall  f1-score   support

     business       0.97      0.95      0.96       126
entertainment       1.00      0.98      0.99        93
     politics       0.95      0.98      0.97       101
        sport       0.99      1.00      1.00       128
         tech       0.98      0.98      0.98        84

     accuracy                           0.98       532
    macro avg       0.98      0.98      0.98       532
 weighted avg       0.98      0.98      0.98       532



In [146]:
accuracy_score(y_test, y_pred)

0.9774436090225563

In [147]:
# Final save the model to disk
filename = 'model/fina_svm_model.sav'
pickle.dump(text_clf, open(filename, 'wb'))


#### Final model accuracy is 97.74%