## Data Fetching

The dataset used to train this model was retrieved from http://mlg.ucd.ie/datasets/bbc.html

In [8]:
import zipfile

# Define the path to the zip file
zip_file_path = 'data.zip'

# Define the directory where you want to extract the files
extracted_dir = ''

# Open the zip file
with zipfile.ZipFile(zip_file_path, 'r') as zip_ref:
    # Extract all the contents to the specified directory
    zip_ref.extractall(extracted_dir)

print(f'Files extracted')

Files extracted


In [9]:
import os
import pandas as pd

cat_article = []
for subdir, dirs, files in os.walk('data'):
  for file in files:
    if '.txt' in file:
      category = subdir.split('/')[-1]
      f = open(os.path.join(subdir, file),'r', encoding='utf-8', errors='ignore')
      lines = f.readlines()
      lines = ' '.join(lines).replace('\n','')
      #list of lists: [category,article]
      cat_article.append([category,lines])
      f.close()

data = pd.DataFrame(cat_article)
data.columns = ['category','article']

In [10]:
articles_df = data
articles_df.head()

Unnamed: 0,category,article
0,sport,Edwards tips Idowu for Euro gold World outdoo...
1,sport,McCall earns Tannadice reprieve Dundee United...
2,sport,Pearce keen on succeeding Keegan Joint assist...
3,sport,Woodward eyes Brennan for Lions Toulouse's fo...
4,sport,Pountney handed ban and fine Northampton coac...


In [36]:
articles_df['category'].unique()

array(['sport', 'entertainment', 'tech', 'business', 'politics'],
      dtype=object)

In [5]:
articles_df.isnull().sum()

category    0
article     0
dtype: int64

In [6]:
articles_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2225 entries, 0 to 2224
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   category  2225 non-null   object
 1   article   2225 non-null   object
dtypes: object(2)
memory usage: 34.9+ KB


In [7]:
articles_df.category.value_counts()

sport            511
business         510
politics         417
tech             401
entertainment    386
Name: category, dtype: int64

## Text processing

We declare this tokenizer to perform:

* Tokenize
* Lemmatize each token  
* Lower case each token
* Remove stop words
* Remove punctiations



In [12]:
import string
import spacy
nlp = spacy.load('en_core_web_sm')

def spacy_tokenizer(doc):
        """Function that serves as tokenizer in our pipeline
        Loads the 'en_core_web_sm' model, tokenize the string and perform pre processing.
        Preprocessing includes lemmatizing tokens as well as removing stop words and punctuations.
        Args:
            doc(str): sentence to tokenize.
        Returns:
            list: preprocessed tokens.
        """

        punctuations = string.punctuation

        stop_words = spacy.lang.en.stop_words.STOP_WORDS
        tokens = nlp(doc)#Tokenization

        # Lemmatizing each token and converting each token into lowercase
        tokens = [word.lemma_.lower() for word in tokens if not word.is_space]
        # Removing stop words and punctuations
        tokens = [ word for word in tokens if word not in stop_words and word not in punctuations ]
        # return preprocessed list of tokens
        return tokens

## Feature Extraction

Perform TFIDF together with the previous tokenizer

In [34]:
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer(tokenizer=spacy_tokenizer,min_df=3)

We split the data and then apply TFIDF

In [38]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(articles_df['article'], articles_df['category'], test_size = 0.2, random_state = 0, shuffle = True)
print(len(x_train))
print(len(x_test))

1780
445


In [46]:
# Tf-Idf transformation
xtrain_tfidf = vectorizer.fit_transform(x_train)
xtest_tfidf = vectorizer.transform(x_test)



##Train Model

In [47]:
perform_list = []

In [59]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.multiclass import OneVsRestClassifier

from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_recall_fscore_support as score

def run_model(model_name, est_c, est_pnlty):
    mdl = ""
    if model_name == 'Logistic Regression':
        mdl = LogisticRegression()
    elif model_name == 'Random Forest':
        mdl = RandomForestClassifier(n_estimators=100 ,criterion='entropy', random_state=0)
    elif model_name == 'Support Vector Classifer':
        mdl = SVC()
    elif model_name == 'Gaussian Naive Bayes':
        mdl = GaussianNB()
    oneVsRest = OneVsRestClassifier(mdl)
    oneVsRest.fit(xtrain_tfidf.toarray(), y_train)
    y_pred = oneVsRest.predict(xtest_tfidf.toarray())
    # Performance metrics
    accuracy = round(accuracy_score(y_test, y_pred) * 100, 2)
    # Get precision, recall, f1 scores
    precision, recall, f1score, support = score(y_test, y_pred, average='micro')
    print(f'Test Accuracy Score of Basic {model_name}: % {accuracy}')
    print(f'Precision : {precision}')
    print(f'Recall : {recall}')
    print(f'F1-score : {f1score}')
    # Add performance parameters to list
    perform_list.append(dict([('Model', model_name),('Test Accuracy', round(accuracy, 2)),('Precision', round(precision, 2)),('Recall', round(recall, 2)),('F1', round(f1score, 2))]))

In [60]:
run_model('Logistic Regression', est_c=None, est_pnlty=None)

Test Accuracy Score of Basic Logistic Regression: % 96.63
Precision : 0.9662921348314607
Recall : 0.9662921348314607
F1-score : 0.9662921348314607


In [62]:
run_model('Random Forest', est_c=None, est_pnlty=None)

Test Accuracy Score of Basic Random Forest: % 96.18
Precision : 0.9617977528089887
Recall : 0.9617977528089887
F1-score : 0.9617977528089887


In [63]:
run_model('Support Vector Classifer', est_c=None, est_pnlty=None)

Test Accuracy Score of Basic Support Vector Classifer: % 96.63
Precision : 0.9662921348314607
Recall : 0.9662921348314607
F1-score : 0.9662921348314607


In [61]:
run_model('Gaussian Naive Bayes', est_c=None, est_pnlty=None)

Test Accuracy Score of Basic Gaussian Naive Bayes: % 77.98
Precision : 0.7797752808988764
Recall : 0.7797752808988764
F1-score : 0.7797752808988764


In [64]:
model_performance = pd.DataFrame(data=perform_list)
model_performance = model_performance[['Model', 'Test Accuracy', 'Precision', 'Recall', 'F1']]
model_performance

Unnamed: 0,Model,Test Accuracy,Precision,Recall,F1
0,Logistic Regression,96.63,0.97,0.97,0.97
1,Random Forest,96.18,0.96,0.96,0.96
2,Support Vector Classifer,96.63,0.97,0.97,0.97
3,Logistic Regression,96.63,0.97,0.97,0.97
4,Gaussian Naive Bayes,77.98,0.78,0.78,0.78
5,Random Forest,96.18,0.96,0.96,0.96
6,Support Vector Classifer,96.63,0.97,0.97,0.97


## Pipeline Creation

Although Logistic Regression and SVC both had higher accuracy, we are going to use random forest classifier because of the better interpretability. We want to get the feature importances.

In [13]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier

text_clf = Pipeline([('tfidf', TfidfVectorizer(tokenizer=spacy_tokenizer,min_df=3)),\
                     ('clf', RandomForestClassifier())])

In [14]:
text_clf.fit(articles_df['article'], articles_df['category'])



In [15]:
from pickle import dump, load
model_file = "rm_tfidf.pkl"
dump(text_clf, open(model_file, 'wb'))

## Load and Predict

In [67]:
import joblib

model = joblib.load("rm_tfidf.pkl")

In [None]:
model.steps[0][1].vocabulary_.get()

In [None]:
with open('test_business.txt', 'r') as file:
    # Read the entire content of the file
    content = file.read()
    print(content)

In [76]:
print(model.predict([content]))
print(model.predict_proba([content]))

['business']
[[0.45 0.22 0.18 0.09 0.06]]


In [77]:
tokens = spacy_tokenizer(content)
arr = []
for token in tokens:
  #get the index of the token in the model's vocabulary
  idx = model.steps[0][1].vocabulary_.get(token)
  if idx is not None:#Some tokens doesnt appear in the corpus.
    importance = model.steps[1][1].feature_importances_[idx]
    arr.append({'TOKEN':token, 'Importance':importance})

  imp_df = pd.DataFrame(arr)
  top_imp_df = imp_df.groupby(['TOKEN','Importance'], as_index = False).count().sort_values(by = 'Importance',ascending = False).set_index('TOKEN').head(10)

In [78]:
top_imp_df

Unnamed: 0_level_0,Importance
TOKEN,Unnamed: 1_level_1
mr,0.011477
bank,0.005032
include,0.003598
state,0.002042
year,0.001432
work,0.001401
executive,0.00112
bbc,0.001039
report,0.000914
boss,0.000801
