Importing all the required libraries, modules and dataset

In [68]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.pipeline import Pipeline

# spacy is our NLP library
import spacy
from spacy.tokenizer import Tokenizer

# importing an NLP model
from sklearn.svm import SVC

# Additional libraries for text processing
from sklearn.feature_extraction.text import TfidfVectorizer

In [44]:
df = pd.read_csv("train.csv")

In [45]:
df.head()

Unnamed: 0,category,text
0,tech,tv future in the hands of viewers with home th...
1,business,worldcom boss left books alone former worldc...
2,sport,tigers wary of farrell gamble leicester say ...
3,sport,yeading face newcastle in fa cup premiership s...
4,entertainment,ocean s twelve raids box office ocean s twelve...


Creating the spaCy tokenizer and text pre-processing function

In [46]:
# Load the spaCy language model
nlp = spacy.load("en_core_web_sm")

# Create a spaCy tokenizer
tokenizer = Tokenizer(nlp.vocab)

In [47]:
def preprocess_text(text):
    doc = tokenizer(text)
    return " ".join([token.text.lower() for token in doc if not token.is_stop])

# Tokenization means breaking a sentence down into tokens or invidual (meaningful) words

In [48]:
df["text"] = df["text"].apply(preprocess_text)

In [49]:
df.loc[:,"text"]

0       tv future hands viewers home theatre systems  ...
1       worldcom boss   left books   worldcom boss ber...
2       tigers wary farrell   gamble   leicester rushe...
3       yeading face newcastle fa cup premiership newc...
4       ocean s raids box office ocean s   crime caper...
                              ...                        
2220    cars pull retail figures retail sales fell 0.3...
2221    kilroy unveils immigration policy ex-chatshow ...
2222    rem announce new glasgow concert band rem anno...
2223    political squabbles snowball s commonplace arg...
2224    souness delight euro progress boss graeme soun...
Name: text, Length: 2225, dtype: object

Splitting the dataset into train and test datasets

In [50]:
X = df["text"]      
y = df["category"]
# X contains the data using which we predict y

In [54]:
# random_state can be any real number such as 0 also
# here train : test size ratio is 3:1 (75% : 25%)
X_train, X_test, y_train, y_test = train_test_split (X, y, test_size = 0.25, random_state = 42)

Feature-Engineering the datasets

In [63]:
tfidf_vectorizer = TfidfVectorizer (max_features = 2000, ngram_range = (1, 2))
# we can play around with these parameters to optimise the model further

# Combine feature extraction and classifier into a pipeline
model = Pipeline([
    ('tfidf', tfidf_vectorizer),
    ('clf', SVC (kernel = 'linear', C = 1, probability = True)) 
])

Fitting the model on the train dataset

In [64]:
model.fit(X_train,y_train)

Predictions on the test dataset and checking its accuracy

In [65]:
# Predict on the test set
y_pred = model.predict(X_test)

# Evaluate the model
print(classification_report(y_test, y_pred))


               precision    recall  f1-score   support

     business       0.97      0.92      0.94       136
entertainment       0.95      0.97      0.96        96
     politics       0.92      0.99      0.96        98
        sport       0.98      0.98      0.98       124
         tech       0.99      0.96      0.98       103

     accuracy                           0.96       557
    macro avg       0.96      0.96      0.96       557
 weighted avg       0.96      0.96      0.96       557



In [69]:
# A high accuracy indicates our model is working well.
# (At the same time we need to beware of overfitting it to specific cases)

Saving the trained model using 'joblib'

In [67]:
from joblib import dump
dump(model, 'trained_model.joblib')

['trained_model.joblib']