In [8]:
import spacy
nlp = spacy.load('en_core_web_sm', disable=['ner'])
nlp.remove_pipe('tagger')
nlp.remove_pipe('parser')

('parser', <spacy.pipeline.pipes.DependencyParser at 0x7fb85ed44a68>)

In [2]:
import csv
import numpy as np
import pandas as pd

import warnings
warnings.filterwarnings('ignore')

In [3]:
data = pd.read_csv("drive/My Drive/BBC News.csv")

In [4]:
data.head()

Unnamed: 0,ArticleId,Text,Category
0,1833,worldcom ex-boss launches defence lawyers defe...,business
1,154,german business confidence slides german busin...,business
2,1101,bbc poll indicates economic gloom citizens in ...,business
3,1976,lifestyle governs mobile choice faster bett...,tech
4,917,enron bosses in $168m payout eighteen former e...,business


In [5]:
data = data.sample(frac=1) #avoiding possible bias
split = int(len(data) * 0.8)
tmp_train = data.iloc[:split,:]
test_data = data.iloc[split:,:]

# 2. Splitting train data into a train/validation split that's 80% train, 20% validation 
validation_split = int(split * 0.8)
train_data = tmp_train.iloc[:validation_split,:]
validation_data = tmp_train.iloc[validation_split:,:]

In [6]:
print (len(train_data))
print (len(validation_data))
print (len(test_data))

953
239
298


In [12]:
train_data_X = train_data["Text"]
train_data_Y = train_data["Category"]

validation_data_X = validation_data["Text"]
validation_data_Y = validation_data["Category"]

test_data_X = test_data["Text"]
test_data_Y = test_data["Category"]

In [13]:
#Tokenize
def spacy_tokenize(string):
  tokens = list()
  doc = nlp(string)
  for token in doc:
    tokens.append(token)
  return tokens

#Normalize
def normalize(tokens):
  normalized_tokens = list()
  for token in tokens:
    normalized = token.text.lower().strip()
    if ((token.is_alpha or token.is_digit)):
      normalized_tokens.append(normalized)
  return normalized_tokens
  return normalized_tokens

#Tokenize and normalize
def tokenize_normalize(string):
  return normalize(spacy_tokenize(string))

In [14]:
from sklearn.feature_extraction.text import CountVectorizer #One hot encoding
from sklearn.feature_extraction.text import TfidfVectorizer #Bag of words

In [16]:
one_hot_vectorizer = CountVectorizer(tokenizer = tokenize_normalize, binary = True)
one_hot_vectorizer.fit(train_data_X)
train_features_one_hot = one_hot_vectorizer.transform(train_data_X)
validation_features_one_hot = one_hot_vectorizer.transform(validation_data_X)
test_features_one_hot = one_hot_vectorizer.transform(test_data_X)

In [17]:
tfidf_vectorizer = TfidfVectorizer(tokenizer = tokenize_normalize)
tfidf_vectorizer.fit(train_data_X)
train_features_tfidf = tfidf_vectorizer.transform(train_data_X)
validation_features_tfidf = tfidf_vectorizer.transform(validation_data_X)
test_features_tfidf = tfidf_vectorizer.transform(test_data_X)

**Trying different models to select the best one**

In [15]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import FeatureUnion
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report

In [18]:
logreg_onehot = LogisticRegression()
logreg_onehot.fit(train_features_one_hot,train_data_Y)
svc_one_hot = SVC(kernel = "rbf")
svc_one_hot.fit(train_features_one_hot,train_data_Y)
nb_one_hot = MultinomialNB(alpha = 0.01, fit_prior = True)
nb_one_hot.fit(train_features_one_hot,train_data_Y)

MultinomialNB(alpha=0.01, class_prior=None, fit_prior=True)

In [19]:
logreg_tfidf = LogisticRegression()
logreg_tfidf.fit(train_features_tfidf,train_data_Y)
svc_tfidf = SVC(kernel = "rbf")
svc_tfidf.fit(train_features_tfidf,train_data_Y)
nb_tfidf = MultinomialNB(alpha = 0.01, fit_prior = True)
nb_tfidf.fit(train_features_tfidf,train_data_Y)

MultinomialNB(alpha=0.01, class_prior=None, fit_prior=True)

In [21]:
validation_log_oh = logreg_onehot.predict(validation_features_one_hot)
validation_svc_oh = svc_one_hot.predict(validation_features_one_hot)
validation_nb_oh = nb_one_hot.predict(validation_features_one_hot)
validation_log_tfidf = logreg_tfidf.predict(validation_features_tfidf)
validation_svc_tfidf = svc_tfidf.predict(validation_features_tfidf)
validation_nb_tfidf = nb_tfidf.predict(validation_features_tfidf)

In [26]:
print ("Accuracy using one-hot encoding: \n\n")
print ("Logistic Regression: ",100*accuracy_score(validation_data_Y,validation_log_oh))
print ("Support Vector Machine: ",100*accuracy_score(validation_data_Y,validation_svc_oh))
print ("Multinomial Naive Bayes: ",100*accuracy_score(validation_data_Y,validation_nb_oh))
print ("\n\nAccuracy using bag of words (but without tf-idf scaling - that may be used later): \n\n")
print ("Logistic Regression: ",100*accuracy_score(validation_data_Y,validation_log_tfidf))
print ("Support Vector Machine: ",100*accuracy_score(validation_data_Y,validation_svc_tfidf))
print ("Multinomial Naive Bayes: ",100*accuracy_score(validation_data_Y,validation_nb_tfidf))

Accuracy using one-hot encoding: 


Logistic Regression:  94.97907949790795
Support Vector Machine:  94.56066945606695
Multinomial Naive Bayes:  96.65271966527197


Accuracy using bag of words (but without tf-idf scaling - that may be used later): 


Logistic Regression:  95.39748953974896
Support Vector Machine:  95.81589958158996
Multinomial Naive Bayes:  94.97907949790795


**The best model is Multinomial Naive Bayes with one-hot encoding** 

In [27]:
#Let us check how many it has correctly classified
correctly_classified_in_validation = int(0.9665*len(validation_data))
print ("Correctly classified on validation set: %d/%d"%(correctly_classified_in_validation,len(validation_data)))

Correctly classified on validation set: 230/239


In [28]:
#Let us try on test data
test_nb_oh = nb_one_hot.predict(test_features_one_hot)
print ("Accuracy on test set: ",100*accuracy_score(test_data_Y,test_nb_oh))

Accuracy on test set:  98.3221476510067


In [29]:
correctly_classified_in_test = int(0.9832*len(test_data))
print ("Correctly classified on test set: %d/%d"%(correctly_classified_in_test,len(test_data)))

Correctly classified on test set: 292/298


In [30]:
print (classification_report(test_data_Y,test_nb_oh))

               precision    recall  f1-score   support

     business       0.99      0.97      0.98        72
entertainment       0.98      0.96      0.97        47
     politics       1.00      1.00      1.00        60
        sport       1.00      1.00      1.00        69
         tech       0.94      0.98      0.96        50

     accuracy                           0.98       298
    macro avg       0.98      0.98      0.98       298
 weighted avg       0.98      0.98      0.98       298



**Lets try our model on a completely new article from BBC News on
Leicester City's match against Shefield United on 16.07.2020**


In [33]:
article = """Leicester boss Brendan Rodgers challenged his team to "make more history" after victory over Sheffield United secured European football for next season.

A first-half strike by Ayoze Perez and Demarai Gray's counter-attacking goal ensured the Foxes stay fourth, a point behind third-placed Chelsea with two games remaining.

Their win also means they qualified for next season's Europa League at least.

But Rodgers wants Leicester to secure Champions League football for a second time after becoming Premier League title-winners in 2016.

"The result gives us the second-highest Premier League finish in Leicester's history. We have two games to go to create some more history," he said.

"We all sat down at the start of season and said, it would be huge challenge for Leicester to get into Europe, so do that in my first season here, to guarantee that is a phenomenal achievement."

The hosts, who dominated most of the game, went ahead when impressive 19-year-old debutant Luke Thomas crossed low to set up Perez after 29 minutes.

They should have extended their lead after the break when Harvey Barnes was twice denied by United goalkeeper Dean Henderson after clever through balls by Jamie Vardy.

The Foxes' top scorer also hit the post, and it looked as though Sheffield United might be fortunate enough to equalise when Jack O'Connell's shot was tipped wide by Kasper Schmeichel.

But, as the game hung in the balance, substitute Gray, on for Barnes, collected Vardy's pass and beat Henderson to finally seal the win after 79 minutes.

Leicester's victory was only their second in six games since the restart but it was a timely boost for Rodgers' team, who are ahead of fifth-placed Manchester United on goal difference.

Blades boss Chris Wilder was fuming with his players afterwards as the result ended a four-game unbeaten run which included wins over Tottenham, Wolves and Chelsea.

They remain eighth, two points behind sixth-placed Wolves, with the prospect of European football next season now out of their hands.

The hosts have struggled since the Premier League's restart, but this was a vastly improved display after their chastening 4-1 loss at relegation-threatened Bournemouth and it was engineered by an impressive blend of youth and experience in their side.

Their biggest threat in the first half came down the left side where Barnes's pace was boosted by some fine crossing by England under-19 international Thomas, who had already picked out Perez for a close-range header before he supplied the Spaniard's eighth goal of the season.

Filling in for the injured Ben Chilwell, who has been linked with a move to Chelsea, Thomas showed his manager Rodgers that he is a more than capable back-up.

Led by the impressive Youri Tielemans, who orchestrated Leicester's midfield to out-manoeuvre the visitors, the 2016 Premier League champions could have been further ahead before the break but for Henderson thwarting both Vardy and Perez.

Having bossed the first half, they were then forced to play on the counter attack after the break following a triple half-time substitution by Wilder.

That shift played into the hands of Vardy, who showed his guile at the age of 33 to twice set up Barnes. The midfielder really should have capitalised.

It felt like they could be crucial misses as the Blades pressed for an equaliser, but it was Barnes' replacement Gray who had the required composure as he collected Vardy's clever lay-off and steered into the net for his third of the season.

It was ample reward for a display which bodes well for their remaining games this season and another potential Champions League campaign next season.

Whether Sheffield United play European football next season or not, it has already been a brilliant campaign for Wilder's team, their first back in the top flight after a 12-season gap.

The fact they shook off a poor start when the Premier League resumed to earn 10 points from 12 in their previous four matches was a testament to their durability and craft.

But after three wins over teams above them, they found Leicester a more difficult task.

Although they made a good start where David McGoldrick fired over from George Baldock's cross, they were second best for much of the first half as Leicester's players, set up in a 3-4-3 formation, continually asked questions of the Blades' resolute defence.

Much of that was down to Tielemans. Even though the visitors had an extra man in central midfield, they could not get near the excellent Belgian.

That led to Wilder making a triple substitution as John Lundstram, John Fleck and Lys Mousset replaced Sander Berge, Ben Osborn and the ineffective McGoldrick.

It led to a slight improvement, but O'Connell's effort after 63 minutes following John Egan's nod back from a corner was their first shot on target.

The Blades continued to probe, although their best efforts came from set-pieces and, as they left gaps in their defence, it was Vardy's threat was which eventually their undoing.
"""

In [34]:
article

'Leicester boss Brendan Rodgers challenged his team to "make more history" after victory over Sheffield United secured European football for next season.\n\nA first-half strike by Ayoze Perez and Demarai Gray\'s counter-attacking goal ensured the Foxes stay fourth, a point behind third-placed Chelsea with two games remaining.\n\nTheir win also means they qualified for next season\'s Europa League at least.\n\nBut Rodgers wants Leicester to secure Champions League football for a second time after becoming Premier League title-winners in 2016.\n\n"The result gives us the second-highest Premier League finish in Leicester\'s history. We have two games to go to create some more history," he said.\n\n"We all sat down at the start of season and said, it would be huge challenge for Leicester to get into Europe, so do that in my first season here, to guarantee that is a phenomenal achievement."\n\nThe hosts, who dominated most of the game, went ahead when impressive 19-year-old debutant Luke Th

In [38]:
article_features = one_hot_vectorizer.transform([article])

In [40]:
article_prediction = nb_one_hot.predict(article_features)
article_prediction

array(['sport'], dtype='<U13')

**It is being correctly predicted as sport**