### Importing all relevant packages

In [1]:
import pandas as pd

In [2]:
from sklearn.model_selection import train_test_split

In [3]:
from pprint import pprint

In [4]:
from collections import Counter

In [47]:
from collections import OrderedDict

In [5]:
from nltk.corpus import stopwords
stopwords = list(stopwords.words('english'))

In [64]:
spacy_stopwords = list(spacy.lang.en.stop_words.STOP_WORDS)

In [92]:
spacy_stopwords.extend(stopwords)
spacy_stopwords = list(set(spacy_stopwords))

In [76]:
spacy_stopwords.index('us')

113

In [89]:
spacy_stopwords.extend(['got','told','said','tell','mr','don'])

In [6]:
import numpy as np

In [7]:
import spacy

In [8]:
nlp = spacy.load('en')

In [9]:
import pickle

In [235]:
from sklearn.metrics import accuracy_score

### Understanding the data and categories

BBC articles data taken from here
* https://www.kaggle.com/yufengdev/bbc-text-categorization

In [10]:
data = pd.read_csv('../Data/bbc-text.csv',encoding='latin1',index_col=False)

In [11]:
data.shape

(2225, 2)

In [12]:
data.head()

Unnamed: 0,category,text
0,tech,tv future in the hands of viewers with home th...
1,business,worldcom boss left books alone former worldc...
2,sport,tigers wary of farrell gamble leicester say ...
3,sport,yeading face newcastle in fa cup premiership s...
4,entertainment,ocean s twelve raids box office ocean s twelve...


In [13]:
categories = data.category.unique()

In [14]:
data.category.value_counts()

sport            511
business         510
politics         417
tech             401
entertainment    386
Name: category, dtype: int64

### Pre-process Text Content

In [22]:
def bigrams(input_list):
    bigrams_list = list(zip(input_list,input_list[1:]))
    bigrams_list_2 = [element for element in bigrams_list if element !=('STOP','START') and element !=('START','STOP')]
    return bigrams_list_2

In [152]:
def preprocess_text(text):
    doc = nlp(text)
    sentences = [each_sent for each_sent in doc.sents]
    lol_words = [
    [
        str(each_token.lemma_) for each_token in nlp(each_sent.text) 
        if str(each_token) not in spacy_stopwords and each_token.is_punct==False and each_token.is_space==False
    ] 
    for each_sent in doc.sents
    ]
    lol_words_proper = [['START']+each_sent+['STOP'] for each_sent in lol_words]
    lol_words_proper_unfurl = [each_word for each_sent in lol_words_proper for each_word in each_sent]
    bigrams_list = bigrams(lol_words_proper_unfurl)
    unigrams_list = [element for element in lol_words_proper_unfurl if element!='START' and element!='STOP']
    return (unigrams_list,bigrams_list)

### Training and testing on the whole data

###### Splitting the data into train and test data

In [219]:
train,test = train_test_split(data,test_size=0.1)

In [220]:
train.head()

Unnamed: 0,category,text
1116,business,stormy year for property insurers a string of ...
2,sport,tigers wary of farrell gamble leicester say ...
2039,entertainment,russian film wins bbc world prize russian dram...
1110,tech,what high-definition will do to dvds first it ...
925,business,man utd to open books to glazer manchester uni...


In [221]:
test.head()

Unnamed: 0,category,text
2186,business,battered dollar hits another low the dollar ha...
1415,tech,row brewing over peer-to-peer ads music downlo...
987,entertainment,john peel replacement show begins the permanen...
577,sport,hearts of oak 3-2 cotonsport hearts of oak set...
1676,politics,gurkhas to help tsunami victims britain has of...


In [222]:
train.reset_index(drop=True,inplace=True)

In [223]:
test.reset_index(drop=True,inplace=True)

###### Calculate bigram counts for each class (training)

In [224]:
count_bigrams_cat = {}
count_unigrams_cat = {}
vocab_size_cat = {}

In [225]:
def training_bigrams(category,df):
    bigrams_list = []
    unigrams_list = []
    ans = [preprocess_text(text) for text in df.text]
    for each in ans:
        bigrams_list.extend(each[1])
        unigrams_list.extend(each[0])
    vocab_size_cat[category] = len(list(set(unigrams_list)))
    count_bigrams_cat[category] = Counter(bigrams_list)
    count_unigrams_cat[category] = Counter(unigrams_list)

###### Assigning class based on highest log-probability score on new dataset (.predict function equivalent)

<img src="images/add delta smoothed conditional prob.png" style="width:150;height:150px;">

In [226]:
def bigram_markov(new_sent):
    ans = preprocess_text(new_sent)
    bigrams_list = ans[1]
    unigrams_list = ans[0]
    markov_prob = {}
    for cat in categories:
        markov_prob[cat] = 0
        for each in bigrams_list:
            markov_prob[cat] += np.log((count_bigrams_cat[cat][each]+0.1)/(count_unigrams_cat[cat][each[0]]+0.1*vocab_size_cat[cat]))
        if len(new_sent.split())>0:
            markov_prob[cat] = markov_prob[cat]/len(new_sent.split())
    markov_prob_final = sorted(markov_prob.items(),key=lambda x:x[1], reverse=True)
    return markov_prob_final

In [137]:
#data_test = data.copy()
#data_test = data_test.sample(n=100)

In [138]:
#data_test.category.value_counts()

In [227]:
%%time
# training now on the train data
train.groupby('category').apply(lambda df: training_bigrams(df.name,df))

Wall time: 6min 18s


In [228]:
# pickle dump for quicker reuse later
vocab_size_cat_f = open("../Data/vocab_size_train.pkl",'wb')
count_bigrams_cat_f = open("../Data/bigram_count_train.pkl",'wb')
count_unigrams_cat_f = open("../Data/unigrams_count_train.pkl",'wb')

pickle.dump(vocab_size_cat,vocab_size_cat_f)
pickle.dump(count_bigrams_cat,count_bigrams_cat_f)
pickle.dump(count_unigrams_cat,count_unigrams_cat_f)

vocab_size_cat_f.close()
count_bigrams_cat_f.close()
count_unigrams_cat_f.close()

In [46]:
# vocab_size_cat = pickle.load("../Data/vocab_size.pkl")
# count_bigrams_cat = pickle.load("../Data/vocab_size.pkl")
# count_unigrams_cat = pickle.load("../Data/vocab_size.pkl")

###### A quick look at the Bigram Markov Model Features

In [246]:
print(vocab_size_cat)
print("****")
pprint(list(zip(categories,[count_bigrams_cat[cat].most_common(5) for cat in categories])))
print("****")
pprint(list(zip(categories,[count_unigrams_cat[cat].most_common(5) for cat in categories])))

{'business': 9283, 'entertainment': 8898, 'politics': 8317, 'sport': 8083, 'tech': 8765}
****
[('tech',
  [(('e', 'mail'), 178),
   (('mobile', 'phone'), 154),
   (('year', 'STOP'), 135),
   (('START', 'people'), 92),
   (('bbc', 'news'), 85)]),
 ('business',
  [(('year', 'STOP'), 236),
   (('START', 'company'), 122),
   (('chief', 'executive'), 117),
   (('START', 'analyst'), 96),
   (('interest', 'rate'), 93)]),
 ('sport',
  [(('game', 'STOP'), 151),
   (('START', 'think'), 145),
   (('START', 'know'), 107),
   (('time', 'STOP'), 101),
   (('year', 'STOP'), 97)]),
 ('entertainment',
  [(('year', 'STOP'), 133),
   (('START', 'film'), 96),
   (('award', 'STOP'), 94),
   (('film', 'STOP'), 84),
   (('box', 'office'), 79)]),
 ('politics',
  [(('prime', 'minister'), 250),
   (('START', 'blair'), 175),
   (('START', 'tory'), 165),
   (('tony', 'blair'), 161),
   (('lib', 'dem'), 160)])]
****
[('tech',
  [('people', 849),
   ('game', 652),
   ('mobile', 575),
   ('technology', 567),
   ('ph

###### Testing on some sample sentences

In [229]:
bigram_markov("economic growth slows down")

[('business', -3.9692388514802905),
 ('politics', -6.7677569438805945),
 ('tech', -8.341093396600021),
 ('sport', -9.000286606679415),
 ('entertainment', -9.09610496174359)]

In [230]:
bigram_markov("prime minister")

[('politics', -3.5746871267381852),
 ('business', -7.440089097987748),
 ('tech', -10.252850263607193),
 ('sport', -11.981405120960808),
 ('entertainment', -12.124273481509913)]

In [231]:
bigram_markov("halo was hugely successful")

[('tech', -6.2126216117651065),
 ('politics', -7.672317862538664),
 ('sport', -8.01886399230171),
 ('business', -8.157525044723988),
 ('entertainment', -8.502447051556878)]

###### Testing on the whole dataset and measuring accuracy

In [247]:
test.head()

Unnamed: 0,category,text
0,business,battered dollar hits another low the dollar ha...
1,tech,row brewing over peer-to-peer ads music downlo...
2,entertainment,john peel replacement show begins the permanen...
3,sport,hearts of oak 3-2 cotonsport hearts of oak set...
4,politics,gurkhas to help tsunami victims britain has of...


In [254]:
def bigram_markov_final(new_sent):
    ans = preprocess_text(new_sent)
    bigrams_list = ans[1]
    unigrams_list = ans[0]
    markov_prob = {}
    for cat in categories:
        markov_prob[cat] = 0
        for each in bigrams_list:
            markov_prob[cat] += np.log((count_bigrams_cat[cat][each]+0.1)/(count_unigrams_cat[cat][each[0]]+0.1*vocab_size_cat[cat]))
        if len(new_sent.split())>0:
            markov_prob[cat] = markov_prob[cat]/len(new_sent.split())
    markov_prob_final = sorted(markov_prob.items(),key=lambda x:x[1], reverse=True)
    return markov_prob_final[0][0]

In [255]:
test.index

RangeIndex(start=0, stop=223, step=1)

In [258]:
%%time
test['predicted'] = test.apply(lambda x: bigram_markov_final(x.text),axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


Wall time: 44.5 s


In [259]:
test.head()

Unnamed: 0,category,text,predicted
0,business,battered dollar hits another low the dollar ha...,business
1,tech,row brewing over peer-to-peer ads music downlo...,tech
2,entertainment,john peel replacement show begins the permanen...,entertainment
3,sport,hearts of oak 3-2 cotonsport hearts of oak set...,sport
4,politics,gurkhas to help tsunami victims britain has of...,politics


In [260]:
print(accuracy_score(test['category'],test['predicted']))

0.968609865470852


97% accuracy for prediction of category

###### Appendix - Understanding the bigram features

In [99]:
count_bigrams_cat.keys()

dict_keys(['business', 'entertainment', 'politics', 'sport', 'tech'])

###### Bigram Features

In [233]:
df = pd.DataFrame([count_bigrams_cat['tech'].most_common(50),count_bigrams_cat['sport'].most_common(50),count_bigrams_cat['entertainment'].most_common(50),count_bigrams_cat['business'].most_common(50),count_bigrams_cat['politics'].most_common(50)]).T
df.columns = ['tech','sport','entertainment','business','politics']
df

Unnamed: 0,tech,sport,entertainment,business,politics
0,"((e, mail), 178)","((game, STOP), 151)","((year, STOP), 133)","((year, STOP), 236)","((prime, minister), 250)"
1,"((mobile, phone), 154)","((START, think), 145)","((START, film), 96)","((START, company), 122)","((START, blair), 175)"
2,"((year, STOP), 135)","((START, know), 107)","((award, STOP), 94)","((chief, executive), 117)","((START, tory), 165)"
3,"((START, people), 92)","((time, STOP), 101)","((film, STOP), 84)","((START, analyst), 96)","((tony, blair), 161)"
4,"((bbc, news), 85)","((year, STOP), 97)","((box, office), 79)","((interest, rate), 93)","((lib, dem), 160)"
5,"((START, microsoft), 82)","((season, STOP), 97)","((START, think), 63)","((START, firm), 90)","((general, election), 158)"
6,"((game, STOP), 80)","((START, good), 96)","((START, year), 62)","((market, STOP), 84)","((liberal, democrat), 148)"
7,"((START, mobile), 79)","((START, england), 88)","((new, york), 54)","((stock, market), 75)","((START, labour), 148)"
8,"((START, game), 76)","((grand, slam), 88)","((million, dollar), 52)","((economic, growth), 73)","((election, STOP), 146)"
9,"((website, STOP), 75)","((START, play), 87)","((START, good), 52)","((2004, STOP), 69)","((START, howard), 119)"


###### Unigram Features

In [234]:
df = pd.DataFrame([count_unigrams_cat['tech'].most_common(50),count_unigrams_cat['sport'].most_common(50),count_unigrams_cat['entertainment'].most_common(50),count_unigrams_cat['business'].most_common(50),count_unigrams_cat['politics'].most_common(50)]).T
df.columns = ['tech','sport','entertainment','business','politics']
df

Unnamed: 0,tech,sport,entertainment,business,politics
0,"(people, 849)","(game, 582)","(film, 807)","(year, 828)","(government, 710)"
1,"(game, 652)","(play, 538)","(good, 643)","($, 730)","(labour, 706)"
2,"(mobile, 575)","(win, 534)","(year, 548)","(company, 575)","(party, 676)"
3,"(technology, 567)","(good, 467)","(award, 483)","(firm, 488)","(election, 625)"
4,"(phone, 483)","(time, 433)","(star, 406)","(market, 461)","(people, 596)"
5,"(new, 474)","(year, 430)","(music, 398)","(rise, 407)","(minister, 521)"
6,"(year, 470)","(player, 424)","(include, 307)","(sale, 384)","(blair, 503)"
7,"(service, 460)","(england, 411)","(new, 291)","(bank, 372)","(tory, 493)"
8,"(user, 437)","(world, 372)","(actor, 231)","(growth, 356)","(plan, 444)"
9,"(use, 394)","(6, 358)","(band, 229)","(economy, 355)","(year, 414)"
