# Naïve Bayes text classification
# Reuters-21578 Text Categorization Collection Data Set
<br>Data Link- http://archive.ics.uci.edu/ml/datasets/Reuters-21578+Text+Categorization+Collection.

**Importing Necessary Packages**

In [1]:
import pandas as pd
import numpy as np
import sklearn
import chardet
import os
import re
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"


**Reading all 22 sgm files of the data**

In [2]:
files = ["/home/rohan/CMI/SEM_2/DMML/Assignment_2/reuters21578/reut2-%03d.sgm" % r for r in range(0, 22)]

# Preprocessing the SGML data

**Regex codes to extract actual body from all the sgm files**

In [3]:
t=re.compile(r'<TEXT[\S\s]*?>[\S\s]*?</TEXT>') 
t1=re.compile(r'<TEXT>')
t2=re.compile(r'<TEXT TYPE="UNPROC">')
t3=re.compile(r'<TEXT TYPE="BRIEF">')
p=re.compile(r'&#2;([\S\s]*?)&#3')
q=re.compile(r'<BODY>([\S\s]*?)</BODY>')
r=re.compile(r'<TITLE>([\S\s]*?)</TITLE>')
s=re.compile(r'<TOPICS>([\s\S]*?)</TOPICS>')

In [4]:
body=[]  #Storing body parts as defined in the ReadME document of the data
for j in range(len(files)):
    f=open(files[j])
    data=f.read()
    n=t.findall(data)
    for i in n:
        if t1.findall(i)!=[]:
            body=body+(q.findall(i))
        elif t2.findall(i)!=[]:
            body=body+(p.findall(i))
        elif t3.findall(i)!=[]:
            body=body+(r.findall(i))
        
len(body)        

21578

#### Extracting topics from the sgm files

In [5]:
topic=[]  #storing all the topic names(None for no topic names)
import re
u=re.compile(r"<TOPICS(>[<D>(\S)</D>]*<)/TOPICS>")
v=re.compile(r"<D>([\w-]*)</D>")
for j in range(len(files)):
    f=open(files[j])
    data=f.read()
    m = u.findall(data)
    for x in m:
        if x=='><':
            topic.append(["None"])
        else:
            topic.append(v.findall(x))
len(topic)

21578

#### Extracting CGI Train/test split from the sgm files

In [6]:
CGI=[]
z=re.compile(r'CGISPLIT=(\S+)')
for  j in range(len(files)):
    f=open(files[j])
    data=f.read()
    CGI+=z.findall(data)
len(CGI)  

21578

#### Extracting LEWIS Train/test split from the sgm files

In [7]:
LEWIS=[]
z=re.compile(r'LEWISSPLIT=(\S+)')
for  j in range(len(files)):
    f=open(files[j])
    data=f.read()
    LEWIS+=z.findall(data)
len(LEWIS)  

21578

#### Creating Dataframe df consisting columns obtained from lists topics,body,CGI,LEWIS

In [8]:
import pandas as pd
df=pd.DataFrame(list(zip(topic,body,CGI,LEWIS)),
              columns=['topics','body','CGI','LEWIS'])

In [9]:
df.head()

Unnamed: 0,topics,body,CGI,LEWIS
0,[cocoa],Showers continued throughout the week in\nthe ...,"""TRAINING-SET""","""TRAIN"""
1,[None],Standard Oil Co and BP North America\nInc said...,"""TRAINING-SET""","""TRAIN"""
2,[None],Texas Commerce Bancshares Inc's Texas\nCommerc...,"""TRAINING-SET""","""TRAIN"""
3,[None],BankAmerica Corp is not under\npressure to act...,"""TRAINING-SET""","""TRAIN"""
4,"[grain, wheat, corn, barley, oat, sorghum]",The U.S. Agriculture Department\nreported the ...,"""TRAINING-SET""","""TRAIN"""


#### Dropping the rows which have null values in the Topics column of df

In [10]:
for i in range(len(df['topics'])):
    if df['topics'][i]==['None']:
        df['topics'][i]=None
df = df.dropna(how='any',axis=0) 
df.reset_index(drop=True, inplace=True)

In [11]:
df['CGI'].unique()
df['LEWIS'].unique()

array(['"TRAINING-SET"', '"PUBLISHED-TESTSET"'], dtype=object)

array(['"TRAIN"', '"NOT-USED"', '"TEST"'], dtype=object)

#### Creating a new dataframe named same as the previous dataframe(df) <br>
Here different topics in a list of 'topics' column in the old dataframe are separated in different rows in the new dataframe and body is repeated for each of them in the same list

In [12]:
new_top=[]
new_body=[]
new_CGI=[]
new_LEWIS=[]
for i in range(len(topic)):
    for j in topic[i]:
        new_top.append(j)
        new_body.append(body[i])
        new_CGI.append(CGI[i])
        new_LEWIS.append(LEWIS[i])

In [13]:
df=pd.DataFrame(list(zip(new_top,new_body,new_CGI,new_LEWIS)),
              columns=['Topics','Body','CGI','LEWIS'])

In [14]:
df.head()
df.shape

Unnamed: 0,Topics,Body,CGI,LEWIS
0,cocoa,Showers continued throughout the week in\nthe ...,"""TRAINING-SET""","""TRAIN"""
1,,Standard Oil Co and BP North America\nInc said...,"""TRAINING-SET""","""TRAIN"""
2,,Texas Commerce Bancshares Inc's Texas\nCommerc...,"""TRAINING-SET""","""TRAIN"""
3,,BankAmerica Corp is not under\npressure to act...,"""TRAINING-SET""","""TRAIN"""
4,grain,The U.S. Agriculture Department\nreported the ...,"""TRAINING-SET""","""TRAIN"""


(24513, 4)

#### Dropping the rows which have null values in the Topics column of df

In [15]:
for i in range(len(df['Topics'])):
    if df['Topics'][i]=='None':
        df['Topics'][i]=None
df= df.dropna(how='any',axis=0) 
df.reset_index(drop=True, inplace=True)
df.shape

(14302, 4)

#### Dropping duplicate rows of df

In [16]:
df.drop_duplicates(subset = "Body" ,inplace = True) 
df.reset_index(drop=True, inplace=True)
df.head()
df.shape

Unnamed: 0,Topics,Body,CGI,LEWIS
0,cocoa,Showers continued throughout the week in\nthe ...,"""TRAINING-SET""","""TRAIN"""
1,grain,The U.S. Agriculture Department\nreported the ...,"""TRAINING-SET""","""TRAIN"""
2,veg-oil,Argentine grain board figures show\ncrop regis...,"""TRAINING-SET""","""TRAIN"""
3,earn,Champion Products Inc said its\nboard of direc...,"""TRAINING-SET""","""TRAIN"""
4,acq,Computer Terminal Systems Inc said\nit has com...,"""TRAINING-SET""","""TRAIN"""


(11230, 4)

#### Importing Necessary Packages

In [17]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import precision_recall_fscore_support, accuracy_score
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

# Label encoding of Topics 

In [18]:
from sklearn.preprocessing import LabelEncoder

lb_make = LabelEncoder()

df["Topic_code"] = lb_make.fit_transform(df["Topics"])

In [19]:
df.head()

Unnamed: 0,Topics,Body,CGI,LEWIS,Topic_code
0,cocoa,Showers continued throughout the week in\nthe ...,"""TRAINING-SET""","""TRAIN""",6
1,grain,The U.S. Agriculture Department\nreported the ...,"""TRAINING-SET""","""TRAIN""",24
2,veg-oil,Argentine grain board figures show\ncrop regis...,"""TRAINING-SET""","""TRAIN""",76
3,earn,Champion Products Inc said its\nboard of direc...,"""TRAINING-SET""","""TRAIN""",17
4,acq,Computer Terminal Systems Inc said\nit has com...,"""TRAINING-SET""","""TRAIN""",0


# Train-test split for Body and Topic-code of the dataframe

In [20]:
from sklearn.model_selection import train_test_split

X = df['Body']
y = df['Topic_code']

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=17)

print("Shape of X is {}".format(X.shape))
print("Shape of X_train is {} and shape of y_train is {}".format(X_train.shape, y_train.shape))
print("Shape of X_test is {} and shape of y_test is {}".format(X_test.shape, y_test.shape))

Shape of X is (11230,)
Shape of X_train is (8422,) and shape of y_train is (8422,)
Shape of X_test is (2808,) and shape of y_test is (2808,)


## Classification using Multinomial Naive Bayes Model

In [21]:
from sklearn.naive_bayes import MultinomialNB

model= MultinomialNB()
pipeline_Mnv = Pipeline([('vectorizer', TfidfVectorizer()),
                     ('classifier', model)])

pipeline_Mnv.fit(X_train, y_train)
y_predicted_Mnv = pipeline_Mnv.predict(X_test)
accuracy_Mnv = accuracy_score(y_test, y_predicted_Mnv)
#precision, recall, f1_score, _ = precision_recall_fscore_support(y_te,y_predicted_BNB)
accuracy_Mnv

Pipeline(memory=None,
     steps=[('vectorizer', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=...      vocabulary=None)), ('classifier', MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))])

0.6456552706552706

In [22]:
from sklearn.metrics import confusion_matrix

confusion_matrix(y_predicted_Mnv,y_test)


array([[633,  10,   1, ...,   2,   0,   3],
       [  0,   0,   0, ...,   0,   0,   0],
       [  0,   0,   0, ...,   0,   0,   0],
       ...,
       [  0,   0,   0, ...,   0,   0,   0],
       [  0,   0,   0, ...,   0,   0,   0],
       [  0,   0,   0, ...,   0,   0,   0]])

## Classification using Complement Naive Bayes Model

In [23]:
from sklearn.naive_bayes import ComplementNB
model = ComplementNB()

pipeline_Cnv = Pipeline([('vectorizer', TfidfVectorizer()),
                     ('classifier', model)])

pipeline_Cnv.fit(X_train, y_train)
y_predicted_Cnv= pipeline_Cnv.predict(X_test)
accuracy_Cnv = accuracy_score(y_test, y_predicted_Cnv)
accuracy_Cnv

Pipeline(memory=None,
     steps=[('vectorizer', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=...ulary=None)), ('classifier', ComplementNB(alpha=1.0, class_prior=None, fit_prior=True, norm=False))])

0.8190883190883191

In [24]:
confusion_matrix(y_predicted_Cnv,y_test)

array([[630,   5,   0, ...,   0,   0,   2],
       [  0,   8,   0, ...,   0,   0,   0],
       [  0,   0,   0, ...,   0,   0,   0],
       ...,
       [  0,   0,   0, ...,   0,   0,   0],
       [  0,   0,   0, ...,   0,   1,   0],
       [  0,   0,   0, ...,   0,   0,   0]])

# Separation of train and test data (CGISPLIT) 

In [25]:
df_trainC=df.loc[df['CGI'] == '"TRAINING-SET"']
df_testC=df.loc[df['CGI'] == '"PUBLISHED-TESTSET"']

In [26]:
X_trC=df_trainC['Body']
y_trC=df_trainC['Topic_code']
X_teC=df_testC['Body']
y_teC=df_testC['Topic_code']

# Classification using Multinomial Naive Bayes Model (CGISPLIT)

In [27]:
from sklearn.naive_bayes import MultinomialNB

model= MultinomialNB()
pipeline_Mnv = Pipeline([('vectorizer', TfidfVectorizer()),
                     ('classifier', model)])

pipeline_Mnv.fit(X_trC, y_trC)
y_predicted_Mnv = pipeline_Mnv.predict(X_teC)
accuracy_Mnv = accuracy_score(y_teC, y_predicted_Mnv)
#precision, recall, f1_score, _ = precision_recall_fscore_support(y_te,y_predicted_BNB)
accuracy_Mnv

Pipeline(memory=None,
     steps=[('vectorizer', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=...      vocabulary=None)), ('classifier', MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))])

0.32299270072992703

In [28]:
confusion_matrix(y_predicted_Mnv,y_teC)

array([[74,  2,  1, ...,  9,  0,  3],
       [ 0,  0,  0, ...,  0,  0,  0],
       [ 0,  0,  0, ...,  0,  0,  0],
       ...,
       [ 0,  0,  0, ...,  0,  0,  0],
       [ 0,  0,  0, ...,  0,  0,  0],
       [ 0,  0,  0, ...,  0,  0,  0]])

# Classification using Complement Naive Bayes Model (CGISPLIT)

In [29]:
from sklearn.naive_bayes import ComplementNB
model = ComplementNB()

pipeline_Cnv = Pipeline([('vectorizer', TfidfVectorizer()),
                     ('classifier', model)])

pipeline_Cnv.fit(X_trC, y_trC)
y_predicted_Cnv= pipeline_Cnv.predict(X_teC)
accuracy_Cnv = accuracy_score(y_teC, y_predicted_Cnv)
accuracy_Cnv

Pipeline(memory=None,
     steps=[('vectorizer', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=...ulary=None)), ('classifier', ComplementNB(alpha=1.0, class_prior=None, fit_prior=True, norm=False))])

0.6459854014598541

In [30]:
confusion_matrix(y_predicted_Cnv,y_teC)

array([[75,  0,  0, ...,  0,  0,  2],
       [ 0,  1,  0, ...,  0,  0,  0],
       [ 0,  0,  0, ...,  0,  0,  0],
       ...,
       [ 0,  0,  0, ...,  2,  0,  0],
       [ 0,  0,  0, ...,  0,  0,  0],
       [ 0,  0,  0, ...,  0,  0,  0]])

# Separation of train and test data (LEWISSPLIT) 

In [31]:
df_trainL=df.loc[df['LEWIS'] == '"TRAIN"']
df_testL=df.loc[df['LEWIS'] == '"TEST"']

In [32]:
X_trL=df_trainL['Body']
y_trL=df_trainL['Topic_code']
X_teL=df_testL['Body']
y_teL=df_testL['Topic_code']

# Classification using Multinomial Naive Bayes Model (LEWISSPLIT)

In [33]:
model= MultinomialNB()
pipeline_Mnv = Pipeline([('vectorizer', TfidfVectorizer()),
                     ('classifier', model)])

pipeline_Mnv.fit(X_trL, y_trL)
y_predicted_Mnv = pipeline_Mnv.predict(X_teL)
accuracy_Mnv = accuracy_score(y_teL, y_predicted_Mnv)
#precision, recall, f1_score, _ = precision_recall_fscore_support(y_te,y_predicted_BNB)
accuracy_Mnv

Pipeline(memory=None,
     steps=[('vectorizer', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=...      vocabulary=None)), ('classifier', MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))])

0.643023643023643

In [34]:
confusion_matrix(y_predicted_Mnv,y_teL)

array([[692,   9,   1, ...,   0,   2,   5],
       [  0,   0,   0, ...,   0,   0,   0],
       [  0,   0,   0, ...,   0,   0,   0],
       ...,
       [  0,   0,   0, ...,   0,   0,   0],
       [  0,   0,   0, ...,   0,   0,   0],
       [  0,   0,   0, ...,   0,   0,   0]])

# Classification using Complement Naive Bayes Model (LEWISSPLIT)

In [35]:
model = ComplementNB()

pipeline_Cnv = Pipeline([('vectorizer', TfidfVectorizer()),
                     ('classifier', model)])

pipeline_Cnv.fit(X_trL, y_trL)
y_predicted_Cnv= pipeline_Cnv.predict(X_teL)
accuracy_Cnv = accuracy_score(y_teL, y_predicted_Cnv)
accuracy_Cnv

Pipeline(memory=None,
     steps=[('vectorizer', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=...ulary=None)), ('classifier', ComplementNB(alpha=1.0, class_prior=None, fit_prior=True, norm=False))])

0.8191808191808192

In [36]:
confusion_matrix(y_predicted_Cnv,y_teL)

array([[701,   5,   0, ...,   0,   0,   1],
       [  0,   8,   0, ...,   0,   0,   2],
       [  0,   0,   0, ...,   0,   0,   0],
       ...,
       [  0,   0,   0, ...,   1,   0,   0],
       [  0,   0,   0, ...,   0,   0,   0],
       [  0,   0,   0, ...,   0,   0,   0]])

## Multi-label Classification using train-test split from sklearn.model_selection

In [37]:
from skmultilearn.problem_transform import LabelPowerset
from sklearn.preprocessing import MultiLabelBinarizer

mlb = MultiLabelBinarizer()
y = mlb.fit_transform(df['Topics'])

In [38]:
from sklearn.feature_extraction import text
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer,TfidfVectorizer

X_train, X_test, y_train, y_test = train_test_split( df['Body'] , y , random_state = 52)

tfidf_transformer = TfidfVectorizer(token_pattern = r'\b[^\d\W]+\b')
tfidf_transformer.fit(X_train)
X_train_tfidf=tfidf_transformer.transform(X_train)

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=False,
        token_pattern='\\b[^\\d\\W]+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)

### Using Complement Naive Bayes Model

In [39]:
classifier = LabelPowerset(ComplementNB())
classifier.fit(X_train_tfidf,y_train)

LabelPowerset(classifier=ComplementNB(alpha=1.0, class_prior=None, fit_prior=True, norm=False),
       require_dense=[True, True])

In [40]:
X_test_tfidf=tfidf_transformer.transform(X_test)
pred = classifier.predict(X_test_tfidf)

In [41]:
print(accuracy_score(y_test,pred.toarray()))

0.8379629629629629


### Using Multinomial Naive Bayes Model

In [42]:
classifier = LabelPowerset(MultinomialNB())
classifier.fit(X_train_tfidf,y_train)
X_test_tfidf=tfidf_transformer.transform(X_test)
pred = classifier.predict(X_test_tfidf)
print(accuracy_score(y_test,pred.toarray()))

LabelPowerset(classifier=MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True),
       require_dense=[True, True])

0.6445868945868946


# Multi-label Classification using CGISPLIT

In [43]:
y_teC_t= mlb.fit_transform(df.loc[df['CGI'] == '"PUBLISHED-TESTSET"']['Topics'])
y_trC_t= mlb.fit_transform(df.loc[df['CGI'] == '"TRAINING-SET"']['Topics'])


In [44]:
tfidf_transformer.fit(X_trC)
X_train_tfidf=tfidf_transformer.transform(X_trC)

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=False,
        token_pattern='\\b[^\\d\\W]+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)

In [45]:
X_train_tfidf.shape
y_trC_t.shape

(10682, 25967)

(10682, 27)

### Using Multinomial Naive Bayes Model

In [46]:
classifier = LabelPowerset(MultinomialNB())
classifier.fit(X_train_tfidf,y_trC_t)
X_test_tfidf=tfidf_transformer.transform(X_teC)
pred = classifier.predict(X_test_tfidf)
print(accuracy_score(y_teC_t,pred.toarray()))

LabelPowerset(classifier=MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True),
       require_dense=[True, True])

0.33211678832116787


### Using Complement Naive Bayes Model

In [47]:
classifier = LabelPowerset(ComplementNB())
classifier.fit(X_train_tfidf,y_trC_t)
X_test_tfidf=tfidf_transformer.transform(X_teC)
pred = classifier.predict(X_test_tfidf)
print(accuracy_score(y_teC_t,pred.toarray()))

LabelPowerset(classifier=ComplementNB(alpha=1.0, class_prior=None, fit_prior=True, norm=False),
       require_dense=[True, True])

0.6642335766423357


# Multi-level Classification using LEWISSPLIT

In [48]:
y_teL_t= mlb.fit_transform(df.loc[df['LEWIS'] == '"TEST"']['Topics'])
y_trL_t= mlb.fit_transform(df.loc[df['LEWIS'] == '"TRAIN"']['Topics'])

In [49]:
tfidf_transformer.fit(X_trL)
X_train_tfidf=tfidf_transformer.transform(X_trL)

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=False,
        token_pattern='\\b[^\\d\\W]+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)

In [50]:
X_train_tfidf.shape
y_trL_t.shape

(7679, 22133)

(7679, 27)

### Using Multinomial Naive Bayes Model

In [51]:
classifier = LabelPowerset(MultinomialNB())
classifier.fit(X_train_tfidf,y_trL_t)

X_test_tfidf=tfidf_transformer.transform(X_teL)
pred = classifier.predict(X_test_tfidf)

print(accuracy_score(y_teL_t,pred.toarray()))

LabelPowerset(classifier=MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True),
       require_dense=[True, True])

0.655011655011655


### Using Complement Naive Bayes Model

In [52]:
classifier = LabelPowerset(ComplementNB())
classifier.fit(X_train_tfidf,y_trL_t)
X_test_tfidf=tfidf_transformer.transform(X_teL)
pred = classifier.predict(X_test_tfidf)
print(accuracy_score(y_teL_t,pred.toarray()))

LabelPowerset(classifier=ComplementNB(alpha=1.0, class_prior=None, fit_prior=True, norm=False),
       require_dense=[True, True])

0.8334998334998335
