In [1]:
#Importing the libraries
import pandas as pd
import nltk
from nltk.corpus import stopwords
import string
#Download the following modules once: the stopword lists (used by stopword_remover)
#and WordNet (used by the lemmatizer). Packages already present are reported as up-to-date.
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\10664440\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\10664440\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [4]:
#Importing the training set (path is relative to the notebook's working directory).
#The frame has two columns: STORY (article text) and SECTION (integer class label).
train_data = pd.read_excel("Data_Train01.xlsx")

In [5]:
#Printing the top 5 rows (head() returns the first 5 rows by default)
print(train_data.head())

                                               STORY  SECTION
0  But the most painful was the huge reversal in ...        3
1  How formidable is the opposition alliance amon...        0
2  Most Asian currencies were trading lower today...        3
3  If you want to answer any question, click on ‘...        1
4  In global markets, gold prices edged up today ...        3


In [6]:
#Printing the dataset info.
#DataFrame.info() writes its report to stdout itself and returns None, so it is
#called directly; wrapping it in print() only appended a stray "None" line.
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7628 entries, 0 to 7627
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   STORY    7628 non-null   object
 1   SECTION  7628 non-null   int64 
dtypes: int64(1), object(1)
memory usage: 119.3+ KB
None


In [8]:
#Printing the shape of the dataset as (rows, columns)
print(train_data.shape)

(7628, 2)


In [9]:
#Printing the group by description of each category.
#For every SECTION label this shows count / unique / top / freq of STORY;
#a count larger than unique means that section contains repeated stories.
train_data.groupby("SECTION").describe()

Unnamed: 0_level_0,STORY,STORY,STORY,STORY
Unnamed: 0_level_1,count,unique,top,freq
SECTION,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
0,1686,1673,This story has been published from a wire agen...,4
1,2772,2731,This story has been published from a wire agen...,13
2,1924,1914,We will leave no stone unturned to make the au...,3
3,1246,1233,This story has been published from a wire agen...,11


In [10]:
#Removing duplicates to avoid overfitting.
#The de-duplicated frame is rebound to the same name instead of mutating in place
#(inplace=True); the surviving rows keep their original index labels.
train_data = train_data.drop_duplicates()

In [11]:
#A punctuations string for reference: ASCII punctuation plus the curly quotes found in the dataset.
#The opening double quote “ is listed alongside ” so quoted text is stripped on both sides;
#characters that string.punctuation already contains (, : [ ]) are not repeated.
all_punctuations = string.punctuation + '‘’“”'

In [13]:
#Method to remove punctuation marks from the data
def punc_remover(raw_text):
    """Return raw_text with every character listed in all_punctuations dropped."""
    kept_chars = []
    for ch in raw_text:
        if ch in all_punctuations:
            continue
        kept_chars.append(ch)
    return "".join(kept_chars)

In [17]:
#Method to remove stopwords from the data
def stopword_remover(no_punc_text):
    """Drop English stopwords from whitespace-separated text.

    Parameters:
        no_punc_text: text to filter; it is split on whitespace.

    Returns:
        The remaining words joined by single spaces.

    Matching is case-sensitive, exactly as before: the sample run of
    text_cleaner in this notebook keeps the capitalised "And".
    """
    # Load the stopword list once per call instead of once per word, and use a
    # set so each membership test is a hash lookup rather than a list scan.
    english_stopwords = set(stopwords.words('english'))
    words = no_punc_text.split()
    no_stp_words = " ".join([word for word in words if word not in english_stopwords])
    return no_stp_words

In [19]:
#Method to lemmatize the words in the data
lemmer = nltk.stem.WordNetLemmatizer()
def lem(words):
    """Lemmatize every whitespace-separated token as a verb and re-join with single spaces."""
    lemmatized_tokens = []
    for token in words.split():
        lemmatized_tokens.append(lemmer.lemmatize(token, 'v'))
    return " ".join(lemmatized_tokens)

In [20]:
#Method to perform a complete cleaning
def text_cleaner(raw):
    """Run the full cleaning chain: strip punctuation, drop stopwords, then lemmatize."""
    without_punct = punc_remover(raw)
    without_stopwords = stopword_remover(without_punct)
    return lem(without_stopwords)

In [21]:
#Testing the cleaner method on a sample sentence that mixes punctuation,
#special characters, stopwords and several inflected verb forms
text_cleaner("Hi!, this is a sample text to test the text cleaner method. Removes *@!#special characters%$^* and stopwords. And lemmatizes, go, going - run, ran, running")

'Hi sample text test text cleaner method Removes special character stopwords And lemmatizes go go run run run'

In [25]:
#Keeping only the first 1000 rows (positional slice) for the rest of the notebook.
#.copy() makes the subset an independent frame, so adding the CLEAN_STORY column
#later does not assign into a slice of the original (SettingWithCopyWarning).
train_data = train_data.iloc[:1000].copy()

In [26]:
#Confirming the size of the truncated training set as (rows, columns)
train_data.shape

(1000, 2)

In [27]:
#Applying the cleaner method to every STORY in the training subset selected above;
#the result is stored in a new CLEAN_STORY column
train_data['CLEAN_STORY'] = train_data['STORY'].apply(text_cleaner)

In [29]:
#Checking the new dataset
#print(train_data.values)

In [30]:
#Importing sklearn’s Countvectorizer
from sklearn.feature_extraction.text import CountVectorizer

#Learning the bag-of-words vocabulary from the cleaned training stories
bow_dictionary = CountVectorizer()
bow_dictionary.fit(train_data['CLEAN_STORY'])

#Number of distinct tokens in the learned vocabulary
len(bow_dictionary.vocabulary_)

12176

In [31]:
#Using the bow_dictionary to create count vectors for the cleaned data
#(one row per story, one column per vocabulary entry).
bow = bow_dictionary.transform(train_data['CLEAN_STORY'])

In [32]:
#Printing the shape of the bag of words model as (stories, vocabulary size)
print(bow.shape)

(1000, 12176)


In [33]:
#Importing TfidfTransformer from sklearn
from sklearn.feature_extraction.text import TfidfTransformer

#Learning the IDF weights from the bag-of-words counts
tfidf_transformer = TfidfTransformer()
tfidf_transformer.fit(bow)

#Re-weighting the same counts into TF-IDF vectors
storytfidf = tfidf_transformer.transform(bow)

In [34]:
#The TF-IDF matrix keeps the bag-of-words dimensions
storytfidf.shape

(1000, 12176)

In [35]:
#Creating a Multinomial Naive Bayes Classifier
from sklearn.naive_bayes import MultinomialNB

#Fitting the training data to the classifier (TF-IDF vectors as features, SECTION as labels).
#NOTE(review): the predictions written at the end of this notebook come from `pipe`,
#which refits the same steps; in the visible cells this standalone classifier is not used for them.
classifier = MultinomialNB().fit(storytfidf, train_data['SECTION'])

In [43]:
#Importing the test data (the cleaning itself is applied in a later cell)
test_data = pd.read_excel("Data_Test01.xlsx")

In [44]:
#Size of the full test set as (rows, columns)
test_data.shape

(2748, 1)

In [45]:
#Keeping only the first 700 test rows (positional slice), so predictions are produced
#for this subset only. .copy() makes it an independent frame, so adding the
#CLEAN_STORY column later does not assign into a slice of the original.
test_data = test_data.iloc[:700].copy()

In [46]:
#Applying the same cleaner used for the training stories to the test stories
test_data['CLEAN_STORY'] = test_data['STORY'].apply(text_cleaner)

#Printing the cleaned data
#print(test_data.values)

In [47]:
#Importing the Pipeline module from sklearn
from sklearn.pipeline import Pipeline

In [48]:
#Initializing the pipeline: raw counts -> TF-IDF weighting -> Naive Bayes classifier
pipe = Pipeline(
    steps=[
        ('bow', CountVectorizer()),
        ('tfidf', TfidfTransformer()),
        ('classifier', MultinomialNB()),
    ]
)

In [49]:
#Fitting the training data to the pipeline (cleaned stories as input, SECTION labels as targets)
pipe.fit(train_data['CLEAN_STORY'], train_data['SECTION'])

Pipeline(steps=[('bow', CountVectorizer()), ('tfidf', TfidfTransformer()),
                ('classifier', MultinomialNB())])

In [50]:
#Predicting the SECTION for each cleaned test story
test_preds_mnb = pipe.predict(test_data['CLEAN_STORY'])

In [52]:
#Writing the predictions to an excel sheet (a single SECTION column plus the default row index)
predictions_frame = pd.DataFrame(test_preds_mnb, columns=['SECTION'])
predictions_frame.to_excel("predictions.xlsx")