In [1]:
import pandas as pd
import nltk
from nltk.corpus import stopwords
import string
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/preranasingh/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/preranasingh/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


True

In [2]:
#Loading the labelled training stories from the Excel workbook
train_file = "/Users/preranasingh/Desktop/Participants_Data_News_category/Data_Train.xlsx"
train_data = pd.read_excel(train_file)

In [3]:
#Previewing the first five rows (five is the default for head)
train_data.head()

Unnamed: 0,STORY,SECTION
0,But the most painful was the huge reversal in ...,3
1,How formidable is the opposition alliance amon...,0
2,Most Asian currencies were trading lower today...,3
3,"If you want to answer any question, click on ‘...",1
4,"In global markets, gold prices edged up today ...",3


In [4]:
#Summarising the columns, their dtypes and non-null counts of the training data
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7628 entries, 0 to 7627
Data columns (total 2 columns):
STORY      7628 non-null object
SECTION    7628 non-null int64
dtypes: int64(1), object(1)
memory usage: 119.3+ KB


In [5]:
#Describing the stories of each SECTION (count, unique, most frequent story and its frequency)
stories_by_section = train_data.groupby("SECTION")
stories_by_section.describe()

Unnamed: 0_level_0,STORY,STORY,STORY,STORY
Unnamed: 0_level_1,count,unique,top,freq
SECTION,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
0,1686,1673,This story has been published from a wire agen...,4
1,2772,2731,This story has been published from a wire agen...,13
2,1924,1914,"The consensus reads, “Exciting, entertaining, ...",3
3,1246,1233,This story has been published from a wire agen...,11


In [6]:
#Dropping rows that are exact duplicates of an earlier row (same STORY and same SECTION).
#The frame is modified in place; it goes from 7628 rows to the 7551 rows vectorized later on.
train_data.drop_duplicates(inplace = True)

In [7]:
#A punctuation string for reference: ASCII punctuation plus the typographic quotes used in the dataset.
#The opening double quote “ is listed explicitly; previously only the closing ” was present,
#so opening quotes were left in the cleaned text. Comma, colon and brackets are already
#covered by string.punctuation and do not need to be repeated.
all_punctuations = string.punctuation + '‘’“”'

In [8]:
#Method to remove punctuation marks from the data
def punc_remover(raw_text):
    """Return raw_text with every character found in all_punctuations dropped.

    Characters are deleted, not replaced by a space, so the text on either
    side of a removed character ends up joined together.
    """
    kept_chars = []
    for char in raw_text:
        if char in all_punctuations:
            continue
        kept_chars.append(char)
    return "".join(kept_chars)

In [9]:
#Cache for the stopword set so the NLTK corpus is only consulted once per kernel session
_stopword_cache = {}

def _english_stopwords():
    """Return the English stopwords as a frozenset, loading them from NLTK on first use only."""
    if 'english' not in _stopword_cache:
        _stopword_cache['english'] = frozenset(stopwords.words('english'))
    return _stopword_cache['english']

#Method to remove stopwords from the data
def stopword_remover(no_punc_text):
    """Return no_punc_text with English stopwords removed.

    The text is split on whitespace and the surviving words are re-joined
    with single spaces.

    The stopword list used to be rebuilt and scanned linearly for every
    single word; it is now loaded once and held in a set, which gives the
    same result with constant-time lookups.

    NOTE: matching is case-sensitive, exactly as before, so a capitalised
    stopword such as 'And' is kept.
    """
    stop_words = _english_stopwords()
    kept_words = [word for word in no_punc_text.split() if word not in stop_words]
    return " ".join(kept_words)

In [10]:
#Method to lemmatize the words in the data
lemmer = nltk.stem.WordNetLemmatizer()
def lem(words):
    """Lemmatize every whitespace-separated word of `words` with the 'v' (verb) tag.

    Returns the lemmas joined back together with single spaces.
    """
    lemmas = []
    for token in words.split():
        lemmas.append(lemmer.lemmatize(token, 'v'))
    return " ".join(lemmas)

#Method to perform a complete cleaning
def text_cleaner(raw):
    """Clean one raw text: strip punctuation, drop stopwords, then lemmatize.

    The three steps run in that order and the lemmatized string is returned.
    """
    without_punctuation = punc_remover(raw)
    without_stopwords = stopword_remover(without_punctuation)
    return lem(without_stopwords)

In [11]:
#Trying the cleaner on a sample sentence containing punctuation, stopwords and inflected verbs
sample_text = "Hi!, this is a sample text to test the text cleaner method. Removes *@!#special characters%$^* and stopwords. And lemmatizes, go, going - run, ran, running"
text_cleaner(sample_text)

'Hi sample text test text cleaner method Removes special character stopwords And lemmatizes go go run run run'

In [12]:
#Cleaning every training story and storing the result in a new CLEAN_STORY column
cleaned_train_stories = train_data['STORY'].apply(text_cleaner)
train_data['CLEAN_STORY'] = cleaned_train_stories

In [13]:
#Printing the training data as a raw array (story, section, cleaned story per row)
train_values = train_data.values
print(train_values)

[['But the most painful was the huge reversal in fee income, unheard of among private sector lenders. Essentially, it means that Yes Bank took it for granted that fees on structured loan deals will be paid and accounted for upfront on its books. As borrowers turned defaulters, the fees tied to these loan deals fell off the cracks. Gill has now vowed to shift to a safer accounting practice of amortizing fee income rather than booking these upfront.\n\n\nGill’s move to mend past ways means that there will be no nasty surprises in the future. This is good news considering that investors love a clean image and loathe uncertainties.\n\n\nBut there is no gain without pain and the promise of a strong and stable balance sheet comes with some sacrifices as well. Investors will have to give up the hopes of phenomenal growth, a promise made by Kapoor.'
  3
  'But painful huge reversal fee income unheard among private sector lenders Essentially mean Yes Bank take grant fee structure loan deal pay 

In [14]:
#Importing sklearn’s CountVectorizer
from sklearn.feature_extraction.text import CountVectorizer

In [15]:
#Learning the bag-of-words vocabulary from the cleaned training stories
stories_for_vocabulary = train_data['CLEAN_STORY']
bow_dictionary = CountVectorizer().fit(stories_for_vocabulary)

In [19]:
#Number of distinct words learnt by the bow_dictionary
vocabulary_size = len(bow_dictionary.vocabulary_)
print(vocabulary_size)

#Turning each cleaned story into a vector of word counts
stories_to_count = train_data['CLEAN_STORY']
bow = bow_dictionary.transform(stories_to_count)

#Shape of the resulting matrix: (number of stories, vocabulary size)
bow_shape = bow.shape
print(bow_shape)

35189
(7551, 35189)


In [20]:
#Importing TfidfTransformer from sklearn
from sklearn.feature_extraction.text import TfidfTransformer

In [22]:
#Fitting the TF-IDF transformer on the bag of words counts
tfidf_transformer = TfidfTransformer()
tfidf_transformer.fit(bow)

#Re-weighting the same counts into TF-IDF vectors
storytfidf = tfidf_transformer.transform(bow)

In [23]:
#Importing the Multinomial Naive Bayes classifier from sklearn
from sklearn.naive_bayes import MultinomialNB

#Training a Multinomial Naive Bayes classifier on the TF-IDF vectors and their SECTION labels
section_labels = train_data['SECTION']
classifier = MultinomialNB().fit(storytfidf, section_labels)

In [24]:
#Loading the test stories and cleaning them with the same cleaner used for training
test_file = "/Users/preranasingh/Desktop/Participants_Data_News_category/Data_Test.xlsx"
test_data = pd.read_excel(test_file)
cleaned_test_stories = test_data['STORY'].apply(text_cleaner)
test_data['CLEAN_STORY'] = cleaned_test_stories



In [25]:
#Printing the test data (raw and cleaned stories) as a raw array
test_values = test_data.values
print(test_values)

[['2019 will see gadgets like gaming smartphones and wearable medical devices lifting the user experience to a whole new level\n\n\nmint-india-wire consumer technologyconsumer technology trends in New Yeartech gadgetsFoldable phonesgaming smartphoneswearable medical devicestechnology\n\n\nNew Delhi: Gadgets have become an integral part of our lives with most of us relying on some form of factor to communicate, commute, work, be informed or entertained. Year 2019 will see some gadgets lifting the user experience to a whole new level. Here’s what we can expect to see:\n\n\nSmartphones with foldable screens: Foldable phones are finally moving from the concept stage to commercial launches. They are made up of organic light-emitting diode (OLED) panels with higher plastic substrates, allowing them to be bent without damage.\n\n\nUS-based display maker Royole Corp’s foldable phone, FlexPai, has already arrived in select markets, while Samsung’s unnamed foldable phone is expected sometime nex

In [26]:
#Importing the Pipeline class from sklearn
from sklearn.pipeline import Pipeline

#Chaining count vectorization, TF-IDF weighting and the classifier into a single pipeline
pipeline_steps = [
    ('bow', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('classifier', MultinomialNB()),
]
pipe = Pipeline(pipeline_steps)

In [27]:
#Training every pipeline step on the cleaned stories and their SECTION labels
pipeline_inputs = train_data['CLEAN_STORY']
pipeline_targets = train_data['SECTION']
pipe.fit(pipeline_inputs, pipeline_targets)

Pipeline(memory=None,
     steps=[('bow', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip_...f=False, use_idf=True)), ('classifier', MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))])

In [28]:
#Predicting the SECTION of every cleaned test story with the fitted pipeline
stories_to_predict = test_data['CLEAN_STORY']
test_preds_mnb = pipe.predict(stories_to_predict)

In [29]:
#Writing the predictions to an excel sheet with a single SECTION column
#NOTE(review): to_excel also writes the row index as the first column by default - confirm the expected submission format
predictions = pd.DataFrame(test_preds_mnb, columns = ['SECTION'])
predictions.to_excel("/Users/preranasingh/Desktop/Participants_Data_News_category/predictions.xlsx")