# Predict The News Category

## Exploratory Data Analysis

### Import Libraries

In [18]:
import numpy as np
import pandas as pd
import seaborn as sns
import nltk
from nltk.corpus import stopwords
import string
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to /home/rohit/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /home/rohit/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


True

### Import Datasets

In [9]:
train = pd.read_excel('Data_Train.xlsx')
test = pd.read_excel('Data_Test.xlsx')

In [10]:
train.head()

Unnamed: 0,STORY,SECTION
0,But the most painful was the huge reversal in ...,3
1,How formidable is the opposition alliance amon...,0
2,Most Asian currencies were trading lower today...,3
3,"If you want to answer any question, click on ‘...",1
4,"In global markets, gold prices edged up today ...",3


In [19]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7628 entries, 0 to 7627
Data columns (total 2 columns):
STORY      7628 non-null object
SECTION    7628 non-null int64
dtypes: int64(1), object(1)
memory usage: 119.3+ KB


In [20]:
train.shape

(7628, 2)

In [21]:
train.groupby("SECTION").describe()

Unnamed: 0_level_0,STORY,STORY,STORY,STORY
Unnamed: 0_level_1,count,unique,top,freq
SECTION,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
0,1686,1673,This story has been published from a wire agen...,4
1,2772,2731,This story has been published from a wire agen...,13
2,1924,1914,"The consensus reads, “Exciting, entertaining, ...",3
3,1246,1233,This story has been published from a wire agen...,11


## Data Cleaning

In [22]:
# Removing duplicates to avoid overfitting
train.drop_duplicates(inplace= True)

In [23]:
# A punctuation strings for refrence (added other valid characters from the dataset)
all_punctuations = string.punctuation + '‘’,:”][],'

In [24]:
# Method to remove punctuation marks from the data
def punc_remover(raw_text):
    no_punct = "".join([i for i in raw_text if i not in all_punctuations])
    return no_punct

In [25]:
# Method to remove stopwords from the data
def stopword_remover(no_punc_text):
    words = no_punc_text.split()
    no_stp_words = " ".join([i for i in words if i not in stopwords.words('english')])
    return no_stp_words

In [30]:
# Method to lemmatize the words in the data
lemmer = nltk.stem.WordNetLemmatizer()
def lem(words):
    return " ".join([lemmer.lemmatize(word, 'v') for word in words.split()])

In [27]:
# Method to perform a complete cleaning
def text_cleaner(raw):
    cleaned_text = stopword_remover(punc_remover(raw))
    return lem(cleaned_text)

In [31]:
# Testing the cleaner method
text_cleaner("Hi!, this is a sample text to test the text cleaner method. Removes *@!#special charactes%$^* and stopwords. And lemmatizes, go, going - run, ran, running")

'Hi sample text test text cleaner method Removes special charactes stopwords And lemmatizes go go run run run'

In [32]:
# Applying the cleaner method to the entire data
train['CLEAN_STORY'] = train['STORY'].apply(text_cleaner)

## Data Preprocessing : Count Vectors and TF-IDF Vectors

### Creating Count Vectors

In [36]:
# Importing CountVectorizer library from sklearn
from sklearn.feature_extraction.text import CountVectorizer

In [38]:
# creating a bag-of-words dictionary of words from the data
bow_dictionary = CountVectorizer().fit(train['CLEAN_STORY'])

In [40]:
# Total number of words in the bow_dictionary
len(bow_dictionary.vocabulary_)

35189

In [41]:
# Using the bow_dictionary to create count vectors for the cleaned data
bow = bow_dictionary.transform(train['CLEAN_STORY'])

In [42]:
print(bow.shape)

(7551, 35189)


### creating TF-IDF vectors

In [43]:
# Importing TfidfTransformer library from sklearn
from sklearn.feature_extraction.text import TfidfTransformer

In [44]:
# Fitting the bag of words data to the TF-IDF transformer
tfidf_transformer = TfidfTransformer().fit(bow)

In [45]:
# Transforming the bag of words model to TF-IDF vectors
storytfidf = tfidf_transformer.transform(bow)

### Training The Classifier

In [46]:
# Creating a Multinomial Naive Bayes classifier
from sklearn.naive_bayes import MultinomialNB

In [47]:
# Fitting the training data to the classifier
model = MultinomialNB().fit(storytfidf, train['SECTION'])

In [48]:
# Applying the cleaner method to the entire test data
test['CLEAN_STORY'] = test['STORY'].apply(text_cleaner)

### Creating A Pipeline To Pre-Process the Data & Initialise the model

In [50]:
from sklearn.pipeline import Pipeline

In [51]:
# Initializing the pipeline with necessary transformations and the required model
pipe = Pipeline([('bow', CountVectorizer()),
                ('tfidf', TfidfTransformer()),
                ('model',MultinomialNB())])

In [52]:
# Fitting the training data to the pipeline
pipe.fit(train['CLEAN_STORY'], train['SECTION'])

Pipeline(memory=None,
         steps=[('bow',
                 CountVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.int64'>, encoding='utf-8',
                                 input='content', lowercase=True, max_df=1.0,
                                 max_features=None, min_df=1,
                                 ngram_range=(1, 1), preprocessor=None,
                                 stop_words=None, strip_accents=None,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, vocabulary=None)),
                ('tfidf',
                 TfidfTransformer(norm='l2', smooth_idf=True,
                                  sublinear_tf=False, use_idf=True)),
                ('model',
                 MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))],
         verbose=False)

In [53]:
# Predicting the SECTION
test_pred_mnb = pipe.predict(test['CLEAN_STORY'])

In [56]:
test_pred_mnb

array([1, 2, 1, ..., 1, 0, 1])

## Save your solution

In [54]:
# Writting the predictions to an excel sheet
pd.DataFrame(test_pred_mnb, columns = ['SECTION']).to_excel("Predictions.xlsx")

In [55]:
sol = pd.read_excel('Predictions.xlsx')
sol.head()

Unnamed: 0.1,Unnamed: 0,SECTION
0,0,1
1,1,2
2,2,1
3,3,1
4,4,1
