In [1]:
# import packages
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import seaborn as sns
import re
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score ,confusion_matrix


### Load the dataset

- Load the training data and, using everything you know, explore the statistical properties of the dataset.

In [2]:
# Code starts here

# Read the training headlines into a DataFrame.
# NOTE(review): this is an absolute, machine-specific path - confirm it exists
# before running the notebook anywhere else.
news = pd.read_csv(
    r"/Users/rahulkosamkar/Documents/Data_Science/Projects/NLP_assessment/train.csv"
)

# Count how many rows carry each CATEGORY label
dist = news["CATEGORY"].value_counts()

# Show the per-class counts first, then the first rows of the frame
print(dist)
print(news.head())

# Code ends here

e    122013
b     92679
t     86846
m     36397
Name: CATEGORY, dtype: int64
       Id                                              TITLE CATEGORY
0   50846         Ukraine to get $18 billion rescue from IMF        b
1  234375  McDonald's Abandons Headquarters to Avoid Prot...        b
2   63422  New study finds evidence that Autism begins in...        m
3  353942  Prime Minister Modi Says Meeting With Facebook...        t
4  311586  New robot guides at Tokyo museum almost outper...        t


### Visualize and Preprocess the data

- Retain only alphabetic characters (using regular expressions)
- Remove stopwords (using the nltk library)

In [5]:
# Make the NLTK 'stopwords' corpus available locally before it is read.
import nltk
# Downloads the 'stopwords' package; the call's return value is shown as the cell output.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/rahulkosamkar/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [6]:
# Code starts here

# English stopwords, held in a set so membership tests are cheap
stop = set(stopwords.words('english'))


def _clean_title(raw_title):
    """Normalise one headline for vectorisation.

    Every character outside a-z / A-Z is replaced by a space, the text is
    lower-cased and split on whitespace, tokens found in ``stop`` are
    dropped, and the surviving tokens are re-joined with single spaces.
    """
    letters_only = re.sub("[^a-zA-Z]", " ", raw_title)
    kept_tokens = [token for token in letters_only.lower().split() if token not in stop]
    return ' '.join(kept_tokens)


# Clean every headline in a single pass over the column
news['TITLE'] = news['TITLE'].apply(_clean_title)

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable
X_train, X_test, Y_train, Y_test = train_test_split(
    news["TITLE"],
    news["CATEGORY"],
    test_size=0.2,
    random_state=3,
)

# Code ends here

In [7]:
# Code starts here

# Bag-of-words counts (CountVectorizer with its default settings).
# The vocabulary is learned from the training split only and reused for the test split.
count_vectorizer = CountVectorizer()
X_train_count = count_vectorizer.fit_transform(X_train)
X_test_count = count_vectorizer.transform(X_test)

# TF-IDF weights over 1- to 3-grams, again fitted on the training split only.
tfidf_vectorizer = TfidfVectorizer(ngram_range=(1, 3))
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

# Code ends here

In [8]:
# Row counts of the raw text and label splits: training pair first, held-out pair second
print(
    f"{X_train.shape} {Y_train.shape}",
    f"{X_test.shape} {Y_test.shape}",
    sep="\n",
)

(270348,) (270348,)
(67587,) (67587,)


In [9]:
# Feature-matrix shapes next to their label counts.
# Note the two lines show different feature sets: the first is the count matrix
# of the training split, the second is the TF-IDF matrix of the held-out split.
print(
    f"{X_train_count.shape} {Y_train.shape}",
    f"{X_test_tfidf.shape} {Y_test.shape}",
    sep="\n",
)

(270348, 42616) (270348,)
(67587, 1665439) (67587,)


### Model building

- Now for the actual task: using any classifier, predict the `CATEGORY`. Use the different techniques you have learned to improve the performance of the model.
- Try improving upon the `accuracy_score` ([Accuracy Score](https://scikit-learn.org/stable/modules/generated/sklearn.metrics.accuracy_score.html))

In [10]:
# Code starts here

# One multinomial naive Bayes model per feature representation
nb_1 = MultinomialNB()
nb_2 = MultinomialNB()

# nb_1 learns from the count features, nb_2 from the TF-IDF features
nb_1.fit(X_train_count, Y_train)
nb_2.fit(X_train_tfidf, Y_train)

# accuracy_score is declared as accuracy_score(y_true, y_pred), so the true
# labels go first. Accuracy is symmetric, so the printed values are the same
# as with the arguments swapped, but the documented order stays correct if
# this pattern is reused with metrics where the order matters.
acc_count_nb = accuracy_score(Y_test, nb_1.predict(X_test_count))
acc_tfidf_nb = accuracy_score(Y_test, nb_2.predict(X_test_tfidf))

# display accuracies (count features first, TF-IDF second)
print(acc_count_nb, acc_tfidf_nb)

# Code ends here

0.9272641188394218 0.9295870507642002


In [11]:
import warnings

# One-vs-rest logistic regression, one model per feature representation.
# The fixed random_state keeps the two estimators configured identically.
logreg_1 = OneVsRestClassifier(LogisticRegression(random_state=10))
logreg_2 = OneVsRestClassifier(LogisticRegression(random_state=10))

# Silence warnings only while these models are fitted and scored.
# catch_warnings() restores the previous filters on exit, so warnings raised
# by later cells are no longer hidden for the rest of the kernel session.
with warnings.catch_warnings():
    warnings.simplefilter('ignore')

    # logreg_1 learns from the count features, logreg_2 from the TF-IDF features
    logreg_1.fit(X_train_count, Y_train)
    logreg_2.fit(X_train_tfidf, Y_train)

    # accuracy_score is declared as accuracy_score(y_true, y_pred): true labels first.
    # Accuracy is symmetric, so the values match the swapped-argument form.
    acc_count_logreg = accuracy_score(Y_test, logreg_1.predict(X_test_count))
    acc_tfidf_logreg = accuracy_score(Y_test, logreg_2.predict(X_test_tfidf))

# display accuracies (count features first, TF-IDF second)
print(acc_count_logreg, acc_tfidf_logreg)

# Code ends here

0.9461434891325254 0.9420598635832335


In [12]:
# Show the first rows of the held-out labels.
print(Y_test.head())

83168     m
10859     e
303271    e
127183    e
94932     e
Name: CATEGORY, dtype: object


In [13]:
# Sanity check: compare the shape of the TF-IDF model's predictions with the shape of the held-out labels.
logreg_2.predict(X_test_tfidf).shape, Y_test.shape

((67587,), (67587,))

### Prediction on the test data and creating the sample submission file.

- Load the test data and store the `Id` column in a separate variable.
- Perform the same operations on the test data that you have performed on the train data.
- Create the submission file as a `csv` file consisting of the `Id` column from the test data and your prediction as the second column.

In [14]:
# Code Starts here
# Prediction on test data

# Load the unlabeled test file and keep its ids for the submission.
# NOTE(review): absolute, machine-specific path - confirm before running elsewhere.
test = pd.read_csv(r'/Users/rahulkosamkar/Documents/Data_Science/Projects/NLP_assessment/test.csv')
id_ = test['Id']

# Apply the same cleaning as the training titles, in one pass per headline:
# non-letters become spaces, text is lower-cased and tokenised, stopwords are
# dropped, and the remaining tokens are joined back with single spaces.
test['TITLE'] = test['TITLE'].apply(
    lambda title: ' '.join(
        word
        for word in re.sub("[^a-zA-Z]", " ", title).lower().split()
        if word not in stop
    )
)

# Reuse the vectorizers fitted on the training split (transform only, no refit)
test_count = count_vectorizer.transform(test['TITLE'])
test_tfidf = tfidf_vectorizer.transform(test['TITLE'])  # not used by the prediction below
print(test_count.shape)

# Predict with the logistic regression trained on count features;
# flatten() returns the predictions as a 1-D array
y_pred_test = logreg_1.predict(test_count).flatten()

# Pair every id with its predicted category
sample_submission = pd.DataFrame({'Id': id_, 'CATEGORY': y_pred_test})
print(sample_submission.head())

# Write the submission without the DataFrame index column
sample_submission.to_csv('sample_submission.csv', index=False)

# Code ends here

(84484, 42616)
       Id CATEGORY
0   86998        m
1  112926        t
2  280943        m
3   37154        m
4  152800        t
