In [None]:
# import required libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')
from collections import Counter
import re
# Show full cell text in DataFrame output. None means "no truncation"; the legacy
# -1 sentinel was deprecated in pandas 1.0 and is not accepted by later releases.
pd.set_option('display.max_colwidth', None)
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer

In [None]:
# Load the dataset; lines=True tells pandas the file holds one JSON record per line
df = pd.read_json(path_or_buf='News_Category_Dataset_v2.json', lines=True)
df.head(n=5)

In [None]:
# Summary statistics, transposed so each described column becomes a row
df.describe().transpose()

 The dataset timeline starts at 28-01-2012 and ends at 26-05-2018

In [None]:
# Remove rows that exactly repeat an earlier row (the first occurrence is kept)
df.drop_duplicates(keep='first', inplace=True)

In [None]:
# Number of missing values in each column
df.isnull().sum()

In [None]:
# Work on an independent copy so `df` is left as loaded and de-duplicated
df1 = df.copy(deep=True)
df1.head(n=5)

In [None]:
# Build one text field per article from its headline and short description.
df1['text'] = df1['headline'] + ' ' + df1['short_description']

# Drop articles with no usable text. The previous filter joined its two tests with
# `|`, which is true for every string, so no row was ever removed. Comparing the
# stripped text with '' covers both the '' and the ' ' case.
# .copy() gives an independent frame so later column assignments are unambiguous.
df1 = df1[df1['text'].str.strip() != ''].copy()

In [None]:
# Remove punctuation: drop every character that is neither a word character nor whitespace.
# regex=True is explicit because pandas >= 2.0 treats the pattern as a literal string by
# default, which would leave the text unchanged. The raw string avoids the
# invalid-escape warning for `\w` / `\s`.
df1['text'] = df1['text'].str.replace(r'[^\w\s]', '', regex=True)

In [None]:
# Lower-case the text of every article
df1.text = df1.text.map(lambda entry: entry.lower())

In [None]:
# Merge overlapping / near-duplicate categories into a smaller label set.
# One mapping applied with Series.replace takes the place of the chained assignments
# (`df1.category[mask] = ...`), which raise SettingWithCopyWarning and do not modify
# the frame when pandas copy-on-write is enabled.
# No target label is also a source label, so a single pass gives the same result
# as the original sequence of assignments.
CATEGORY_MERGES = {
    'THE WORLDPOST': 'WORLDPOST',
    'WORLD NEWS': 'WORLDPOST',
    'GREEN': 'ENVIRONMENT',
    'CULTURE & ARTS': 'ARTS',
    'ARTS & CULTURE': 'ARTS',
    'COMEDY': 'ENTERTAINMENT',
    'BLACK VOICES': 'VOICES',
    'LATINO VOICES': 'VOICES',
    'QUEER VOICES': 'VOICES',
    'STYLE': 'STYLE & BEAUTY',
    'COLLEGE': 'EDUCATION',
    'SCIENCE': 'TECH',
    'WEDDINGS': 'GOOD NEWS',
    'TASTE': 'FOOD & DRINK',
    'PARENTING': 'PARENTS',
    'FIFTY': 'PARENTS',
}
df1['category'] = df1['category'].replace(CATEGORY_MERGES)


In [None]:
# Preview the frame after text cleaning and category merging
df1.head(n=5)

## EDA

In [None]:
# Distribution of categories in the dataset, most frequent first.
# The column is passed as `x=` because current seaborn releases no longer accept the
# data vector as a positional argument (the first positional slot is `data`).
plt.figure(figsize=(16, 8))
sns.countplot(x=df1['category'], order=df1['category'].value_counts().index, color='c')
plt.xticks(rotation=90)
plt.xlabel('Category', fontsize=12)
plt.ylabel('Count', fontsize=12)
plt.title('Distribution of Categories', fontsize=15)
plt.show()

In [None]:
# Treemap of the most frequent categories (overall article counts, not per month)
import squarify
import matplotlib

TOP_N = 20          # number of categories drawn in the treemap
MEDIUM_SIZE = 12.5
BIGGER_SIZE = 23

# These rc settings are global: they also affect figures drawn by later cells.
plt.rc('font', size=MEDIUM_SIZE)
plt.rc('figure', titlesize=BIGGER_SIZE)

# value_counts() sorts descending, so head(TOP_N) is the TOP_N largest categories.
category_counts = df1['category'].value_counts()
top_counts = category_counts.head(TOP_N)

# Scale colours between the smallest and largest count over all categories,
# then build exactly one colour and one label per plotted rectangle.
norm = matplotlib.colors.Normalize(vmin=category_counts.min(), vmax=category_counts.max())
colors = [matplotlib.cm.Blues(norm(value)) for value in top_counts.values]
labels = [f'{name} \n {count}' for name, count in top_counts.items()]

plt.figure(figsize=(12, 8))
squarify.plot(sizes=top_counts.values, label=labels, alpha=0.7, color=colors)
plt.axis('off')
plt.title("News Category TreeMap")
plt.tight_layout()
plt.show()



In [None]:
# Concatenate the text of every article into one space-separated string
all_words = ' '.join(df1['text'])

In [None]:
from wordcloud import WordCloud

# Generate the cloud from the combined text first, then draw it on a 12x10 figure
wordcloud = WordCloud(
    background_color='white',
    max_font_size=110,
    random_state=21,
    width=800,
    height=500,
).generate(all_words)

plt.figure(figsize=(12, 10))
plt.imshow(wordcloud, interpolation="bilinear")
plt.axis('off')
plt.show()

In [None]:
from sklearn.feature_extraction.text import CountVectorizer, ENGLISH_STOP_WORDS, TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.metrics import confusion_matrix, classification_report, cohen_kappa_score

### Count Vectorizer

In [None]:
# Bag-of-words counts with English stop words removed and min_df=5
# NOTE(review): the vocabulary is fitted on every row, including rows that the
# later train/test split holds out for testing.
vect = CountVectorizer(stop_words='english', min_df=5)
X = vect.fit_transform(raw_documents=df1['text'])

In [None]:
# Encode the category names as integer class ids
le = LabelEncoder()
y = le.fit_transform(y=df1['category'])

In [None]:
# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    random_state=12,
    test_size=0.2,
)

### Naive Bayes

In [None]:
# Fit a multinomial Naive Bayes classifier on the training split (fit returns the estimator)
nb = MultinomialNB().fit(X_train, y_train)
nb

In [None]:
# Predict the held-out rows, then report the classifier's score on both splits
y_pred = nb.predict(X_test)
print(
    f'train score: {nb.score(X_train, y_train):.4f}',
    f'test score: {nb.score(X_test, y_test):.4f}',
    sep='\n',
)

In [None]:
# Cohen's kappa between the true test labels and the Naive Bayes predictions
print('KAPPA SCORE: ', cohen_kappa_score(y1=y_test, y2=y_pred))

### Predicting the category of a new headline

In [None]:
def predict_cat(title, model=None, vectorizer=None, encoder=None, stop_words=None):
    """Predict the news category of a single headline.

    The headline is lower-cased and stripped of punctuation (mirroring the
    cleaning applied to the training text), stop words are dropped, and the
    result is vectorised and classified.

    Parameters
    ----------
    title : str
        Headline text to classify.
    model, vectorizer, encoder : optional
        Fitted classifier, fitted text vectoriser and fitted label encoder.
        When omitted they default to the notebook-level ``nb``, ``vect`` and
        ``le`` respectively.
    stop_words : iterable of str, optional
        Words to drop before vectorising. Defaults to NLTK's English list.

    Returns
    -------
    str or None
        The predicted category name, or ``None`` (after printing a message)
        when ``title`` is empty or contains only whitespace.
    """
    if not title or not title.strip():
        print('text cannot be blank')
        return None

    # Resolve defaults lazily so the notebook globals are only touched when needed.
    model = nb if model is None else model
    vectorizer = vect if vectorizer is None else vectorizer
    encoder = le if encoder is None else encoder
    # A set gives constant-time membership tests for the filter below.
    stop = set(stopwords.words('english') if stop_words is None else stop_words)

    # str.replace() is a literal replacement, so the old call never removed
    # punctuation; re.sub applies the pattern as a real regular expression.
    cleaned = re.sub(r'[^\w\s]', '', title.lower())
    cleaned = ' '.join(word for word in cleaned.split() if word not in stop)

    codes = model.predict(vectorizer.transform([cleaned]))
    return encoder.inverse_transform(codes)[0]


In [None]:
# Try the helper on a sample headline
predict_cat(title="India’s largest ever ‘eye in the sky’ will take on its neighbours")

### SVM

In [None]:
%%time
svc = SVC(kernel='linear')
svc.fit(X_train, y_train)

In [None]:
# Predict the held-out rows with the SVM, then report its score on both splits
y_pred = svc.predict(X_test)
print(
    f'train score: {svc.score(X_train, y_train):.4f}',
    f'test score: {svc.score(X_test, y_test):.4f}',
    sep='\n',
)

### Logistic Regression

In [None]:
# Fit logistic regression with the estimator's default settings (fit returns the estimator)
logreg = LogisticRegression().fit(X_train, y_train)
logreg


In [None]:
# Predict the held-out rows with logistic regression, then report its score on both splits
y_pred_logreg = logreg.predict(X_test)
print(
    f'train score: {logreg.score(X_train, y_train):.4f}',
    f'test score: {logreg.score(X_test, y_test):.4f}',
    sep='\n',
)

In [None]:
# Cohen's kappa between the true test labels and the logistic-regression predictions
print('KAPPA SCORE: ', cohen_kappa_score(y1=y_test, y2=y_pred_logreg))

### TF-IDF Vectorizer

In [None]:
# TF-IDF features built with the vectoriser's default settings
# NOTE(review): as with the count features, the vectoriser is fitted on all rows
# before the train/test split.
tfidf = TfidfVectorizer()
X = tfidf.fit_transform(raw_documents=df1['text'])

In [None]:
# Re-create the label encoder and the integer class ids for the TF-IDF experiments
le = LabelEncoder()
y = le.fit(df1['category']).transform(df1['category'])

In [None]:
# Hold out 20% of the TF-IDF rows for testing; the fixed seed makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    random_state=15,
    test_size=0.2,
)

### Naive Bayes with TF-IDF Vectorizer

In [None]:
# Fit a multinomial Naive Bayes classifier on the TF-IDF training split
nb_tfidf = MultinomialNB().fit(X_train, y_train)
nb_tfidf

In [None]:
%%time
svc = SVC(kernel='linear')
svc.fit(X_train, y_train)y_pred = svc.predict(X_test)
print(f'train score: {svc.score(X_train, y_train):.4f}')
print(f'test score: {svc.score(X_test, y_test):.4f}')y_pred_tfidf = nb_tfidf.predict(X_test)
nb_tfidf.score(X_test, y_test)

In [None]:
# Cohen's kappa between the true test labels and the TF-IDF Naive Bayes predictions
print('KAPPA SCORE: ', cohen_kappa_score(y1=y_test, y2=y_pred_tfidf))