In [1]:
import pandas as pd
import numpy as np
import json

from sklearn.cross_validation import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.sparse import hstack

import matplotlib.pyplot as plt



In [2]:
json_filename = 'News_Category_Dataset.json'

In [3]:
raw_values = []

In [4]:
with open(json_filename) as jf:
    for line in jf.readlines():
        raw_values.append(json.loads(line))

In [5]:
n_df = pd.DataFrame(raw_values)

In [6]:
n_df.head()

Unnamed: 0,authors,category,date,headline,link,short_description
0,Melissa Jeltsen,CRIME,2018-05-26,There Were 2 Mass Shootings In Texas Last Week...,https://www.huffingtonpost.com/entry/texas-ama...,She left her husband. He killed their children...
1,Andy McDonald,ENTERTAINMENT,2018-05-26,Will Smith Joins Diplo And Nicky Jam For The 2...,https://www.huffingtonpost.com/entry/will-smit...,Of course it has a song.
2,Ron Dicker,ENTERTAINMENT,2018-05-26,Hugh Grant Marries For The First Time At Age 57,https://www.huffingtonpost.com/entry/hugh-gran...,The actor and his longtime girlfriend Anna Ebe...
3,Ron Dicker,ENTERTAINMENT,2018-05-26,Jim Carrey Blasts 'Castrato' Adam Schiff And D...,https://www.huffingtonpost.com/entry/jim-carre...,The actor gives Dems an ass-kicking for not fi...
4,Ron Dicker,ENTERTAINMENT,2018-05-26,Julianna Margulies Uses Donald Trump Poop Bags...,https://www.huffingtonpost.com/entry/julianna-...,"The ""Dietland"" actress said using the bags is ..."


In [7]:
n_df['headline+short_description']=n_df['headline']+n_df['short_description']

In [8]:
n_df.head(4)

Unnamed: 0,authors,category,date,headline,link,short_description,headline+short_description
0,Melissa Jeltsen,CRIME,2018-05-26,There Were 2 Mass Shootings In Texas Last Week...,https://www.huffingtonpost.com/entry/texas-ama...,She left her husband. He killed their children...,There Were 2 Mass Shootings In Texas Last Week...
1,Andy McDonald,ENTERTAINMENT,2018-05-26,Will Smith Joins Diplo And Nicky Jam For The 2...,https://www.huffingtonpost.com/entry/will-smit...,Of course it has a song.,Will Smith Joins Diplo And Nicky Jam For The 2...
2,Ron Dicker,ENTERTAINMENT,2018-05-26,Hugh Grant Marries For The First Time At Age 57,https://www.huffingtonpost.com/entry/hugh-gran...,The actor and his longtime girlfriend Anna Ebe...,Hugh Grant Marries For The First Time At Age 5...
3,Ron Dicker,ENTERTAINMENT,2018-05-26,Jim Carrey Blasts 'Castrato' Adam Schiff And D...,https://www.huffingtonpost.com/entry/jim-carre...,The actor gives Dems an ass-kicking for not fi...,Jim Carrey Blasts 'Castrato' Adam Schiff And D...


In [9]:
train_text, test_text, ytrain, ytest = train_test_split(
    n_df['headline+short_description'], n_df['category'], random_state=42)

In [10]:
%%time
word_vectorizer = TfidfVectorizer(
    sublinear_tf=True,
    strip_accents='unicode',
    analyzer='word',
    token_pattern=r'\w{1,}',
    ngram_range=(1, 10))
word_vectorizer.fit(train_text)

Wall time: 2min 38s


In [11]:
from sklearn.linear_model import SGDClassifier
sgd_cls = SGDClassifier(max_iter=2)
sgd_cls.fit(word_vectorizer.transform(train_text), ytrain)

SGDClassifier(alpha=0.0001, average=False, class_weight=None, epsilon=0.1,
       eta0=0.0, fit_intercept=True, l1_ratio=0.15,
       learning_rate='optimal', loss='hinge', max_iter=2, n_iter=None,
       n_jobs=1, penalty='l2', power_t=0.5, random_state=None,
       shuffle=True, tol=None, verbose=0, warm_start=False)

In [12]:
%%time
char_vectorizer = TfidfVectorizer(
    sublinear_tf=True,
    strip_accents='unicode',
    analyzer='char',
    ngram_range=(1, 5))
char_vectorizer.fit(train_text)

Wall time: 1min 45s


In [13]:
%%time
X = hstack([word_vectorizer.transform(train_text), char_vectorizer.transform(train_text)])

Wall time: 3min 20s


In [14]:
from sklearn.linear_model import SGDClassifier
sgd_cls = SGDClassifier(max_iter=2)
sgd_cls.fit(X, ytrain)

SGDClassifier(alpha=0.0001, average=False, class_weight=None, epsilon=0.1,
       eta0=0.0, fit_intercept=True, l1_ratio=0.15,
       learning_rate='optimal', loss='hinge', max_iter=2, n_iter=None,
       n_jobs=1, penalty='l2', power_t=0.5, random_state=None,
       shuffle=True, tol=None, verbose=0, warm_start=False)

In [15]:
#import pickle
#with open("sgd_classifier_model", "wb") as handle:
#    pickle.dump(sgd_cls, handle)

In [None]:
predict = sgd_cls.predict(
    hstack([word_vectorizer.transform(test_text), char_vectorizer.transform(test_text)]))

In [27]:
acc = np.mean(ytest == predict)

In [28]:
print('accuracy: {0:.3}'.format(acc))

accuracy: 0.58
