<a href="https://colab.research.google.com/github/ArmandDS/news_category/blob/master/News_Analysis_AO.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Load the Data 

ModuleNotFoundError: No module named 'google.colab'

In [1]:
!

#### Load libraries we will need 

In [1]:
import pandas as pd
import numpy as np
from zipfile import ZipFile
import re
from nltk.stem.snowball import SnowballStemmer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer, TfidfTransformer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, confusion_matrix, roc_auc_score
import nltk
from nltk.stem import WordNetLemmatizer
from nltk import word_tokenize
from nltk.corpus import stopwords
import string
from sklearn.linear_model import SGDClassifier
import logging
from textblob import TextBlob
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
import pickle

In [2]:
# Read the dataset as JSON Lines (lines=True: one JSON record per line).
# The path is relative, so the file must sit in the notebook's working directory.
df_news = pd.read_json("News_Category_Dataset_v2.json", lines = True)
df_news.columns

Index(['category', 'headline', 'authors', 'link', 'short_description', 'date'], dtype='object')

In [3]:
df_news.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200853 entries, 0 to 200852
Data columns (total 6 columns):
category             200853 non-null object
headline             200853 non-null object
authors              200853 non-null object
link                 200853 non-null object
short_description    200853 non-null object
date                 200853 non-null datetime64[ns]
dtypes: datetime64[ns](1), object(5)
memory usage: 9.2+ MB


In [4]:
df_news.head()

Unnamed: 0,category,headline,authors,link,short_description,date
0,CRIME,There Were 2 Mass Shootings In Texas Last Week...,Melissa Jeltsen,https://www.huffingtonpost.com/entry/texas-ama...,She left her husband. He killed their children...,2018-05-26
1,ENTERTAINMENT,Will Smith Joins Diplo And Nicky Jam For The 2...,Andy McDonald,https://www.huffingtonpost.com/entry/will-smit...,Of course it has a song.,2018-05-26
2,ENTERTAINMENT,Hugh Grant Marries For The First Time At Age 57,Ron Dicker,https://www.huffingtonpost.com/entry/hugh-gran...,The actor and his longtime girlfriend Anna Ebe...,2018-05-26
3,ENTERTAINMENT,Jim Carrey Blasts 'Castrato' Adam Schiff And D...,Ron Dicker,https://www.huffingtonpost.com/entry/jim-carre...,The actor gives Dems an ass-kicking for not fi...,2018-05-26
4,ENTERTAINMENT,Julianna Margulies Uses Donald Trump Poop Bags...,Ron Dicker,https://www.huffingtonpost.com/entry/julianna-...,"The ""Dietland"" actress said using the bags is ...",2018-05-26


Category Distribution

In [5]:
len(df_news['category'].unique())

41

There are 41 categories in the dataset

In [6]:
df_news['category'].value_counts().plot( kind='bar', figsize=(15,10))

<matplotlib.axes._subplots.AxesSubplot at 0x27edb6b7d08>

#### Data pre-processing

load the nltk utilities

In [7]:
import nltk
# Fetch the NLTK data packages used by the pre-processing code.
# 'stopwords' backs stopwords.words('english'); the others are presumably needed by
# word_tokenize / WordNetLemmatizer — TODO confirm 'averaged_perceptron_tagger' is required.
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Pranav\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Pranav\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Pranav\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\Pranav\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

#### Functions to clean the text: tokenize, lemmatize, and remove stop words, digits and non-alphanumeric characters

In [8]:
# English stop words from NLTK, stored as a set; black_txt rejects any token found here.
stop_words_ = set(stopwords.words('english'))
# Shared lemmatizer instance; clean_txt calls it with pos="v".
wn = WordNetLemmatizer()

# Extra tokens that black_txt rejects in addition to the NLTK stop words.
my_sw = ['make', 'amp',  'news','new' ,'time', 'u','s', 'photos',  'get', 'say']
# Built once: the original rebuilt list(string.punctuation) for every single token.
_PUNCTUATION = frozenset(string.punctuation)


def black_txt(token):
    """Return True when ``token`` should be kept as a feature word.

    A token is kept only if it is longer than two characters and is not an
    NLTK English stop word, a punctuation character, or one of the extra
    words listed in ``my_sw``.
    """
    return (
        len(token) > 2  # cheapest test first
        and token not in stop_words_
        and token not in _PUNCTUATION
        and token not in my_sw
    )
  
  
def clean_txt(text):
    """Normalise ``text`` into a space-separated string of filtered verb lemmas.

    Steps: strip apostrophes, collapse every run of digits / non-word
    characters into one space, lower-case, tokenize, drop tokens rejected by
    ``black_txt``, lemmatize the survivors as verbs, then filter once more
    because a lemma can itself be a rejected word.
    """
    without_apostrophes = re.sub("'", "", text)
    normalized = re.sub("(\\d|\\W)+", " ", without_apostrophes)

    lemmas = []
    for token in word_tokenize(normalized.lower()):
        if black_txt(token):
            lemmas.append(wn.lemmatize(token, pos="v"))

    # Second pass: lemmatization may have produced a word that black_txt rejects.
    return " ".join(filter(black_txt, lemmas))

In [9]:
df_news.short_description[0]

'She left her husband. He killed their children. Just another day in America.'

#### Processing the Data and TF-IDF

We need to merge the category _THE WORLDPOST_ into _WORLDPOST_, because they are basically the same. Next we combine the columns _headline_ and _short_description_; the combined text is our predictor.

In [10]:
# Fold the "THE WORLDPOST" label into "WORLDPOST"; every other label is left as it is.
df_news['category'] = df_news['category'].map(
    lambda label: label if label != "THE WORLDPOST" else "WORLDPOST"
)

In [11]:
# Predictor text: headline and short description joined by a single space.
df_news['text'] = df_news['headline'].add(" ").add(df_news['short_description'])

example output

In [12]:
df_news.text[10]

'Justin Timberlake Visits Texas School Shooting Victims The pop star also wore a "Santa Fe Strong" shirt at his show in Houston.'

In [13]:
clean_txt(df_news.text[10])

'justin timberlake visit texas school shoot victims pop star also wear santa strong shirt show houston'

In [14]:
clean_txt(df_news.text[5])

'morgan freeman devastate sexual harassment claim could undermine legacy right equate horrific incidents sexual assault misplace compliment humor statement'

#### Next we are going to create some new feature columns to try to improve the quality of our classifier. We will create:
 * Polarity: to check the sentiment of the text
 * Subjectivity: to check if text is objective or subjective
 * Len: the ratio of unique cleaned words to the total number of words in the text

In [15]:
# NOTE(review): exploratory leftover. The spell-corrected string is built but neither
# stored nor displayed (it is not the cell's last statement), and `blob` is not used
# by any other visible cell — candidate for removal.
blob = TextBlob((df_news.text[1]))
str(blob.correct())

def polarity_txt(text):
    """Return element 0 of TextBlob's sentiment for ``text`` (used as the polarity feature)."""
    sentiment = TextBlob(text).sentiment
    return sentiment[0]

In [16]:
def subj_txt(text):
    """Return element 1 of TextBlob's sentiment for ``text`` (used as the subjectivity feature)."""
    sentiment = TextBlob(text).sentiment
    return sentiment[1]

In [17]:
def len_text(text):
    """Return the ratio of distinct cleaned tokens to raw whitespace-separated words.

    The numerator counts the unique tokens left after ``clean_txt``; the
    denominator counts the words of the untouched ``text``. Text with no
    words at all yields 0.
    """
    raw_words = text.split()
    if not raw_words:
        return 0
    distinct_clean = set(clean_txt(text).split())
    return len(distinct_clean) / len(raw_words)

In [18]:
df_news['polarity'] = df_news['text'].apply(polarity_txt)
df_news.head(2)

Unnamed: 0,category,headline,authors,link,short_description,date,text,polarity
0,CRIME,There Were 2 Mass Shootings In Texas Last Week...,Melissa Jeltsen,https://www.huffingtonpost.com/entry/texas-ama...,She left her husband. He killed their children...,2018-05-26,There Were 2 Mass Shootings In Texas Last Week...,-0.05
1,ENTERTAINMENT,Will Smith Joins Diplo And Nicky Jam For The 2...,Andy McDonald,https://www.huffingtonpost.com/entry/will-smit...,Of course it has a song.,2018-05-26,Will Smith Joins Diplo And Nicky Jam For The 2...,0.0


In [19]:
df_news['subjectivity'] = df_news['text'].apply(subj_txt)
df_news.head(2)

Unnamed: 0,category,headline,authors,link,short_description,date,text,polarity,subjectivity
0,CRIME,There Were 2 Mass Shootings In Texas Last Week...,Melissa Jeltsen,https://www.huffingtonpost.com/entry/texas-ama...,She left her husband. He killed their children...,2018-05-26,There Were 2 Mass Shootings In Texas Last Week...,-0.05,0.266667
1,ENTERTAINMENT,Will Smith Joins Diplo And Nicky Jam For The 2...,Andy McDonald,https://www.huffingtonpost.com/entry/will-smit...,Of course it has a song.,2018-05-26,Will Smith Joins Diplo And Nicky Jam For The 2...,0.0,0.0


In [20]:
df_news['len'] = df_news['text'].apply(len_text)
df_news.head(2)

Unnamed: 0,category,headline,authors,link,short_description,date,text,polarity,subjectivity,len
0,CRIME,There Were 2 Mass Shootings In Texas Last Week...,Melissa Jeltsen,https://www.huffingtonpost.com/entry/texas-ama...,She left her husband. He killed their children...,2018-05-26,There Were 2 Mass Shootings In Texas Last Week...,-0.05,0.266667,0.444444
1,ENTERTAINMENT,Will Smith Joins Diplo And Nicky Jam For The 2...,Andy McDonald,https://www.huffingtonpost.com/entry/will-smit...,Of course it has a song.,2018-05-26,Will Smith Joins Diplo And Nicky Jam For The 2...,0.0,0.0,0.5


#### Make the Custom class for feature union Transformer of sklearn

In [21]:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import FeatureUnion
from sklearn.feature_extraction import DictVectorizer
class ItemSelector(BaseEstimator, TransformerMixin):
    """Stateless transformer that picks ``data[key]`` out of its input.

    ``key`` is whatever the input's ``[]`` operator accepts; in this notebook it
    is a column name or a list of column names of a DataFrame.
    """

    def __init__(self, key):
        self.key = key

    def fit(self, x, y=None):
        """Nothing to learn; return the transformer itself."""
        return self

    def transform(self, data_dict):
        """Return the item(s) stored under ``self.key`` in ``data_dict``."""
        selected = data_dict[self.key]
        return selected


class TextStats(BaseEstimator, TransformerMixin):
    """Extract features from each document for DictVectorizer.

    Turns the 'polarity', 'subjectivity' and 'len' columns of the input into
    one dict per row with the keys 'pos', 'sub' and 'len'.
    """

    def fit(self, x, y=None):
        """Nothing to learn; return the transformer itself."""
        return self

    def transform(self, data):
        """Return a list with one ``{'pos', 'sub', 'len'}`` dict per row of ``data``.

        The columns are walked in parallel instead of using ``iterrows()``,
        which builds a full Series object for every row and is very slow on
        large frames.
        """
        return [
            {'pos': polarity, 'sub': subjectivity, 'len': length}
            for polarity, subjectivity, length in zip(
                data['polarity'], data['subjectivity'], data['len']
            )
        ]


### Make our Custom Pipeline

In [22]:
# Feature-extraction pipeline: a weighted union of TF-IDF text features and the
# three numeric text statistics. It holds no classifier; models are trained on
# its output.
pipeline = Pipeline([
    ('union', FeatureUnion(
        transformer_list=[

            # Pipeline for pulling features from the text
            ('text', Pipeline([
                ('selector', ItemSelector(key='text')),
                # clean_txt is the vectorizer's preprocessor, so the raw 'text'
                # column is what has to be fed in.
                # use_idf / smooth_idf / sublinear_tf are real booleans: recent
                # scikit-learn releases validate parameter types and reject ints.
                # NOTE(review): ngram_range=(1, 10) is unusually wide — confirm intended.
                ('tfidf', TfidfVectorizer(min_df=3, max_df=0.2, max_features=None,
                    strip_accents='unicode', analyzer='word', token_pattern=r'\w{1,}',
                    ngram_range=(1, 10), use_idf=True, smooth_idf=True, sublinear_tf=True,
                    stop_words=None, preprocessor=clean_txt)),
            ])),

            # Pipeline for pulling metadata features
            ('stats', Pipeline([
                ('selector', ItemSelector(key=['polarity', 'subjectivity', 'len'])),
                ('stats', TextStats()),  # returns a list of dicts
                ('vect', DictVectorizer()),  # list of dicts -> feature matrix
            ])),

        ],

        # weight components in FeatureUnion
        transformer_weights={
            'text': 0.9,
            'stats': 1.5,
        },
    ))
])

##### Encode the labels, split the data and fit the pipeline

In [23]:
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

seed = 40

# Predictors: the combined text plus the three engineered numeric columns.
X = df_news[['text', 'polarity', 'subjectivity', 'len']]

# Target: category names encoded as integers; `encoder` is kept to decode predictions.
encoder = LabelEncoder()
y = encoder.fit_transform(df_news['category'])

# 80/20 split, stratified so both parts keep the category proportions.
x_train, x_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=seed, stratify=y
)

In [24]:
pipeline.fit(x_train)

Pipeline(memory=None,
         steps=[('union',
                 FeatureUnion(n_jobs=None,
                              transformer_list=[('text',
                                                 Pipeline(memory=None,
                                                          steps=[('selector',
                                                                  ItemSelector(key='text')),
                                                                 ('tfidf',
                                                                  TfidfVectorizer(analyzer='word',
                                                                                  binary=False,
                                                                                  decode_error='strict',
                                                                                  dtype=<class 'numpy.float64'>,
                                                                                  encoding='utf-8',
                      

### Transform and train the ML models

In [25]:
%%time
# Turn both splits into feature matrices with the already-fitted pipeline.
train_vec, test_vec = pipeline.transform(x_train), pipeline.transform(x_test)
shapes = (train_vec.shape, test_vec.shape)
print("Checking that the number of features in train and test correspond: %s - %s" % shapes)

Checking that the number of features in train and test correspond: (160682, 189409) - (40171, 189409)
Wall time: 1min 46s


In [26]:
# Linear support vector machine, one-vs-rest, with class weights balanced
# to counter the skewed category distribution.
clf_sv = LinearSVC(C=1, class_weight='balanced', multi_class='ovr', random_state=40, max_iter=10000) #Support Vector machines
# Linear model trained with stochastic gradient descent. It shuffles the data
# between epochs, so it is seeded like the SVM to make its scores reproducible.
clf_sgd = SGDClassifier(max_iter=200, random_state=40) # Stochastic Gradient Classifier

In [27]:
%%time
from sklearn.model_selection import cross_val_score

# Number of cross-validation folds applied to every candidate model.
cv = 5
clfs = [clf_sv, clf_sgd]
for clf in clfs:
    # Accuracy per fold on the training matrix, then mean and spread across folds.
    scores = cross_val_score(clf, train_vec, y_train, cv=cv, scoring="accuracy")
    print(scores)
    mean_score, spread = np.mean(scores), np.std(scores)
    print("Mean score: {0:.3f} (+/-{1:.3f})".format(mean_score, spread))

[0.6045839  0.61022275 0.60707061 0.60628132 0.60961335]
Mean score: 0.608 (+/-0.002)
[0.57933201 0.58129044 0.57268229 0.57714695 0.5744661 ]
Mean score: 0.577 (+/-0.003)
Wall time: 5min 10s


In [28]:
%%time
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix, accuracy_score

# Train the SVM on the full training matrix and predict the held-out test set.
clf_sv.fit(train_vec, y_train)
y_pred = clf_sv.predict(test_vec)

# Collected (label, score) pairs.
list_result = []
list_result.append(("SVC accuracy", accuracy_score(y_test, y_pred)))
# Show the held-out score; previously it was stored but never displayed.
print("%s: %.4f" % list_result[-1])






Wall time: 1min 9s


In [29]:
x_test.tail()

Unnamed: 0,text,polarity,subjectivity,len
169394,New Year's Engagement: Rings From Real Proposa...,0.247273,0.404242,0.5
54706,Elizabeth Warren Asks J.K. Rowling If She Can ...,-0.15,0.4,0.5
72167,Here's The One Photo That Encapsulates Kardash...,0.25,0.6,0.615385
43528,Neil Patrick Harris Goes Full Olaf For Latest ...,0.116667,0.816667,0.764706
13987,Reproductive Health And Rights In An Age Of In...,0.0,0.066667,0.515152


In [None]:
raw_text = input(" Enter text")

# Use the raw string throughout, exactly as at training time: the polarity /
# subjectivity / len columns were computed from the uncleaned text, and the
# pipeline's TfidfVectorizer already applies clean_txt as its preprocessor.
# Cleaning here first would feed the model features it was never trained on.
query = pd.DataFrame({
    "text": [raw_text],
    "polarity": [polarity_txt(raw_text)],
    "subjectivity": [subj_txt(raw_text)],
    "len": [len_text(raw_text)],
})

query_vec = pipeline.transform(query)
predicted = clf_sv.predict(query_vec)
print(predicted)
# inverse_transform needs a 1-d array of encoded labels, so decode the whole
# prediction array and index afterwards (a bare scalar is rejected).
print(encoder.inverse_transform(predicted)[0])