In [None]:
#Importing packages
import sys
import numpy as np
import tensorflow as tf
import seaborn as sns
import pandas as pd
import string
import matplotlib.pyplot as plt
import pickle
import gensim.parsing.preprocessing as gsp

from gensim import utils
from pprint import pprint
from tqdm import tqdm
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import SGDClassifier
from sklearn import svm
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.datasets import load_files
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.metrics import classification_report
from sklearn.datasets import fetch_20newsgroups
from wordcloud import WordCloud

In [None]:
#Importing data
# Both splits are fetched with the same options: fixed shuffle seed, and
# headers, footers and quoted replies removed from every post.
fetch_options = dict(random_state=42, shuffle=True, remove=('headers', 'footers', 'quotes'))
twenty_train = fetch_20newsgroups(subset='train', **fetch_options)
twenty_test = fetch_20newsgroups(subset='test', **fetch_options)

# Step 1: Exploratory Data Analysis

In [None]:
# List every category name, one per line, followed by how many there are.
category_names = twenty_train.target_names
print("Categories names are:\n")
for category_name in category_names:
    print(category_name)
print("\n Total:{}".format(len(category_names)))

In [None]:
# Sizes of the two splits. The total is computed from the loaded data rather
# than hard-coded, so it stays correct if the fetch options ever change.
n_train = len(twenty_train.data)
n_test = len(twenty_test.data)
print("Length of the training data set is {}".format(n_train))
print("Length of the test data set is {}".format(n_test))
print("Length of the total dataset is {}".format(n_train + n_test))

In [None]:
# Map each numeric class id to its category name, for both splits.
names = [twenty_train.target_names[class_id] for class_id in twenty_train.target]
test_names = [twenty_test.target_names[class_id] for class_id in twenty_test.target]

In [None]:
#Putting dataset into a pandas dataframe
# Each frame is built column by column. Stacking texts, ids and names with
# np.c_ first coerces everything into a single fixed-width unicode array
# (every cell padded to the longest post), which wastes memory and hides the
# int -> str conversion of the class ids.
# The ids are deliberately kept as strings: the modelling cells fit on
# df_train.Target_id and compare predictions with df_test.Target.
df_train = pd.DataFrame({
    "Text": twenty_train.data,
    "Target_id": [str(class_id) for class_id in twenty_train.target],
    "Target_name": names,
})
df_test = pd.DataFrame({
    "Text": twenty_test.data,
    "Target": [str(class_id) for class_id in twenty_test.target],
    "Target_name": test_names,
})

In [None]:
#Are the classes balanced?
# Number of training posts per class id.
count_target = df_train['Target_id'].value_counts()

plt.figure(figsize=(8,4))
# x and y are passed by keyword: recent seaborn releases no longer accept
# them as positional arguments.
sns.barplot(x=count_target.index, y=count_target.values, alpha=0.8)
plt.ylabel('Number of Occurrences', fontsize=12)
plt.xlabel('Target_id', fontsize=12)
plt.show()

The plot above shows that the classes are slightly unbalanced: posts from the categories 'comp.graphics', 'talk.politics.misc' and 'talk.religion.misc' are fewer than those from the other categories.

# Examine the properties by target

In [None]:
###Creating extra features 
def add_features(df):
    """Add character-level count columns derived from ``df['Text']``.

    The frame is modified in place and nothing is returned. Every value in
    ``Text`` is converted with ``str()`` before counting. Added columns:

    * ``Length_of_text`` / ``unique`` - number of characters / distinct characters
    * ``punctuations``, ``letters``, ``numbers``, ``uppercase``, ``lowercase`` -
      how many characters fall into that class
    * the matching ``uniq_*`` columns - how many distinct characters of that class
    """
    texts = df['Text'].apply(str)

    df['Length_of_text'] = texts.apply(len)
    df['unique'] = texts.apply(lambda text: len(set(text)))

    # (total column, distinct column, membership test), in output column order.
    char_classes = (
        ('punctuations', 'uniq_punctuations', lambda ch: ch in string.punctuation),
        ('letters', 'uniq_letters', str.isalpha),
        ('numbers', 'uniq_numbers', str.isdigit),
        ('uppercase', 'uniq_uppercase', str.isupper),
        ('lowercase', 'uniq_lowercase', str.islower),
    )
    for total_col, uniq_col, belongs in char_classes:
        # Characters of this class, per post; ``keep`` pins the current predicate.
        matched = texts.apply(lambda text, keep=belongs: [ch for ch in text if keep(ch)])
        df[total_col] = matched.apply(len)
        df[uniq_col] = matched.apply(lambda chars: len(set(chars)))

In [None]:
#Applying features on training and test data
# add_features modifies each frame in place.
for frame in (df_train, df_test):
    add_features(frame)

In [None]:
#Exploring each feature
# One violin plot per engineered feature, grouped by class id.
# Each entry is (feature column, y-axis label, plot title).
# Fixes relative to the earlier copy-pasted version:
#  * the first plot now shows the Length_of_text distribution its labels
#    announce (it used to be a per-class count plot drawn on a separate figure);
#  * the uniq_numbers plot is labelled as numbers, not characters.
feature_plots = [
    ('Length_of_text', 'Length of text', 'Length of text by target'),
    ('unique', 'Number of unique characters in text', 'Number of unique characters by target'),
    ('uniq_punctuations', 'Number of unique punctuations in text', 'Number of unique punctuations by target'),
    ('uniq_letters', 'Number of unique letters in text', 'Number of unique letters by target'),
    ('uniq_numbers', 'Number of unique numbers in text', 'Number of unique numbers by target'),
    ('uniq_uppercase', 'Number of unique uppercase in text', 'Number of unique uppercase by target'),
    ('uniq_lowercase', 'Number of unique lowercase in text', 'Number of unique lowercase by target'),
]

for feature_column, y_label, plot_title in feature_plots:
    plt.figure(figsize=(12,12))
    sns.violinplot(x='Target_id', y=feature_column, data=df_train)
    plt.xlabel('Target_id', fontsize=12)
    plt.ylabel(y_label, fontsize=12)
    plt.title(plot_title, fontsize=15)
    plt.show()

In [None]:
# Show the complete first training post together with its category name.
first_post = df_train['Text'][0]
first_post_category = df_train['Target_name'][0]
print("Printing first post to explore \n")
print(first_post) #prints the whole text of the first post
print("\n Above post belongs to {}".format(first_post_category))

# Cleaning text

In [None]:
#create filters to clean the text
# gensim preprocessing callables; clean_text applies them one after another in
# exactly this order, each taking a string and returning the filtered string.
filters = [
           gsp.strip_tags,  #remove markup tags
           gsp.strip_punctuation, #remove punctuation characters
           gsp.strip_multiple_whitespaces, #collapse runs of repeated whitespace
           gsp.strip_numeric, #remove digits
           gsp.remove_stopwords, #remove stop words
           gsp.strip_short, #remove short words (length threshold is gensim's default -- TODO confirm value)
           #gsp.stem_text #disabled: lowercased, Porter-stemmed version of the text
          ]

#function to clean text
def clean_text(s):
    """Lower-case ``s``, convert it to unicode and run it through every filter.

    The callables in the module-level ``filters`` list are applied in list
    order, each receiving the output of the previous one. Returns the result
    of the last filter.
    """
    cleaned = utils.to_unicode(s.lower())
    for apply_filter in filters:
        cleaned = apply_filter(cleaned)
    return cleaned

#cleaning training and test dataset
# The raw 'Text' column of each frame is overwritten with its cleaned version.
for frame in (df_train, df_test):
    frame['Text'] = frame['Text'].apply(clean_text)

In [None]:
########################
#Exploring the cleaned dataset
########################
#Counting number of features in the new cleaned dataset
# TF-IDF matrix of the cleaned training posts, using the vectorizer defaults.
vectorizer = TfidfVectorizer()
vectors = vectorizer.fit_transform(df_train['Text'])

# Naive Bayes model fitted on that matrix against the original integer targets.
clf = MultinomialNB(alpha=0.01)
clf.fit(vectors, twenty_train.target)

# The second dimension of the matrix is the vocabulary size.
n_features = vectors.shape[1]
print("Number of features in the datset are {} \n \n".format(n_features))
######################
#creating function to display some features for each category
def show_top10(classifier, vectorizer, categories):
    """Print the ten highest-weighted features of each category.

    Parameters
    ----------
    classifier : fitted estimator exposing per-class feature weights, either
        as ``feature_log_prob_`` (Naive Bayes) or as ``coef_`` (linear models).
    vectorizer : fitted vectorizer exposing ``get_feature_names_out()`` or the
        older ``get_feature_names()``.
    categories : sequence of category names; entry ``i`` labels weight row ``i``.

    One line ``"<category>: <features>"`` is printed per category, features
    ordered from lowest to highest weight among the top ten.
    """
    # get_feature_names() was removed from newer scikit-learn releases in
    # favour of get_feature_names_out(); support both.
    if hasattr(vectorizer, "get_feature_names_out"):
        feature_names = np.asarray(vectorizer.get_feature_names_out())
    else:
        feature_names = np.asarray(vectorizer.get_feature_names())

    # MultinomialNB no longer exposes coef_ in newer scikit-learn releases;
    # feature_log_prob_ carries the per-class weights there. Fall back to
    # coef_ for estimators that only have that attribute.
    weights = getattr(classifier, "feature_log_prob_", None)
    if weights is None:
        weights = classifier.coef_

    for i, category in enumerate(categories):
        top10 = np.argsort(weights[i])[-10:]
        print("{}: {}".format(category, " ".join(feature_names[top10])))
#########################
# Print the strongest features of the exploratory Naive Bayes model per category.
print("Display first 10 features for each category \n")
show_top10(classifier=clf, vectorizer=vectorizer, categories=twenty_train.target_names)

We can see that all the features are words only.

# Exploring data using word cloud

In [None]:
#define functions
def plot_word_cloud(text):
    """Draw a word cloud of ``text`` on a 10x10 inch figure without axes.

    Parameters
    ----------
    text : str
        Whitespace-separated words to visualise.
    """
    # stopwords=None makes WordCloud use its built-in English stop-word list.
    # The string 'english' must not be passed here: WordCloud iterates the
    # value, so a string is treated as a collection of single characters and
    # only the letters e, n, g, l, i, s, h would be filtered out.
    wordcloud_instance = WordCloud(width = 800, height = 800,
                background_color ='black',
                stopwords=None,
                min_font_size = 5).generate(text)

    plt.figure(figsize = (10, 10), facecolor = None)
    plt.imshow(wordcloud_instance)
    plt.axis("off")
    plt.tight_layout(pad = 0)
    plt.show()

def plot_word_cloud_for_cat(category):
    """Plot a word cloud built from all training posts of one category.

    Parameters
    ----------
    category : value whose ``str()`` is matched against ``df_train['Target_name']``.

    Raises
    ------
    ValueError
        If no training post belongs to ``category``.
    """
    in_category = df_train['Target_name'] == str(category)
    posts = df_train.loc[in_category, 'Text']
    if posts.empty:
        # Fail with a clear message instead of the generic "need at least
        # 1 word" error WordCloud raises for empty input.
        raise ValueError("No posts found for category {!r}".format(category))
    # Join once instead of growing a string row by row with iterrows(), which
    # re-copies the accumulated text on every iteration.
    texts = ' '.join(clean_text(post) for post in posts)
    plot_word_cloud(texts)

#plotting word cloud for one category
plot_word_cloud_for_cat(category='comp.graphics')

# Step 2: Testing different classifiers

# Multinomial Naive Bayes classifier

In [None]:
#Multinomial Naive Bayes classifier
# Pipeline: token counts -> tf-idf weighting -> multinomial Naive Bayes.
mulit_nb_clf = Pipeline(steps=[
    ('vect', CountVectorizer(stop_words='english')),
    ('tfidf', TfidfTransformer()),
    ('clf', MultinomialNB(fit_prior=False)),
])

# Candidate hyper-parameters, keyed as <pipeline step>__<parameter name>.
parameters = {
    'vect__max_df': (0.5, 0.75, 1.0),
    'vect__ngram_range': [(1, 1), (1, 2)],
    'tfidf__use_idf': (True, False),
    'clf__alpha': (1e-2, 1e-3),
}

# Exhaustive search over the grid on all CPU cores (cv left at its default).
gs_clf = GridSearchCV(estimator=mulit_nb_clf, param_grid=parameters, n_jobs=-1)

# fit() returns the search object itself, so gs_clf is the fitted search.
gs_clf.fit(df_train['Text'], df_train['Target_id'])

# File name used for saving the model.
filename = 'mulit_nb_clf.sav'
# Saving is disabled; uncomment the next line to write the fitted search to disk.
#pickle.dump(gs_clf, open(filename, 'wb'))

In [None]:
#load the model from disk
filename = 'mulit_nb_clf.sav'
try:
    # NOTE: unpickling executes arbitrary code -- only load model files that
    # you created yourself. The context manager closes the file handle.
    with open(filename, 'rb') as model_file:
        gs_clf = pickle.load(model_file)
except FileNotFoundError:
    # Saving is commented out in the training cell, so a fresh run has no
    # file on disk; keep using the grid search fitted in this session.
    print("{} not found; using the model fitted in this session".format(filename))
#predict the categories
predicted_mulit_nb = gs_clf.predict(df_test.Text)

In [None]:
# Best mean cross-validation score found by the grid search, as a percentage.
nb_cv_accuracy = gs_clf.best_score_ * 100
print("Cross validation accuracy using Naive Bayes Classifier on training dataset, with help of gridsearch is {} %".format(nb_cv_accuracy))
# Score the Naive Bayes predictions: they are stored in predicted_mulit_nb
# (the name `predicted` is not defined until the later classifier cells run).
nb_test_accuracy = np.mean(predicted_mulit_nb == df_test.Target) * 100
print("Accuracy using Naive Bayes Classifier on test dataset, with help of gridsearch is {} %".format(nb_test_accuracy))


# Linear SVM

In [None]:
#linear SVM
# Pipeline: token counts -> tf-idf weighting -> linear model trained with SGD
# (hinge loss, L2 penalty, fixed seed).
sgd_clf = Pipeline(steps=[
    ('vect', CountVectorizer(stop_words='english')),
    ('tfidf', TfidfTransformer()),
    ('clf-svm', SGDClassifier(loss='hinge', penalty='l2', random_state=42)),
])

# Candidate hyper-parameters, keyed as <pipeline step>__<parameter name>.
parameters = {
    'vect__ngram_range': [(1, 1), (1, 2)],
    'tfidf__use_idf': (True, False),
    'clf-svm__alpha': (1e-2, 1e-3),
}

# From here on sgd_clf names the grid search wrapped around the pipeline.
sgd_clf = GridSearchCV(estimator=sgd_clf, param_grid=parameters, n_jobs=-1)

# fit() returns the search object itself, so sgd_clf is the fitted search.
sgd_clf.fit(df_train['Text'], df_train['Target_id'])

filename = 'sgd_clf.sav'
# Saving is disabled; uncomment the next line to write the fitted search to disk.
#pickle.dump(sgd_clf, open(filename, 'wb'))

In [None]:
filename = 'sgd_clf.sav'
# load the model from disk
try:
    # NOTE: unpickling executes arbitrary code -- only load model files that
    # you created yourself. The context manager closes the file handle.
    with open(filename, 'rb') as model_file:
        sgd_clf = pickle.load(model_file)
except FileNotFoundError:
    # Saving is commented out in the training cell, so a fresh run has no
    # file on disk; keep using the grid search fitted in this session.
    print("{} not found; using the model fitted in this session".format(filename))

#predict the categories
predicted = sgd_clf.predict(df_test.Text)

In [None]:
# Best mean cross-validation score of the grid search, as a percentage.
cv_accuracy = sgd_clf.best_score_ * 100
print("Cross validation accuracy using SGDClassifier on training dataset, with help of gridsearch is {} %".format(cv_accuracy))
# Share of test posts whose predicted label equals the true label.
test_accuracy = np.mean(predicted == df_test.Target) * 100
print("Accuracy using SGDClassifier on test dataset, with help of gridsearch is {} %".format(test_accuracy))

# SVC

In [None]:
###SVC
# Pipeline: token counts -> tf-idf weighting -> SVC with its default settings.
svc_clf = Pipeline(steps=[
    ('vect', CountVectorizer(stop_words='english')),
    ('tfidf', TfidfTransformer()),
    ('svc_clf', svm.SVC()),
])

# Candidate hyper-parameters, keyed as <pipeline step>__<parameter name>.
parameters = {
    'vect__ngram_range': [(1, 1), (1, 2)],
    'tfidf__use_idf': (True, False),
}

# From here on svc_clf names the grid search wrapped around the pipeline.
svc_clf = GridSearchCV(estimator=svc_clf, param_grid=parameters, n_jobs=-1)

# fit() returns the search object itself, so svc_clf is the fitted search.
svc_clf.fit(df_train['Text'], df_train['Target_id'])

filename = 'svc_clf.sav'
# Saving is disabled; uncomment the next line to write the fitted search to disk.
#pickle.dump(svc_clf, open(filename, 'wb'))

In [None]:
filename = 'svc_clf.sav'
# load the model from disk
try:
    # NOTE: unpickling executes arbitrary code -- only load model files that
    # you created yourself. The context manager closes the file handle.
    with open(filename, 'rb') as model_file:
        svc_clf = pickle.load(model_file)
except FileNotFoundError:
    # Saving is commented out in the training cell, so a fresh run has no
    # file on disk; keep using the grid search fitted in this session.
    print("{} not found; using the model fitted in this session".format(filename))

#predict the categories
predicted = svc_clf.predict(df_test.Text)

In [None]:
# Best mean cross-validation score of the grid search, as a percentage.
cv_accuracy = svc_clf.best_score_ * 100
print("Cross validation accuracy using SVC Classifier on training dataset, with help of gridsearch is {} %".format(cv_accuracy))
# Share of test posts whose predicted label equals the true label.
test_accuracy = np.mean(predicted == df_test.Target) * 100
print("Accuracy using SVC Classifier on test dataset, with help of gridsearch is {} %".format(test_accuracy))

# Random forest

In [None]:
###Random forest
# Pipeline: token counts -> tf-idf weighting -> random forest of 100 trees.
rf_clf = Pipeline(steps=[
    ('vect', CountVectorizer(stop_words='english')),
    ('tfidf', TfidfTransformer(use_idf=True)),
    ('clf_rf', RandomForestClassifier(
        n_estimators=100,
        random_state=46,
        max_features='sqrt',
        n_jobs=-1,
        verbose=1,
    )),
])

# Only the n-gram range of the vectorizer is searched.
parameters = {'vect__ngram_range': [(1, 1), (1, 2)]}

# From here on rf_clf names the grid search wrapped around the pipeline.
rf_clf = GridSearchCV(estimator=rf_clf, param_grid=parameters, n_jobs=-1)

# fit() returns the search object itself, so rf_clf is the fitted search.
rf_clf.fit(df_train['Text'], df_train['Target_id'])

filename = 'rf_clf.sav'
# Saving is disabled; uncomment the next line to write the fitted search to disk.
#pickle.dump(rf_clf, open(filename, 'wb'))

In [None]:
#load the model from disk
# Set the file name here as well, so this cell does not depend on whichever
# cell assigned `filename` last.
filename = 'rf_clf.sav'
try:
    # NOTE: unpickling executes arbitrary code -- only load model files that
    # you created yourself. The context manager closes the file handle.
    with open(filename, 'rb') as model_file:
        rf_clf = pickle.load(model_file)
except FileNotFoundError:
    # Saving is commented out in the training cell, so a fresh run has no
    # file on disk; keep using the grid search fitted in this session.
    print("{} not found; using the model fitted in this session".format(filename))

#predict the categories
predicted = rf_clf.predict(df_test.Text)

In [None]:
# Best mean cross-validation score of the grid search, as a percentage.
cv_accuracy = rf_clf.best_score_ * 100
print("Cross validation accuracy using Random forest Classifier on training dataset, with help of gridsearch is {} %".format(cv_accuracy))
# Share of test posts whose predicted label equals the true label.
test_accuracy = np.mean(predicted == df_test.Target) * 100
print("Accuracy using Random forest Classifier on test dataset, with help of gridsearch is {} %".format(test_accuracy))

In [None]:
#printing classification report
# The class ids in df_test.Target are strings, so scikit-learn's default label
# order is lexicographic ('0', '1', '10', '11', ...), which does not match the
# numeric order of target_names. Passing the labels sorted by numeric value
# keeps every row next to its real category name. This relies on every class
# occurring in the test split.
class_labels = sorted(df_test.Target.unique(), key=int)
print(classification_report(df_test.Target, predicted_mulit_nb,
                            labels=class_labels,
                            target_names=twenty_train.target_names))

In [None]:
#Plot the confusion matrix
# The class ids in df_test.Target are strings, so the default row/column order
# would be lexicographic ('0', '1', '10', ...) and the tick labels would name
# the wrong classes. Order the labels by numeric value to match target_names.
# This relies on every class occurring in the test split.
class_labels = sorted(df_test.Target.unique(), key=int)
cm = metrics.confusion_matrix(df_test.Target, predicted_mulit_nb, labels=class_labels)
fig, ax = plt.subplots(figsize=(10,10))
sns.heatmap(cm, annot=True, fmt='d',
            xticklabels=twenty_train.target_names, yticklabels=twenty_train.target_names,
            ax=ax)
ax.set_ylabel('Actual')
ax.set_xlabel('Predicted')
plt.show()

In [None]:
# plot_word_cloud_for_cat('rec.sport.hockey') 
# plot_word_cloud_for_cat('rec.motorcycles') 