In [None]:
'''
Using the multi-label classifier dataset from earlier exercises (categorized-comments.jsonl in the reddit folder), fit a neural network classifier using scikit-learn. Use the code found in chapter 12 of the Applied Text Analysis with Python book as a guideline. Report the accuracy, precision, recall, F1-score, and confusion matrix.
'''

In [1]:
#import json library to read data in jsonl file
import json
#import pandas library
import pandas as pd

In [2]:
#check versions of packages
print('pandas version:', pd.__version__)
# print('numpy version:', np.__version__)
# print('scikit-learn version:', sklearn.__version__)
# print('NLTK version:', nltk.__version__)

pandas version: 1.2.4


In [3]:
#read in the data as a dataframe
filename = "/home/arindam/Documents/mygithub/bu_dsc/data/raw/categorized-comments.jsonl"
with open(filename, 'r') as f:
    jsonl_list = list(f)

list1 = []
for obj in jsonl_list:
    res = json.loads(obj)
    list1.append(res)
    
comments = pd.DataFrame(list1)

#display the first few rows of data
comments.head()
# len(list1)


Unnamed: 0,cat,txt
0,sports,Barely better than Gabbert? He was significant...
1,sports,Fuck the ducks and the Angels! But welcome to ...
2,sports,Should have drafted more WRs.\n\n- Matt Millen...
3,sports,[Done](https://i.imgur.com/2YZ90pm.jpg)
4,sports,No!! NOO!!!!!


In [4]:
# print the dimension of the dataframe
print('The dataframe has a dimension of:',comments.shape)
print('It has {} comments'.format(comments.shape[0]))

The dataframe has a dimension of: (606476, 2)
It has 606476 comments


In [5]:
print('The target names are :', comments['cat'].unique())
print('This shows that there are only 3 categories in the total dataset')

The target names are : ['sports' 'science_and_technology' 'video_games']
This shows that there are only 3 categories in the total dataset


In [6]:
#Convert text to lowercase and romove punctuation
#define a function to clean the text
# import the required libraries here
#import regular expressions library
import re

def clean_text(text):
    """
    Remove punctuations and special characters, makes lower case
    Args: text
    Output: text
    """
    text=text.lower() #makes text lowercase
    text=re.sub('\\d|\\W+|_',' ',text) #removes extra white space
    text=re.sub('[^a-zA-Z]'," ", text) #removes any non-alphabetic characters
    return text

In [7]:
#import word tokenizer from NLTK
from nltk.tokenize import TreebankWordTokenizer
from nltk.corpus import stopwords

def tokenize_text(txt):
    """
    Takes in a sentence, tokenizes the words into a list,
    """
    stop_words = stopwords.words('english')
    tokenizer = TreebankWordTokenizer()
    tokens = tokenizer.tokenize(txt)
    return [token for token in tokens if token not in stop_words]

In [8]:
#Apply NLTK's PorterStemmer
#define a function to stem the words
from nltk.stem.porter import PorterStemmer

def porter_stem_text(token_list):

    porter = PorterStemmer()
    return (" ".join (porter.stem(token) for token in token_list))

In [9]:
# Testing the functions
# Taking a sample of the dataset
sample_cmnts = comments[:50000]
# txt = "barely than significantly especially is an the ? better surrounded."
# sample_cmnts['cat'].unique()
# creating a dictionary to replace the string values to numeric
d = {'sports':1,'science_and_technology':2,'video_games':3}
sample_cmnts['ncat'] = sample_cmnts['cat'].map(d)
sample_cmnts

Unnamed: 0,cat,txt,ncat
0,sports,Barely better than Gabbert? He was significant...,1
1,sports,Fuck the ducks and the Angels! But welcome to ...,1
2,sports,Should have drafted more WRs.\n\n- Matt Millen...,1
3,sports,[Done](https://i.imgur.com/2YZ90pm.jpg),1
4,sports,No!! NOO!!!!!,1
...,...,...,...
49995,sports,Florida State\nAlabama\nMichigan\nAlabama\nAla...,1
49996,sports,Never does.,1
49997,sports,I think Tosh is our best one left. Napier is g...,1
49998,sports,"Close, and we could really use a tight end...",1


In [10]:
# Cleaning and tokenizing the texts in the comments
# Using the transformed column for the model

sample_cmnts['cleaned']=sample_cmnts['txt'].apply(clean_text)
sample_cmnts['tokenized']=sample_cmnts['cleaned'].apply(tokenize_text)
sample_cmnts['stemmed']=sample_cmnts['tokenized'].apply(porter_stem_text)
sample_cmnts

Unnamed: 0,cat,txt,ncat,cleaned,tokenized,stemmed
0,sports,Barely better than Gabbert? He was significant...,1,barely better than gabbert he was significantl...,"[barely, better, gabbert, significantly, bette...",bare better gabbert significantli better year ...
1,sports,Fuck the ducks and the Angels! But welcome to ...,1,fuck the ducks and the angels but welcome to a...,"[fuck, ducks, angels, welcome, new, niners, fans]",fuck duck angel welcom new niner fan
2,sports,Should have drafted more WRs.\n\n- Matt Millen...,1,should have drafted more wrs matt millen probably,"[drafted, wrs, matt, millen, probably]",draft wr matt millen probabl
3,sports,[Done](https://i.imgur.com/2YZ90pm.jpg),1,done https i imgur com yz pm jpg,"[done, https, imgur, com, yz, pm, jpg]",done http imgur com yz pm jpg
4,sports,No!! NOO!!!!!,1,no noo,[noo],noo
...,...,...,...,...,...,...
49995,sports,Florida State\nAlabama\nMichigan\nAlabama\nAla...,1,florida state alabama michigan alabama alabama,"[florida, state, alabama, michigan, alabama, a...",florida state alabama michigan alabama alabama
49996,sports,Never does.,1,never does,[never],never
49997,sports,I think Tosh is our best one left. Napier is g...,1,i think tosh is our best one left napier is go...,"[think, tosh, best, one, left, napier, gone, c...",think tosh best one left napier gone cristob g...
49998,sports,"Close, and we could really use a tight end...",1,close and we could really use a tight end,"[close, could, really, use, tight, end]",close could realli use tight end


In [11]:
comments['cat'].unique()
# creating a dictionary to replace the string values to numeric
d = {'sports':1,'science_and_technology':2,'video_games':3}
comments['ncat'] = comments['cat'].map(d)
comments

Unnamed: 0,cat,txt,ncat
0,sports,Barely better than Gabbert? He was significant...,1
1,sports,Fuck the ducks and the Angels! But welcome to ...,1
2,sports,Should have drafted more WRs.\n\n- Matt Millen...,1
3,sports,[Done](https://i.imgur.com/2YZ90pm.jpg),1
4,sports,No!! NOO!!!!!,1
...,...,...,...
606471,video_games,No. It's probably only happened to you,3
606472,video_games,I think most of the disappointment came from t...,3
606473,video_games,"dishonored 1/2 looked like arse, so what the h...",3
606474,video_games,[removed],3


In [12]:
# Get the target name
from sklearn.model_selection import train_test_split

# Creating the features from the data set
features, target = sample_cmnts.stemmed, sample_cmnts.ncat

In [13]:
# Make test and training split (20:80)
features_train,features_test,target_train, target_test = train_test_split(features,target, random_state=0, test_size = 0.2)

print('Features-Training Set: ',len(features_train))
print('Features-Test Set: ',len(features_test))
print('Target: Training Set: ',len(target_train))
print('Target: Test Set: ',len(target_test))

Features-Training Set:  40000
Features-Test Set:  10000
Target: Training Set:  40000
Target: Test Set:  10000


In [14]:
# Train the model

from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.neural_network import MLPRegressor, MLPClassifier
from sklearn.feature_extraction.text import CountVectorizer

classifier = Pipeline([('vect', CountVectorizer()),
                       ('tfidf', TfidfTransformer()),
                       ('ann', MLPClassifier(hidden_layer_sizes=[500,150,100],  max_iter=20, activation='relu',solver='adam',verbose=False))
])
clf = classifier.fit(features_train, target_train)

In [15]:
# Saving the Model

import joblib
from joblib import dump, load

model_path="/home/arindam/Documents/mygithub/bu_dsc/models"
model_name="NN_classifier_sklearn.pkl"
filename = model_path + "/" + model_name 
# print(filename)
joblib.dump(clf, filename)


['/home/arindam/Documents/mygithub/bu_dsc/models/NN_classifier_sklearn.pkl']

In [16]:
# Load a saved model

NN_clf = open(filename,'rb')

clf1 = joblib.load(NN_clf)


In [17]:
# Predicting the test set for the classifier
y_pred = clf1.predict(features_test)
# y_pred

In [18]:
# Displaying the result metrics
from sklearn.metrics import classification_report,confusion_matrix,accuracy_score

print("Confusion Matrix")
print("================")
print(confusion_matrix(target_test,y_pred))
print("Classification Report")
print("=====================================================")

print(classification_report(target_test,y_pred))
print("Accuracy Score")
print("=====")

print(accuracy_score(target_test, y_pred))

Confusion Matrix
[[4302  691]
 [ 648 4359]]
Classification Report
              precision    recall  f1-score   support

           1       0.87      0.86      0.87      4993
           2       0.86      0.87      0.87      5007

    accuracy                           0.87     10000
   macro avg       0.87      0.87      0.87     10000
weighted avg       0.87      0.87      0.87     10000

Accuracy Score
=====
0.8661
