In [None]:
'''
2. Neural Network Classifier with Keras
Using the multi-label classifier dataset from earlier exercises (categorized-comments.jsonl in the reddit folder), fit a neural network classifier using Keras. Use the code found in chapter 12 of the Applied Text Analysis with Python book as a guideline. Report the accuracy, precision, recall, F1-score, and confusion matrix.
'''

In [5]:
# Training a multiclass Classifier
# Using Keras to construct a feedforward neural network with an output layer with soft-max activation functions

# Load libraries
import numpy as np
from keras.utils.np_utils import to_categorical
from keras.preprocessing.text import Tokenizer
from keras import models
from keras import layers

#import json library to read data in jsonl file
import json
#import pandas library
import pandas as pd

In [6]:
#read in the data as a dataframe
filename = "/home/arindam/Documents/mygithub/bu_dsc/data/raw/categorized-comments.jsonl"
with open(filename, 'r') as f:
    jsonl_list = list(f)

list1 = []
for obj in jsonl_list:
    res = json.loads(obj)
    list1.append(res)
    
comments = pd.DataFrame(list1)

#display the first few rows of data
comments.head()
# len(list1)

Unnamed: 0,cat,txt
0,sports,Barely better than Gabbert? He was significant...
1,sports,Fuck the ducks and the Angels! But welcome to ...
2,sports,Should have drafted more WRs.\n\n- Matt Millen...
3,sports,[Done](https://i.imgur.com/2YZ90pm.jpg)
4,sports,No!! NOO!!!!!


In [7]:
#Convert text to lowercase and romove punctuation
#define a function to clean the text
# import the required libraries here
#import regular expressions library
import re

def clean_text(text):
    """
    Remove punctuations and special characters, makes lower case
    Args: text
    Output: text
    """
    text=text.lower() #makes text lowercase
    text=re.sub('\\d|\\W+|_',' ',text) #removes extra white space
    text=re.sub('[^a-zA-Z]'," ", text) #removes any non-alphabetic characters
    return text

In [81]:
# Cleaning texts in the comments
# Using the transformed column for the model
# Testing the functions

sample_size = 5000
sample_cmnts = comments[:sample_size]
sample_cmnts['cleaned']=sample_cmnts['txt'].apply(clean_text)
# creating a dictionary to replace the string values to numeric
d = {'sports':1,'science_and_technology':2,'video_games':3}
sample_cmnts['ncat'] = sample_cmnts['cat'].map(d)
sample_cmnts.head

<bound method NDFrame.head of                          cat  \
0                     sports   
1                     sports   
2                     sports   
3                     sports   
4                     sports   
...                      ...   
4995  science_and_technology   
4996  science_and_technology   
4997  science_and_technology   
4998  science_and_technology   
4999  science_and_technology   

                                                    txt  \
0     Barely better than Gabbert? He was significant...   
1     Fuck the ducks and the Angels! But welcome to ...   
2     Should have drafted more WRs.\n\n- Matt Millen...   
3               [Done](https://i.imgur.com/2YZ90pm.jpg)   
4                                         No!! NOO!!!!!   
...                                                 ...   
4995  I just recently bought a tv with chromecast bu...   
4996  SMS just works, is reliable. Everyone I know h...   
4997                                          [deleted

In [82]:
sample_cmnts.ncat.value_counts()

1    3556
2    1444
Name: ncat, dtype: int64

In [83]:
output_activation=sample_cmnts.ncat.unique()[1]
output_activation

2

In [84]:
# Get the target name
from sklearn.model_selection import train_test_split

# Creating the features from the data set
features, target = sample_cmnts.cleaned, sample_cmnts.ncat

In [85]:
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import stopwords

number_of_features = 5000

tfidf = TfidfVectorizer(max_features = number_of_features, stop_words=stopwords.words('english'))

features = tfidf.fit_transform(features).toarray()
features.shape

(5000, 5000)

In [86]:
# Make test and training split (30:70)
from sklearn.model_selection import train_test_split
X_train,X_test,y_train, y_test = train_test_split(features,target, random_state=0, test_size = 0.3)

In [72]:
# X_train.shape
# X_test.shape
# y_train.shape
# y_test.shape

(35000,)

In [93]:
# Start Neural Network
network = models.Sequential()

In [94]:
# Adding fully connected input layer with a ReLU activation function
network.add(layers.Dense(units=100,
                         activation='relu',
                         input_shape=(number_of_features,)))

In [95]:
# Add fully connected layer with ReLU activation function
network.add(layers.Dense(units=200, activation="relu"))

In [96]:
# Add fully connected layer with a softmax activation function
network.add(layers.Dense(units=output_activation-1, activation="softmax"))

In [97]:
# Compile neural network
network.compile(loss="categorical_crossentropy", # Cross-entropy
                optimizer="rmsprop", # Root Mean Square Propagation
                metrics=["accuracy"]) # Accuracy performance metric

In [98]:
# Train neural network
history = network.fit(X_train, # Features
                      y_train, # Target
                      epochs=3, # Three epochs
                      verbose=1, # show output
                      batch_size=100, # Number of observations per batch
                      validation_data=(X_test, y_test)) # Test data

Epoch 1/3
Epoch 2/3
Epoch 3/3


In [99]:
# Saving the Model

model_path="/home/arindam/Documents/mygithub/bu_dsc/models"
model_name="NN_keras.h5"
filename = model_path + "/" + model_name 
# print(filename)
network.save(filename)

In [100]:
# Load a saved model
from keras.models import load_model
NN_clf = load_model(filename)


In [101]:
# Predicting the test set for the classifier
y_pred = NN_clf.predict(X_test)
# y_pred

In [102]:
# Displaying the result metrics
from sklearn.metrics import classification_report,confusion_matrix,accuracy_score

print("Confusion Matrix")
print("================")
print(confusion_matrix(y_test,y_pred))
print("Classification Report")
print("=====================================================")

print(classification_report(y_test,y_pred))
print("Accuracy Score")
print("=====")

print(accuracy_score(y_test, y_pred))

Confusion Matrix
[[1038    0]
 [ 462    0]]
Classification Report
              precision    recall  f1-score   support

           1       0.69      1.00      0.82      1038
           2       0.00      0.00      0.00       462

    accuracy                           0.69      1500
   macro avg       0.35      0.50      0.41      1500
weighted avg       0.48      0.69      0.57      1500

Accuracy Score
=====
0.692
