In [1]:
# Ramsey King
# DSC 550 - Data Mining
# November 7, 2021
# Exercise 9.3

In [2]:
import pandas as pd
import numpy as np


data = pd.read_json('categorized-comments.jsonl', lines=True)
data.head(1)



Unnamed: 0,cat,txt
0,sports,Barely better than Gabbert? He was significant...


In [3]:
category_list = list(set(data['cat']))
category_list

['science_and_technology', 'sports', 'video_games']

1. Neural Network Classifier with Scikit

Using the multi-label classifier dataset (categorized-comments.jsonl), fit a neural network classifier using scikit-learn to predict the comment category. Use the code found in chapter 12 of the Applied Text Analysis with Python book as a guide, but you will need to modify the code for this dataset. Report the accuracy, precision, recall, F1-score, and confusion matrix.

In [5]:

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize
stop_words = set(stopwords.words('english'))


def documents(corpus_text):
    # retrieves the pickled, part-of-speech tagged documents from our corpus reader object
    # Need to make a column that is part-of-speech tagged
    tokenized = sent_tokenize(corpus_text)
    for i in tokenized:
      
        # Word tokenizers is used to find the words 
        # and punctuation in a string
        wordsList = nltk.word_tokenize(i)
    
        # removing stop words from wordList
        # wordsList = [w for w in wordsList if not w in stop_words] 
        # wordsList = [w for w in wordsList] 
    
        #  Using a Tagger. Which is part-of-speech 
        # tagger or POS-tagger. 
        tagged = nltk.pos_tag(wordsList)
  
        return tagged

def continuous(corpus):
    # to get the numeric ratings of each album
    return list(corpus.scores())

def make_categorical(corpus_cat):
    cat_dictionary = {
        'science_and_technology': 1,
        'sports': 2,
        'video_games': 3,
    }

    return cat_dictionary.get(corpus_cat)
    
data['cat_num'] = data['cat'].apply(make_categorical)
data['tokenized_txt'] = data['txt'].apply(lambda x: documents(x))
print(data['tokenized_txt'][:-10])


0         [(Barely, RB), (better, JJR), (than, IN), (Gab...
1         [(Fuck, IN), (the, DT), (ducks, NNS), (and, CC...
2         [(Should, MD), (have, VB), (drafted, VBN), (mo...
3         [([, NN), (Done, NNP), (], NNP), ((, (), (http...
4                                [(No, DT), (!, .), (!, .)]
                                ...                        
606461                                       [(touche, NN)]
606462    [(Not, RB), (me, PRP), (,, ,), (I, PRP), (coul...
606463    [(you, PRP), (obviously, RB), (do, VBP), (n't,...
606464    [(I, PRP), ('m, VBP), (not, RB), (an, DT), (an...
606465    [(Does, VBZ), (it, PRP), (have, VB), (gnomes, ...
Name: tokenized_txt, Length: 606466, dtype: object


We will get the scores required to satsify the exercise requirements:

In [None]:
from sklearn.externals import joblib
from sklearn.model_selection import cross_val_score

def train_model(path, model, continuous=True, saveto=None, cv=12):
    """
    Trains model from corpus at specified path; constructing cross-validation
    scores using the cv parameter, then fitting the model on the full data.
    Returns the scores.
    """
    # Load the corpus data and labels for classification
    corpus = PickledReviewsReader(path)
    X = documents(corpus)
    if continuous:
        y = continuous(corpus)
        scoring = 'r2_score'
    else:
        y = make_categorical(corpus)
        scoring = 'f1_score'

    # Compute cross-validation scores
    scores = cross_val_score(model, X, y, cv=cv, scoring=scoring)

    # Write to disk if specified
    if saveto:
        joblib.dump(model, saveto)

    # Fit the model on entire dataset
    model.fit(X, y)

    # Return scores
    return scores

In [None]:

from transformer import TextNormalizer
from reader import PickledReviewsReader

from sklearn.pipeline import Pipeline
from sklearn.neural_network import MLPRegressor, MLPClassifier
from sklearn.feature_extraction.text import TfidfVectorizer

# Path to postpreprocessed, part-of-speech tagged review corpus
cpath = data['tokenized_txt']

regressor = Pipeline([
    ('norm', TextNormalizer()),
    ('tfidf', TfidfVectorizer()),
    ('ann', MLPRegressor(hidden_layer_sizes=[500,150], verbose=True))
])
regression_scores = train_model(cpath, regressor, continuous=True)

classifier = Pipeline([
    ('norm', TextNormalizer()),
    ('tfidf', TfidfVectorizer()),
    ('ann', MLPClassifier(hidden_layer_sizes=[500,150], verbose=True))
])
classifer_scores = train_model(cpath, classifier, continuous=False)