In [None]:
'''
Using the multi-label classifier dataset from earlier exercises (categorized-comments.jsonl in the reddit folder), fit a neural network classifier using scikit-learn. Use the code found in chapter 12 of the Applied Text Analysis with Python book as a guideline. Report the accuracy, precision, recall, F1-score, and confusion matrix.
'''


In [None]:
#import json library to read data in jsonl file
import json
#import pandas library
import pandas as pd

In [None]:
# Print the pandas version in use so the run environment is recorded.
print('pandas version:', pd.__version__)
# The checks below are disabled. In the cells shown here the names `np`,
# `sklearn` and `nltk` are never bound (only `from ... import ...` forms are
# used for scikit-learn and NLTK), so enabling these lines as-is would raise
# NameError.
# print('numpy version:', np.__version__)
# print('scikit-learn version:', sklearn.__version__)
# print('NLTK version:', nltk.__version__)

In [None]:
# Read the JSON Lines file (one JSON object per line) into a DataFrame.
# NOTE(review): this is a machine-specific absolute path; point it at your own
# copy of the dataset before running.
filename = "/home/arindam/Documents/mygithub/bu_dsc/data/raw/categorized-comments.jsonl"

# Parse each line while streaming the file, so the raw text of every line is
# not kept in memory alongside the parsed records. Whitespace-only lines (for
# example a trailing blank line) are skipped instead of making json.loads
# raise. JSON text is read as UTF-8 explicitly rather than relying on the
# platform's default encoding.
with open(filename, 'r', encoding='utf-8') as f:
    list1 = [json.loads(line) for line in f if line.strip()]

comments = pd.DataFrame(list1)

#display the first few rows of data
comments.head()


In [None]:
# Report the (rows, columns) shape of the loaded frame and its row count;
# each row is one comment.
frame_shape = comments.shape
print('The dataframe has a dimension of:', frame_shape)
print('It has {} comments'.format(frame_shape[0]))

In [None]:
# List the distinct category labels and report how many there are. The count
# is computed from the data instead of being hard-coded, so the message stays
# correct if the dataset changes.
target_names = comments['cat'].unique()
print('The target names are :', target_names)
print('This shows that there are only {} categories in the total dataset'.format(len(target_names)))

In [None]:
#Convert text to lowercase and remove punctuation
#define a function to clean the text
# import the required libraries here
#import regular expressions library
import re

def clean_text(text):
    """
    Lower-case a string and blank out everything that is not an ASCII letter.

    Args:
        text: the raw string to normalise.

    Returns:
        The lower-cased string in which each digit, each underscore and each
        run of non-word characters has been replaced by one space, and any
        character still outside a-z / A-Z (for example an accented letter)
        has then been replaced by a space as well. Spaces are not collapsed
        afterwards, so consecutive spaces can remain in the result.
    """
    lowered = text.lower()
    # First pass: digits, runs of non-word characters and underscores -> ' '.
    spaced = re.sub('\\d|\\W+|_', ' ', lowered)
    # Second pass: whatever is still not an ASCII letter -> ' '.
    return re.sub('[^a-zA-Z]', " ", spaced)

In [None]:
#import word tokenizer from NLTK
from nltk.tokenize import TreebankWordTokenizer
from nltk.corpus import stopwords

# Stop-word set and tokenizer are created on the first call and reused after
# that; the function is applied once per comment, so rebuilding them on every
# call repeats the same work for each row.
_TOKENIZE_RESOURCES = {}


def tokenize_text(txt):
    """
    Split a string into Treebank-style tokens and drop English stop words.

    Args:
        txt: the string to tokenize.

    Returns:
        list of str: the tokens produced by NLTK's TreebankWordTokenizer, in
        their original order, excluding every token that appears in NLTK's
        English stop-word list. Matching is an exact string comparison.
    """
    if not _TOKENIZE_RESOURCES:
        # A frozenset gives constant-time membership tests; the list returned
        # by stopwords.words() would be scanned linearly for every token.
        _TOKENIZE_RESOURCES['stop_words'] = frozenset(stopwords.words('english'))
        _TOKENIZE_RESOURCES['tokenizer'] = TreebankWordTokenizer()
    stop_words = _TOKENIZE_RESOURCES['stop_words']
    tokens = _TOKENIZE_RESOURCES['tokenizer'].tokenize(txt)
    return [token for token in tokens if token not in stop_words]

In [None]:
#Apply NLTK's PorterStemmer
#define a function to stem the words
from nltk.stem.porter import PorterStemmer

def porter_stem_text(token_list):
    """
    Stem every token with NLTK's PorterStemmer and join the results.

    Args:
        token_list: iterable of token strings.

    Returns:
        str: the stemmed tokens, in input order, joined by single spaces.
    """
    stemmer = PorterStemmer()
    stems = [stemmer.stem(word) for word in token_list]
    return " ".join(stems)

In [None]:
#Apply NLTK's WordNetLemmatizer
#define a function to lemmatize the words
from nltk.stem import WordNetLemmatizer

def lemmatize_text(token_list):
    """
    Lemmatize every token with NLTK's WordNetLemmatizer and join the results.

    Args:
        token_list: iterable of token strings.

    Returns:
        str: the lemmatized tokens, in input order, joined by single spaces.
    """
    wordnet = WordNetLemmatizer()
    lemmas = [wordnet.lemmatize(word) for word in token_list]
    return " ".join(lemmas)

In [None]:
# Testing the functions on the first 5000 comments.
# .copy() makes sample_cmnts an independent frame: adding columns to a bare
# slice of `comments` triggers pandas' SettingWithCopyWarning.
sample_cmnts = comments[:5000].copy()
# creating a dictionary to replace the string values to numeric
d = {'sports':1,'science_and_technology':2,'video_games':3}
# Categories missing from `d` become NaN in 'ncat'.
sample_cmnts['ncat'] = sample_cmnts['cat'].map(d)
sample_cmnts

In [None]:
# Run the text pipeline on the sample. Each step reads the column produced by
# the previous one, so the order of the tuples matters:
# txt -> cleaned -> tokenized -> stemmed.
for new_col, src_col, step in (
    ('cleaned', 'txt', clean_text),
    ('tokenized', 'cleaned', tokenize_text),
    ('stemmed', 'tokenized', porter_stem_text),
):
    sample_cmnts[new_col] = sample_cmnts[src_col].apply(step)
sample_cmnts

In [None]:
# Same text pipeline as for the sample, applied to the full frame. Each step
# consumes the column written by the step before it.
for new_col, src_col, step in (
    ('cleaned', 'txt', clean_text),
    ('tokenized', 'cleaned', tokenize_text),
    ('stemmed', 'tokenized', porter_stem_text),
):
    comments[new_col] = comments[src_col].apply(step)
comments

In [None]:
# Not the last expression of the cell, so this value is computed but not shown.
comments['cat'].unique()
# Numeric code for each category label: codes start at 1 and follow the order
# of the labels listed here.
d = {
    label: code
    for code, label in enumerate(
        ('sports', 'science_and_technology', 'video_games'), start=1
    )
}
# Labels that are not keys of `d` become NaN in 'ncat'.
comments['ncat'] = comments['cat'].map(d)
comments

In [None]:
# Get the target name
from sklearn.model_selection import train_test_split

# Model input is the stemmed text of the 5000-comment sample; the label is
# the numeric category code.
features = sample_cmnts['stemmed']
target = sample_cmnts['ncat']

In [None]:
# Hold out 20% of the rows for testing; random_state fixes the shuffle so the
# split is the same on every run.
features_train, features_test, target_train, target_test = train_test_split(
    features,
    target,
    test_size=0.2,
    random_state=0,
)

# Report the size of each of the four parts.
for label, part in (
    ('Features-Training Set: ', features_train),
    ('Features-Test Set: ', features_test),
    ('Target: Training Set: ', target_train),
    ('Target: Test Set: ', target_test),
):
    print(label, len(part))

In [None]:
# Extracting features from text
# A document is an ordered series of words, but ML algorithms need numerical
# feature vectors. The "bag of words" model is used here: each unique word in
# the learned vocabulary corresponds to one feature.
from sklearn.feature_extraction.text import CountVectorizer

count_vect = CountVectorizer()
# fit_transform learns the vocabulary dictionary from the training text and
# returns the document-term matrix in a single step.
X_features_train = count_vect.fit_transform(features_train)
print('Shape of the feature set:', X_features_train.shape)

In [None]:
# TF (term frequency) divides each word count by the total number of words in
# the document, which avoids giving more weight to longer documents than to
# shorter ones.
# TF-IDF additionally reduces the weight of words that are common across
# documents (e.g. "the", "is", "an").
from sklearn.feature_extraction.text import TfidfTransformer

tfidf_transformer = TfidfTransformer()
# Learn the IDF weights from the training counts and re-weight them.
X_features_train_tfidf = tfidf_transformer.fit_transform(X_features_train)
print('Shape of the tfidf feature matrix:', X_features_train_tfidf.shape)

In [None]:
# Show the tf-idf training matrix as a table with one column per vocabulary
# term. This cell previously read `y_target_train`, a name that is not defined
# in any cell shown here, so it raised NameError.
# NOTE(review): the tf-idf matrix is assumed to be the intended input; use
# X_features_train instead if the raw counts were meant.

# get_feature_names() was removed in recent scikit-learn releases in favour of
# get_feature_names_out(); fall back to the old name on versions without it.
if hasattr(count_vect, 'get_feature_names_out'):
    feature_names = count_vect.get_feature_names_out()
else:
    feature_names = count_vect.get_feature_names()

# toarray() densifies the sparse matrix into a plain ndarray (rows x vocabulary).
df_feature = pd.DataFrame(
    X_features_train_tfidf.toarray(),
    columns=feature_names,
)
df_feature

In [None]:

from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.neural_network import MLPRegressor, MLPClassifier
from sklearn.feature_extraction.text import CountVectorizer


# Bag-of-words counts -> tf-idf weights -> multi-layer perceptron regressor
# with two hidden layers (500 and 150 units). The regressor treats the numeric
# category codes as a continuous target.
# random_state fixes the network's weight initialisation and batch shuffling
# so repeated runs give the same model; 0 matches the seed used for the
# train/test split.
regressor = Pipeline([('vect', CountVectorizer()),
                      ('tfidf', TfidfTransformer()),
                      ('ann', MLPRegressor(hidden_layer_sizes=[500,150], random_state=0, verbose=True)),
                      ])
# fit() returns the fitted pipeline itself.
regressor = regressor.fit(features_train, target_train)

In [None]:
# Bag-of-words counts -> tf-idf weights -> multi-layer perceptron classifier
# with two hidden layers (500 and 150 units).
# random_state fixes the network's weight initialisation and batch shuffling
# so the reported metrics are reproducible; 0 matches the seed used for the
# train/test split.
classifier = Pipeline([('vect', CountVectorizer()),
                       ('tfidf', TfidfTransformer()),
                       ('ann', MLPClassifier(hidden_layer_sizes=[500,150], random_state=0, verbose=True))
])
# fit() returns the fitted pipeline, so `clf` and `classifier` are the same object.
clf = classifier.fit(features_train, target_train)

In [None]:
# Predicting the test set with the regressor.
# The result is kept under its own name: the regressor returns continuous
# values, while `y_pred` is the name the metrics cell reads and must hold the
# classifier's class predictions. Sharing one name made the metrics depend on
# which prediction cell happened to run last.
y_pred_regressor = regressor.predict(features_test)
y_pred_regressor

In [None]:
# Predicting the test set for the classifier.
# `y_pred` holds the predicted numeric category code for each held-out comment
# and is the input to the metrics printed in the final cell.
y_pred = clf.predict(features_test)
y_pred

In [None]:
# Displaying the result metrics
from sklearn.metrics import classification_report,confusion_matrix,accuracy_score

# Print, in this order: the confusion matrix, the per-class
# precision / recall / F1 report, and the overall accuracy. All three compare
# the true test labels with the predictions in y_pred.
for metric in (confusion_matrix, classification_report, accuracy_score):
    print(metric(target_test, y_pred))