In [4]:
#import json library to read data in jsonl file
import json
#import pandas library
import pandas as pd
#import numpy library
import numpy as np
#import regular expressions library
import re
#import nltk
import nltk
#import stopwords from NLTK
from nltk.corpus import stopwords
#import word tokenizer from NLTK
from nltk.tokenize import word_tokenize
#import Part-of-Speech tagger
from nltk import pos_tag
#import sklearn
import sklearn
#import word count vectorizer from sklearn
from sklearn.feature_extraction.text import CountVectorizer
#import tf-idf vectorizer from sklearn
from sklearn.feature_extraction.text import TfidfVectorizer
#import the one-hot encoding package from sklearn
from sklearn.preprocessing import MultiLabelBinarizer

In [5]:
#report the installed version of each core package so the run can be reproduced
for label, package in (
    ('pandas version:', pd),
    ('numpy version:', np),
    ('scikit-learn version:', sklearn),
    ('NLTK version:', nltk),
):
    print(label, package.__version__)

pandas version: 1.2.4
numpy version: 1.19.2
scikit-learn version: 0.23.2
NLTK version: 3.5


## Read in the Data

In [6]:
#read the JSON Lines file (lines = True: one JSON record per line) into a dataframe
#NOTE(review): the path starts at the current user's home directory ("~"), so it only
#resolves on a machine that has this exact folder layout
filename = "~/Documents/mygithub/bu_dsc/data/raw/controversial-comments.jsonl"
df_all = pd.read_json(filename, lines = True)
#display the first few rows of data
df_all.head()

Unnamed: 0,con,txt
0,0,Well it's great that he did something about th...
1,0,You are right Mr. President.
2,0,You have given no input apart from saying I am...
3,0,I get the frustration but the reason they want...
4,0,I am far from an expert on TPP and I would ten...


## Preprocessing the text

In [7]:
#Parts A and B: Convert text to lowercase and remove punctuation
#define a function to clean the text
def clean_text(text):
    """
    Lowercase the text and blank out everything that is not a letter a-z.
    Whitespace is not collapsed or trimmed, so the result may contain
    leading, trailing or repeated spaces.
    Args: text - string to clean
    Output: cleaned string
    """
    lowered = text.lower()
    #each digit, each run of non-word characters and each underscore becomes one space
    spaced = re.sub('\\d|\\W+|_', ' ', lowered)
    #any character still outside a-z (e.g. an accented letter) becomes a space as well
    return re.sub('[^a-zA-Z]', " ", spaced)

In [8]:
#Part C: Remove stop words (and tokenize)
#define a function to tokenize the text and remove stop words
#use the nltk package for tokenizing and removing stop words
#Note: You may have to run this next command to download the NLTK 'punkt' library for the first time
#nltk.download('punkt')
#Note: You may need to run this next command to download stopwords for the first time
#nltk.download('stopwords')

_english_stop_words = None  #filled in on first use, then reused for every comment

def _get_english_stop_words():
    """
    Return NLTK's English stop words as a frozenset.
    The corpus is read only on the first call; later calls reuse the cached set.
    """
    global _english_stop_words
    if _english_stop_words is None:
        #a frozenset gives constant-time membership tests instead of scanning a list
        _english_stop_words = frozenset(stopwords.words('english'))
    return _english_stop_words

def tokenize_and_remove_stop_words(txt):
    """
    Tokenize a text into words and drop English stop words.
    Args: txt - string to tokenize; stop words are matched exactly
          (case-sensitive), so pass text that has already been lowercased
    Output: list of the tokens that are not stop words, in their original order
    """
    stop_words = _get_english_stop_words()
    return [word for word in word_tokenize(txt) if word not in stop_words]

In [9]:
#Part D: Apply NLTK's PorterStemmer
#define a function to stem the words
from nltk.stem.porter import PorterStemmer

def stem_text(word_list):
    """
    Reduce every token to its Porter stem.
    Args: word_list - iterable of word tokens
    Output: list holding the stem of each token, in the same order
    """
    stemmer = PorterStemmer()
    return list(map(stemmer.stem, word_list))

In [10]:
#take a random sample of the dataframe to cut down on processing time
#number of comments to keep
num_comments = 50000
#fixed seed so the same comments are drawn on every run; without it every
#downstream result (e.g. the vocabulary size) changes from run to run
random_seed = 42
df_sample = df_all.sample(n = num_comments, random_state = random_seed).reset_index(drop = True)
df_sample.head()

Unnamed: 0,con,txt
0,0,"Nothing or Anyone Bill Clinton has done, is do..."
1,0,[deleted]
2,0,Consarnit!
3,0,Not only bringing balance to the force but bri...
4,0,I see you're a frequent poster on t_d. Lovely ...


In [11]:
#run the pre-processing steps in order; each step reads one column of
#df_sample and stores its result in a new column
preprocessing_steps = [
    ('txt', 'txt_clean', clean_text),                                #clean the text
    ('txt_clean', 'txt_tokenized', tokenize_and_remove_stop_words),  #tokenize and remove stop words
    ('txt_tokenized', 'txt_stemmed', stem_text),                     #apply the PorterStemmer
    ('txt_stemmed', 'txt_final', ' '.join),                          #put the text back together (untokenize)
]
for source_col, target_col, step in preprocessing_steps:
    df_sample[target_col] = df_sample[source_col].apply(step)
#view the pre-processed text
df_sample.head()

Unnamed: 0,con,txt,txt_clean,txt_tokenized,txt_stemmed,txt_final
0,0,"Nothing or Anyone Bill Clinton has done, is do...",nothing or anyone bill clinton has done is doi...,"[nothing, anyone, bill, clinton, done, bearing...","[noth, anyon, bill, clinton, done, bear, want,...",noth anyon bill clinton done bear want shape n...
1,0,[deleted],deleted,[deleted],[delet],delet
2,0,Consarnit!,consarnit,[consarnit],[consarnit],consarnit
3,0,Not only bringing balance to the force but bri...,not only bringing balance to the force but bri...,"[bringing, balance, force, bringing, peace, de...","[bring, balanc, forc, bring, peac, decad, old,...",bring balanc forc bring peac decad old feud or...
4,0,I see you're a frequent poster on t_d. Lovely ...,i see you re a frequent poster on t d lovely p...,"[see, frequent, poster, lovely, person]","[see, frequent, poster, love, person]",see frequent poster love person


In [12]:
#check the dimensions of the dataframe: one row per sampled comment,
#one column per original field or pre-processing step
df_sample.shape

(50000, 6)

Note that the dataframe has 50,000 rows (comments). Because we want to make a prediction about each comment, our input for modeling should also have 50,000 rows. A good way to make sure the following steps are working properly is to check the dimensions of each output array, which we know should have 50,000 rows. If there are not 50,000 rows, something was not done correctly.

## Apply the Word Count Vectorizer

In [13]:
#create the bag of words feature matrix
#fit_transform learns the vocabulary from the pre-processed text and then
#counts how often each vocabulary term occurs in each comment
count = CountVectorizer()
bag_of_words = count.fit_transform(df_sample['txt_final'])

In [14]:
#check the shape of the output
#every comment must keep its own row, otherwise features and labels no longer line up
assert bag_of_words.shape[0] == len(df_sample), "bag-of-words row count does not match the number of comments"
bag_of_words.shape

(50000, 24354)

In [15]:
# Note that there are 50,000 rows as expected, and the 24,354 columns correspond to the unique words (after stemming) in the comments.

## Apply the TFIDF Vectorization

In [16]:
#fit a tf-idf vectorizer on the pre-processed text and build the tf-idf feature matrix
tfidf = TfidfVectorizer()
tfidf_matrix = tfidf.fit_transform(df_sample['txt_final'])

In [17]:
#check the shape of the output
#every comment must keep its own row, otherwise features and labels no longer line up
assert tfidf_matrix.shape[0] == len(df_sample), "tf-idf row count does not match the number of comments"
tfidf_matrix.shape

(50000, 24354)

Note that the TFIDF matrix has the same shape as the word count matrix. This makes sense, as the number of columns in both matrices corresponds to the
number of unique words. Whereas the word count vectorizer only counts the number of times each word appears, the TFIDF vectorization weighs how
important each word is in each comment.

## Part-of-Speech Tagging

In [18]:
#apply the part-of-speech tagging
#note we are applying this to the tokenized (not yet stemmed) comments
#each comment becomes a list of (word, tag) tuples
pos_matrix = df_sample['txt_tokenized'].apply(pos_tag)

In [19]:
#print the stemmed, re-joined text of the comment at index 1 (the second row)
print(df_sample['txt_final'][1])
#show the part-of-speech entry for that same comment (tagged from the unstemmed tokens)
print(pos_matrix[1])

delet
[('deleted', 'VBN')]


Note that it takes in the tokenized comment and attaches a part-of-speech to each word. E.g., 'NN' is a noun and 'JJ' is an adjective. There was some discussion of
whether the part-of-speech tagging should be applied to the non-stemmed text, as stemming can change the part-of-speech. So I applied the POS tagging to the non-stemmed words. But keep this in mind.

This matrix is not numerical and thus cannot be used as input to a model. To complete the process, we will apply one-hot encoding to this matrix to be used for input to a model. Again, the number of rows in this matrix should still be 50,000 but the number of columns will correspond to each different part-of-speech in the comments.

In [20]:
#get the tags only for the one-hot-encoding
#the loop variable must not be named pos_tag: that would overwrite NLTK's
#pos_tag function and break the tagging cell if it is run again
tags = [[tag for _, tag in tagged_comment] for tagged_comment in pos_matrix]

In [21]:
#Let's display what this did
#print the part-of-speech entry for the comment at index 1 (the second row)
print('Words and Parts-of-Speech:', pos_matrix[1])
#print the tags-only entry for the same comment
print('Parts-of-Speech Only:', tags[1])

Words and Parts-of-Speech: [('deleted', 'VBN')]
Parts-of-Speech Only: ['VBN']


In [22]:
#initialize the encoder (it produces one column per distinct tag)
one_hot_multi = MultiLabelBinarizer()
#encode the pos tags: a column is 1 when that tag occurs in the comment and 0 otherwise
#(presence only; how many times the tag occurs is not recorded)
pos_num_matrix = one_hot_multi.fit_transform(tags)

In [23]:
#Let's look at the shape and classes of the output matrix
print(pos_num_matrix.shape)
#classes_ lists the tag that each column stands for, in column order
print(one_hot_multi.classes_)

(50000, 36)
["''" 'CC' 'CD' 'DT' 'EX' 'FW' 'IN' 'JJ' 'JJR' 'JJS' 'MD' 'NN' 'NNP'
 'NNPS' 'NNS' 'PDT' 'POS' 'PRP' 'PRP$' 'RB' 'RBR' 'RBS' 'RP' 'SYM' 'TO'
 'UH' 'VB' 'VBD' 'VBG' 'VBN' 'VBP' 'VBZ' 'WDT' 'WP' 'WP$' 'WRB']


Again, as expected, we have 50,000 rows. There are 36 columns corresponding to the different parts-of-speech appearing in the comments. The classes shown correspond to each of the columns in the part-of-speech numerical matrix.