# HW 2 - PROGRAMMING EXERCISES - Binary Classification on Text Data

## Import Packages

In [54]:
# import packages
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib
import string
import seaborn as sns
# import nltk
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk import word_tokenize, pos_tag
# import sklearn
from sklearn.metrics import f1_score
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import Ridge
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split

%matplotlib inline
import warnings
warnings.filterwarnings('ignore')

## (a) Download the data
I downloaded the data set and successfully imported it by implementing following code:

(1) how many training and test data points are there? 

7613 in the trainning set and 3263 in the testing set. 

In [55]:
# load datasets
df_test  = pd.read_csv("test.csv");
df_train = pd.read_csv("train.csv");

# check data
df_train.head(5)

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [56]:
# check data
df_test.head(5)

Unnamed: 0,id,keyword,location,text
0,0,,,Just happened a terrible car crash
1,2,,,"Heard about #earthquake is different cities, s..."
2,3,,,"there is a forest fire at spot pond, geese are..."
3,9,,,Apocalypse lighting. #Spokane #wildfires
4,11,,,Typhoon Soudelor kills 28 in China and Taiwan


In [57]:
# basic insepction 
print(df_train.shape[0], df_test.shape[0])

7613 3263


(2) what percentage of the training tweets are of real disasters, and what percentage is not?

42.97% are of real disasters, 57.03% are not.

In [58]:
# percentage of the trainning tweets are of real disasters
df_train.loc[df_train['target'] == 1].shape
percentage_real = df_train.loc[df_train['target'] == 1].shape[0] / df_train.shape[0]
print("Percentage of the training tweets are of real disasters: " + str(percentage_real * 100) + "%.")

Percentage of the training tweets are of real disasters: 42.96597924602653%.


In [59]:
# percentage of the trainning tweets are of not real disasters
percentage_fake = df_train.loc[df_train['target'] == 0].shape[0] / df_train.shape[0]
print("Percentage of the training tweets are NOT of real disasters: " + str(percentage_fake * 100) + "%.")

Percentage of the training tweets are NOT of real disasters: 57.03402075397347%.


## (b) Split the training data

In [60]:
# Randomly choose 70% of the data points in the training data as the training set
# and the remaining 30% of the data as the development set
train, dev = train_test_split(df_train, test_size=0.3)
train.shape, dev.shape

((5329, 5), (2284, 5))

## (c) Preprocess the data
The data sets contain significant amounts of noise and unprocessed content. Data cleaning and pre-processing methods would be applied. I explained the reasons for each of my decision (why or why not) in the following paragraph: 
* Convert all the words to lowercase: Yes. I think whether the word is lowercase or uppercase doesn't contribute to the classification in the context of our problem. And users (especially Twitter users) may not strictly follow the grammar and use lowercase and uppercase arbitrarily. In this sense, building the vector based on cases is meaningless and makes our operations inefficient. I chose to convert all the words to lowercase. 
* Lemmatize all the words: Yes. I think the tense of words doesn't contribute much to the classification in the context of our problem. So I lemmatized all the words based on their POS tags using WordNetLemmatizer from the nltk library. I didn't use nltk.stem since stemming may create the non-existence meaning of a word (eg. from "causes" to "caus"). But lemmatization always gives the dictionary meaning word while converting into root-form. 
* Strip punctuation: Yes. I think punctuation barely contributes to our analysis of text. Users' choices of punctuation don't impact the meaning of their tweets. So I stripped punctuation using the Python built-in library. 
* Strip the stop words, e.g., “the”, “and”, “or”: Yes. Stop words don't not add much information to the text and should be filtered out.
* Strip @ and urls. (It’s Twitter.): Yes for ulrs, and No for @. I stripped urls since they are references to a location on the web, but do not provide any additional information. It's meaningless to do NLP and get conclusion from these non-semantic urls. And urls are lengthy, thus causing inefficiency while getting bag of words. I use the re library that provides regular expression matching operations to strip them. I chose not to strip @ since mention of certain users may contribute the our classification problem. For instance, it's possible that mentioning @earthquakeBot may be a feature that contribute to tweets that are real disasters since the followers of this account closely follow this topic (this is just an assumption). 
* Something else: Strip the HTML tags - I found some HTML tags like &amp in the text. They do not add any value to text data and only enable proper browser rendering. So I removed them using re library. 

In [61]:
# import packages from nltk and re
from nltk import word_tokenize, pos_tag
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer
import re # Regex library
nltk.download('averaged_perceptron_tagger')
nltk.download('punkt')
nltk.download('stopwords')

# Get pos tags
def get_wordnet_pos(treebank_tag):
    if treebank_tag.startswith('J'):
        return wordnet.ADJ
    elif treebank_tag.startswith('V'):
        return wordnet.VERB
    elif treebank_tag.startswith('N'):
        return wordnet.NOUN
    elif treebank_tag.startswith('R'):
        return wordnet.ADV
    else:
        return None
    
# Lemmatize all words
def Lemmatize_Sentence(sentence):
    res = []
    lemmatizer = WordNetLemmatizer()
    for word, pos in pos_tag(word_tokenize(sentence)):
        wordnet_pos = get_wordnet_pos(pos) or wordnet.NOUN
        res.append(lemmatizer.lemmatize(word, pos=wordnet_pos))
    return " ".join(res)

# Strip Punctuation
def Strip_Punct(text):
    table = str.maketrans('', '', string.punctuation)
    return text.translate(table)

# Strip Stop words
stopword = stopwords.words('english')
def Strip_StopWord(string):
    string_list = string.split()
    return ' '.join(i for i in string_list if i not in stopword)

# Strip Urls
def Strip_Url(string):
    return re.sub(r'(https|http)?:\/\/(\w|\.|\/|\?|\=|\&|\%|\-)*\b', '', string)

#  Strip HTML Tags
def Strip_HTML(text):
    html = re.compile(r'<.*?>|&([a-z0-9]+|#[0-9]{1,6}|#x[0-9a-f]{1,6});')
    return re.sub(html, '', text)

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/ryleeli/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package punkt to /Users/ryleeli/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/ryleeli/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [62]:
# Convert all the words to lowercase, then apply all the above processing methods
train['text'] = train['text'].str.lower().apply(Lemmatize_Sentence).apply(Strip_Punct).apply(Strip_StopWord).apply(Strip_Url).apply(Strip_HTML)
dev['text'] = dev['text'].str.lower().apply(Lemmatize_Sentence).apply(Strip_Punct).apply(Strip_StopWord).apply(Strip_Url).apply(Strip_HTML)

## (d) Bag of words model
I chose threshold $M = 3$ to avoid run-time and memory issues, and to avoid noisy/unreliable features that can hurt learning. Intuitively, I was thinking about $M\in[2, 10]$. If $M$ was too small, a run-time and memory issues may occur since we may have very high-dimensional vectors. And too many features may cause over-fitting and bad generalizes. If $M$ was too large, fewer words/features were selected, thus leading to a poor-performed model that cannot generalize well to new data. Then I ran a test for $M\in[2, 10]$ with the implementation of regularized logistic regression, and I decided to choose $M = 3$ as it reflects the highest F1-score. 

In [93]:
# Choose threshold to avoid run-time and memory issues, and to avoid noisy/unreliable features that can hurt learning
threshold = 3
# Build the bag of words feature vectors
def cv(data):
    count_vectorizer = CountVectorizer(min_df = threshold, binary=True) 
    emb = count_vectorizer.fit_transform(data)
    return emb, count_vectorizer
# Tokenizing the pre-processed clean texts
train['tokenized'] = train["text"].tolist()
dev['tokenized'] = dev["text"].tolist()
# Build the bag of words feature vectors for both the training and development sets
train_bag_of_words, train_cv = cv(train['tokenized'])
dev_bag_of_words = train_cv.transform(dev['tokenized'])
train_words = train_cv.get_feature_names()
# Report the total number of features in these vectors
print("Total number of features in these vectors: "+ str(len(train_words)))

Total number of features in these vectors: 2917


## (e) Logistic regression
### (e.i) Logistic regression without regularization terms
I observed issues with overfitting. This is a very expressive model that fits the training dataset perfectly but makes highly incorrect predictions outside this dataset, and doesn't generalize.

In [115]:
# Train a logistic regression model without regularization terms
LR0_Model = LogisticRegression(penalty='none')
LR0_Model.fit(train_bag_of_words, train['target'])
train_predicted_LR0 = LR0_Model.predict(train_bag_of_words)
dev_predicted_LR0 = LR0_Model.predict(dev_bag_of_words)
#  Report the F1 score in the training and in the development set
print("Logistic regression without regularization terms - F1 Score on training set: " + str(f1_score(train_predicted_LR0, train['target'])))
print("Logistic regression without regularization terms - F1 Score on development set: " + str(f1_score(dev_predicted_LR0, dev['target'])))

Logistic regression without regularization terms - F1 Score on training set: 0.9804177545691906
Logistic regression without regularization terms - F1 Score on development set: 0.68407835258664


### (e.ii) Logistic regression with L1 regularization

In [116]:
# Train a logistic regression model with L1 regularization
LR_L1_Model = LogisticRegression(penalty='l1', solver='liblinear')
LR_L1_Model.fit(train_bag_of_words, train['target'])
train_predicted_LR_L1 = LR_L1_Model.predict(train_bag_of_words)
dev_predicted_LR_L1 = LR_L1_Model.predict(dev_bag_of_words)
#  Report the F1 score in the training and in the development set
print("Logistic regression with L1 regularization - F1 Score on training set: " + str(f1_score(train_predicted_LR_L1, train['target'])))
print("Logistic regression with L1 regularization - F1 Score on development set: " + str(f1_score(dev_predicted_LR_L1, dev['target'])))

Logistic regression with L1 regularization - F1 Score on training set: 0.8439814814814816
Logistic regression with L1 regularization - F1 Score on development set: 0.7417366946778711


### (e.iii) Logistic regression with L2 regularization

In [117]:
# Train a logistic regression model with L2 regularization
LR_L2_Model = LogisticRegression(penalty='l2', solver='liblinear')
LR_L2_Model.fit(train_bag_of_words, train['target'])
train_predicted_LR_L2 = LR_L2_Model.predict(train_bag_of_words)
dev_predicted_LR_L2 = LR_L2_Model.predict(dev_bag_of_words)
#  Report the F1 score in the training and in the development set
print("Logistic regression with L2 regularization - F1 Score on training set: " + str(f1_score(train_predicted_LR_L2, train['target'])))
print("Logistic regression with L2 regularization - F1 Score on development set: " + str(f1_score(dev_predicted_LR_L2, dev['target'])))

Logistic regression with L2 regularization - F1 Score on training set: 0.8846855059252506
Logistic regression with L2 regularization - F1 Score on development set: 0.7472283813747228


### (e.iv) Which one of the three classifiers performed the best on your training and development set? Did you observe any overfitting and did regularization help reduce it?

The logistic regression without regularization performed the best on our development set. The logistic regression with L2 regularization performed the best on our development set. 
* Logistic regression model with L2 regularization performed the best on the training and development set with a F1-score of 0.8846 on the training set and a F1-score of 0.7472 on the development set. This model reflects the highest F1-score on the development set and reduces the problem of over-fitting with regularization. 
* Yes, I observed issues of over-fitting while implementing logistic regression model without regularization terms. The model reflects a F1-score of 0.9804 on the training set and a F1-score of 0.6841 on the development set. It fits the training dataset perfectly with a high F1-score but performs poor predictions outside the training set (ie. the development set).
* Both L1 regularization and L2 regularization help reduce the over-fitting. After regularization, the F1-score on development set increases from 0.6841 to 0.7417, 0.7472 respectively. 

### (e.v) Inspect the weight vector of the classifier with L1 regularization 

In [119]:
# Inspect the weight vector of the classifier with L1 regularization 
coefficients = pd.concat([pd.DataFrame(train_words, columns=['words']),
                          pd.DataFrame(np.transpose(np.abs(LR_L1_Model.coef_)), columns=['coef'])], axis=1)
# Sort the coefficients
coefficients = coefficients.sort_values(by=['coef'], ascending=False)
# List the most important words for deciding whether a tweet is about areal disaster or not
print(coefficients.head(20))

           words      coef
2695     typhoon  3.562875
2404       spill  3.343851
2830    wildfire  3.187095
837   earthquake  3.179425
731   derailment  3.154386
1234   hiroshima  3.083647
1625     migrant  2.722774
2254     selfies  2.699474
1571    massacre  2.692738
2725      usagov  2.541668
1616       mh370  2.455677
759          dig  2.446691
843        ebola  2.438449
2767     volcano  2.364489
1834    outbreak  2.352875
2325    sinkhole  2.302689
1701    murderer  2.226004
697       debris  2.219851
2415        stab  2.204481
645         crew  2.183769


## (f) Bernoulli Naive Bayes
I implemented a Bernoulli Naive Bayes classifier with additive smoothing to predict the probability of whether each tweet is about a real disaster. 

In [129]:
# Compute the maximum likelihood model parameters on our dataset
n = train_bag_of_words.shape[0] # size of the dataset
d = train_bag_of_words.shape[1] # number of features in our dataset
K = 2 # number of clases

# # these are the shapes of the parameters
psis = np.zeros([K,d])
phis = np.zeros([K])
alpha = 1 # additive smoothing 

# we now compute the parameters
for k in range(K):
    X_k = train_bag_of_words[train['target'] == k]
    # psis[k] = np.mean(X_k, axis=0)
    psis[k] = (X_k.sum(axis=0) + alpha) / (2 * alpha + X_k.shape[0])
    phis[k] = (X_k.shape[0]) / (float(n))

# print out the class proportions
print(phis)

[0.56671045 0.43328955]


In [131]:
# Compute predictions using Bayes' rule and implement the model in numpy
def nb_predictions(x, psis, phis):
    """This returns class assignments and scores under the NB model.
    We compute \arg\max_y p(y|x) as \arg\max_y p(x|y)p(y)
    """
    # adjust shapes
    n, d = x.shape
    x = np.reshape(x, (1, n, d))
    psis = np.reshape(psis, (K, 1, d))
    
    # clip probabilities to avoid log(0)
    psis = psis.clip(1e-14, 1-1e-14)
    
    # compute log-probabilities
    logpy = np.log(phis).reshape([K,1])
    logpxy = x * np.log(psis) + (1-x) * np.log(1-psis)
    logpyx = logpxy.sum(axis=2) + logpy

    return logpyx.argmax(axis=0).flatten(), logpyx.reshape([K,n])

In [133]:
# Train this classifier on the training set
idx_train, logpyx_train = nb_predictions(train_bag_of_words.toarray(), psis, phis)
idx_dev, logpyx_dev = nb_predictions(dev_bag_of_words.toarray(), psis, phis)
# Report its F1-score on the development set
print("Bernoulli Naive Bayes - F1 Score on training set: " + str(f1_score(idx_train, train['target'])))
print("Bernoulli Naive Bayes - F1 Score on development set: " + str(f1_score(idx_dev, dev['target'])))

Bernoulli Naive Bayes - F1 Score on training set: 0.8105163429654192
Bernoulli Naive Bayes - F1 Score on development set: 0.7416378316032295


## (g) Model Comparsion
Check Report for this section.

## (h) N-gram model
I chose threshold $M = 6$. If $M$ was too small, a run-time and memory issues may occur since we may have very high-dimensional vectors, especially when building a n-gram model. And too many features may cause over-fitting and bad generalizations. If $M$ was too large, fewer words/features (especially bi-grams) were selected, thus leading to a poor-performed model that cannot generalize well to new data. Then I ran a test for $M\in[2, 10]$. I decided to choose $M = 6$ as it obtains 2112 words in the vocabulary (1-grams and 2-grams), which is closed to the total number of words in the vocabulary when we built the bag of words for previous models (ie. 2917). As we built bags of words with similar size, the comparison between models would be more insightful. 

In [240]:
# Choose threshold to avoid run-time and memory issues
# and to avoid noisy/unreliable features that can hurt learning
threshold = 6
# Build the bag of words feature vectors
def ngram_cv(data):
    count_vectorizer = CountVectorizer(min_df = threshold, binary=True, ngram_range=(1,2)) 
    emb = count_vectorizer.fit_transform(data)
    return emb, count_vectorizer

In [256]:
# Build the bag of words feature vectors for both the training and development sets
train_ngram_bag_of_words, train_ngram_cv = ngram_cv(train['tokenized'])
dev_ngram_bag_of_words = train_ngram_cv.transform(dev['tokenized'])
train_ngram_words = train_ngram_cv.get_feature_names()
# Report the total number of 1-grams and 2-grams in the vocabulary.
print("Total number of 1-grams and 2-grams in the vocabulary: "+ str(train_ngram_bag_of_words.shape[1]))

# count 1-gram and 2-gram respectively 
one_gram = 0 
two_gram = 0 

for word in train_ngram_words:
    if ' ' in word:
        two_gram += 1
    else:
        one_gram += 1 
# print out the result       
print('The total number of 1-gram is:', one_gram) 
print('The total number of 2-grams is:', two_gram)

Total number of 1-grams and 2-grams in the vocabulary: 2112
The total number of 1-gram is: 1570
The total number of 2-grams is: 542


In [244]:
# Take 10 2-grams from your vocabulary, and print them out
k = 0
i = 0
bigram = [0 for i in range(10)] 
while k < 10 and i < len(train_ngram_words):
    words = train_ngram_words[i]
    wordslist = words.split(" ")
    if len(wordslist) == 2:
        bigram[k] = words
        k += 1
    i += 1
print(bigram)

['11yearold boy', '12000 nigerian', '15 saudi', '16yr old', '1980 http', '2015 http', '3g whole', '40 family', '5km volcano', '70 year']


### Logistic Regression with L2 Regularization

In [249]:
# Train a logistic regression model with L2 regularization
LR_L2_ngram_Model = LogisticRegression(penalty='l2', solver='liblinear')
LR_L2_ngram_Model.fit(train_ngram_bag_of_words, train['target'])
train_predicted_LR_L2_ngram = LR_L2_ngram_Model.predict(train_ngram_bag_of_words)
dev_predicted_LR_L2_ngram = LR_L2_ngram_Model.predict(dev_ngram_bag_of_words)
#  Report the F1 score in the training and in the development set
print("Logistic regression with L2 regularization - N gram - F1 Score on training set: " 
          + str(f1_score(train_predicted_LR_L2_ngram, train['target'])))
print("Logistic regression with L2 regularization - N gram - F1 Score on development set: " 
          + str(f1_score(dev_predicted_LR_L2_ngram, dev['target'])))

Logistic regression with L2 regularization - N gram - F1 Score on training set: 0.8554382744378155
Logistic regression with L2 regularization - N gram - F1 Score on development set: 0.741111111111111


### Bernoulli Naive Bayes

In [250]:
# Compute the maximum likelihood model parameters on our dataset
n = train_ngram_bag_of_words.shape[0] # size of the dataset
d = train_ngram_bag_of_words.shape[1] # number of features in our dataset
K = 2 # number of clases

# these are the shapes of the parameters
psis = np.zeros([K,d])
phis = np.zeros([K])
alpha = 1 # additive smoothing 

# we now compute the parameters
for k in range(K):
    X_k = train_ngram_bag_of_words[train['target'] == k]
    # psis[k] = np.mean(X_k, axis=0)
    psis[k] = (X_k.sum(axis=0) + alpha) / (2 * alpha + X_k.shape[0])
    phis[k] = (X_k.shape[0]) / (float(n))

# print out the class proportions
print(phis)

[0.56671045 0.43328955]


In [253]:
# Train this classifier on the training set
idx_train_ngram, logpyx_train_ngram = nb_predictions(train_ngram_bag_of_words.toarray(), psis, phis)
idx_dev_ngram, logpyx_dev_ngram = nb_predictions(dev_ngram_bag_of_words.toarray(), psis, phis)
# Report its F1-score on the development set
print("Bernoulli Naive Bayes - N gram - F1 Score on training set: " 
          + str(f1_score(idx_train_ngram, train['target'])))
print("Bernoulli Naive Bayes - N gram - F1 Score on development set: " 
          + str(f1_score(idx_dev_ngram, dev['target'])))

Bernoulli Naive Bayes - N gram - F1 Score on training set: 0.7544574630667344
Bernoulli Naive Bayes - N gram - F1 Score on development set: 0.7045596502186133


## (i) Determine performance with the test set

In [254]:
# Pre-process data
df_train["text"] = df_train["text"].str.lower().apply(Lemmatize_Sentence).apply(Strip_Punct).apply(Strip_StopWord).apply(Strip_Url).apply(Strip_HTML)
df_test["text"] = df_test["text"].str.lower().apply(Lemmatize_Sentence).apply(Strip_Punct).apply(Strip_StopWord).apply(Strip_Url).apply(Strip_HTML)
# Tokenizing the pre-processed clean texts
df_train['tokenized'] = df_train["text"].tolist()
df_test['tokenized'] = df_test["text"].tolist()
# Re-build the feature vectors
train_BOW, data_cv = cv(df_train['tokenized'])
test_BOW = data_cv.transform(df_test['tokenized'])
# Re-train the preferred classifier - Logistic Regression with L2 regularization
LR_L2_Model = LogisticRegression(penalty='l2', solver='liblinear')
LR_L2_Model.fit(train_BOW, df_train['target'])
train_predicted = LR_L2_Model.predict(train_BOW)
test_predicted = LR_L2_Model.predict(test_BOW)
# save predictions
res=pd.DataFrame(columns = ['id', 'target'])
res['id'] = df_test['id']
res['target'] = test_predicted
res.to_csv('my_submission.csv',index=False)