<span style="font-size: 24px;">Binary Classification on Text Data</span>

<span style="font-size: 18px;">Part 1</span>

In [7]:
import numpy as np
import torch
import pandas as pd
import torch.nn as nn
import torch.optim as optim
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_squared_error, r2_score
from matplotlib import pyplot as plt
import seaborn as sns
import string
import re
from nltk.stem import WordNetLemmatizer as wnl
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score
import nltk
from nltk.corpus import stopwords
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt_tab')



import torch.autograd as tag

[nltk_data] Downloading package punkt to C:\Users\Sofia
[nltk_data]     Beyerlein\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to C:\Users\Sofia
[nltk_data]     Beyerlein\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to C:\Users\Sofia
[nltk_data]     Beyerlein\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt_tab to C:\Users\Sofia
[nltk_data]     Beyerlein\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


In [8]:
#DOWNLOAD THE DATA
filepath_train = (r"C:\Users\Sofia Beyerlein\Desktop\Cornell Graduate\Applied Machine Learning\hw2\nlp-getting-started\train.csv")
filepath_test = (r'C:\Users\Sofia Beyerlein\Desktop\Cornell Graduate\Applied Machine Learning\hw2\nlp-getting-started\test.csv')

train = pd.read_csv(filepath_train)
test = pd.read_csv(filepath_test)

In [9]:
#SPLITTING THE DATA
#70% -> 5329/7613 and 30% -> 2284
training_set = train.sample(frac=0.7)
dev_set = train.drop(training_set.index)

In [10]:
def preprocess_data(df):
    words_to_remove = {'the', 'and', 'or'}
    #lowercase
    df['text'] = df['text'].apply(lambda x: x.lower())
    #remove @ and urls
    df['text'] = df['text'].apply(lambda x: re.sub(r'@\S+', '', x))
    #remove # and hashtags
    df['text'] = df['text'].apply(lambda x: re.sub(r'#\S+', '', x))
    #strip punctuation
    df['text'] = df['text'].apply(lambda x: x.translate(str.maketrans('', '', string.punctuation)))
    #strip the and or
    df['text'] = df['text'].apply(lambda x: ' '.join(word for word in x.split() if word not in words_to_remove))
    #lemmatize
    lemmatizer = nltk.WordNetLemmatizer()
    stop_words = set(stopwords.words('english'))
    
    def lemmatize_text(text):
        tokens = nltk.word_tokenize(text)
        lemmatized_tokens = [lemmatizer.lemmatize(word) for word in tokens if word not in stop_words]
        return ' '.join(lemmatized_tokens)
    
    df['text'] = df['text'].apply(lemmatize_text)
    
    return df
    
        
preprocess_data(training_set)
preprocess_data(dev_set)

Unnamed: 0,id,keyword,location,text,target
2,5,,,resident asked shelter place notified officer ...,1
6,10,,,heavy rain cause flash flooding street manitou...,1
11,17,,,haha south tampa getting flooded hah wait seco...,1
13,19,,,bago myanmar arrived bago,1
17,25,,,summer lovely,0
...,...,...,...,...,...
7572,10823,wrecked,"Manhattan, NY",get wrecked,1
7591,10846,,,heat wave warning aa ayyo dei plan visit frien...,1
7595,10850,,,nw flash flood warning continued shelby county...,1
7602,10860,,,siren went wasnt forney tornado warning,1


In [12]:
M = 3
vectorizer = CountVectorizer(binary=True, min_df=M)

train_vectors = vectorizer.fit_transform(training_set['text'])
dev_vectors = vectorizer.transform(dev_set['text'])

<span style = "font-size: 18px;">Part 1 a: Bernoulli Naive Bayes</span>

In [14]:
# Define functions for the Bernoulli Naive Bayes Classifier with Laplace Smoothing
#
def calculate_class_priors(y):
    num_docs = len(y)
    class_priors = np.bincount(y) / num_docs
    return class_priors

def calculate_feature_probs(X, y, alpha=1):
    num_docs, num_features = X.shape
    feature_probs = np.zeros((2, num_features))
    
    for k in range(2):
        class_docs = X[y == k]
        feature_probs[k] = (class_docs.sum(axis=0) + alpha) / (class_docs.shape[0] + 2 * alpha)
    
    return feature_probs

def predict_log_proba(X, class_priors, feature_probs):
    num_docs, num_features = X.shape
    log_probs = np.zeros((num_docs, 2))
    
    for k in range(2):
        # Log probability for class k
        log_prob_k = np.log(class_priors[k])
        log_prob_x_given_k = X @ np.log(feature_probs[k]) + (1 - X) @ np.log(1 - feature_probs[k])
        log_probs[:, k] = log_prob_k + log_prob_x_given_k
        
    return log_probs

def predict(X, class_priors, feature_probs):
    log_probs = predict_log_proba(X, class_priors, feature_probs)
    return np.argmax(log_probs, axis=1)

# Calculate class priors
class_priors = calculate_class_priors(training_set['target'].values)

# Calculate feature probabilities with Laplace smoothing
feature_probs = calculate_feature_probs(train_vectors.toarray(), training_set['target'].values, alpha=1)

# Predict on the development set
dev_predictions = predict(dev_vectors.toarray(), class_priors, feature_probs)

# Calculate the F1 score for the development set
f1 = f1_score(dev_set['target'], dev_predictions)
print(f"F1 Score: {f1}")

F1 Score: 0.7589743589743588
