In [839]:
import numpy as np
import csv
from matplotlib import pyplot as plt
import pandas as pd
from sklearn.model_selection import train_test_split
import re
import nltk
from nltk.corpus import wordnet
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet, stopwords
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split

In [840]:
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')
nltk.download('stopwords')


[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/samylokanandi/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/samylokanandi/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/samylokanandi/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [841]:
train = pd.read_csv("train.csv")

In [842]:
test = pd.read_csv("test.csv")

In [843]:
train.shape

(7613, 5)

In [844]:
test.shape

(3263, 4)

In [845]:
target_counts = train['target'].value_counts()
target_counts

target
0    4342
1    3271
Name: count, dtype: int64

In [846]:
total_tweets = len(train)
disaster_tweets = len(train[train['target'] == 1])
non_disaster_tweets = len(train[train['target'] == 0])

disaster_percentage = (disaster_tweets / total_tweets) * 100
non_disaster_percentage = (non_disaster_tweets / total_tweets) * 100

print(f"Percentage of real disaster tweets: {disaster_percentage:.2f}%")
print(f"Percentage of non-real disaster tweets: {non_disaster_percentage:.2f}%")

Percentage of real disaster tweets: 42.97%
Percentage of non-real disaster tweets: 57.03%


In [847]:
# Split the data: 70% training and 30% development
train_set, dev_set = train_test_split(train, test_size=0.3, random_state=42)
train_set.to_csv('train_set.csv', index=False)
dev_set.to_csv('dev_set.csv', index=False)

In [848]:
# PREPROCESSING 
def to_lowercase(text):
    return text.lower()
def remove_punctuation(text):
    return re.sub(r'[^\w\s]', '', text) 

In [849]:
def remove_mentions_and_urls(text):
    text = re.sub(r"http\S+|www\S+|https\S+", '', text, flags=re.MULTILINE)
    text = re.sub(r'\@\w+', '', text)
    return text

In [850]:
def handle_hashtags(text):
   
    return re.sub(r'#\w+', '', text)

In [851]:
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))  


In [852]:
def get_wordnet_pos(tag):
    if tag.startswith('J'):
        return wordnet.ADJ
    elif tag.startswith('V'):
        return wordnet.VERB
    elif tag.startswith('N'):
        return wordnet.NOUN
    elif tag.startswith('R'):
        return wordnet.ADV
    else:
        return wordnet.NOUN  # Default to noun 

In [853]:
def lemmatize_text(text):
    words = word_tokenize(text)  
    pos_tags = nltk.pos_tag(words)  
    lemmatized_words = [lemmatizer.lemmatize(word, get_wordnet_pos(pos)) for word, pos in pos_tags]  # Lemmatize based on POS
    return ' '.join(lemmatized_words)  

In [854]:
def remove_stopwords(text):
    words = word_tokenize(text)
    return ' '.join([word for word in words if word.lower() not in stop_words])

In [855]:
def preprocess_text(text):
    text = to_lowercase(text)
    text = remove_mentions_and_urls(text)
    text = handle_hashtags(text)
    text = remove_punctuation(text)
    text = lemmatize_text(text)
    text = remove_stopwords(text)
    # text = replace_emojis(text)
    return text

In [856]:
train_set['cleaned_tweet'] = train_set['text'].apply(preprocess_text)
dev_set['cleaned_tweet'] = dev_set['text'].apply(preprocess_text)


In [857]:
# Because this is a moderate sized data set? setting at 5 but can change 
M = 5

vectorizer = CountVectorizer(binary=True, min_df=M)

# we need to transform X_dev so the same number of words is outputted
X_train = vectorizer.fit_transform(train_set['cleaned_tweet'])  # Fit on training data
X_dev = vectorizer.transform(dev_set['cleaned_tweet'])          # Transform the development set

In [858]:
print("Vocabulary:", vectorizer.get_feature_names_out())

print("\nTraining set feature vectors (binary Bag of Words):")
print(X_train.toarray())

print("\nDevelopment set feature vectors (binary Bag of Words):")
print(X_dev.toarray())

print(f"\nTotal number of features (vocabulary size): {len(vectorizer.get_feature_names_out())}")

Vocabulary: ['05' '06' '10' ... 'ûïwhen' 'ûò' 'ûó']

Training set feature vectors (binary Bag of Words):
[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]

Development set feature vectors (binary Bag of Words):
[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]

Total number of features (vocabulary size): 1757


In [859]:
y_train = train_set['target']
y_dev = dev_set['target']

In [860]:
# Log Regression
logreg = LogisticRegression(penalty=None, solver='lbfgs', max_iter=1000)

logreg.fit(X_train, y_train)

In [861]:
y_train_pred = logreg.predict(X_train)
y_dev_pred = logreg.predict(X_dev)

train_f1 = f1_score(y_train, y_train_pred, average='weighted')
dev_f1 = f1_score(y_dev, y_dev_pred, average='weighted')

print(f"F1 score on Training set: {train_f1}")
print(f"F1 score on Development set: {dev_f1}")


F1 score on Training set: 0.9396677033765817
F1 score on Development set: 0.7171230564353913


In [862]:
# seeing issues with overfitting 

In [863]:
# L1 Regularization 
logreg_l1 = LogisticRegression(penalty='l1', solver='liblinear', max_iter=1000)
logreg_l1.fit(X_train, y_train)

y_train_pred_l1 = logreg_l1.predict(X_train)
y_dev_pred_l1 = logreg_l1.predict(X_dev)

train_f1_l1 = f1_score(y_train, y_train_pred_l1, average='weighted')
dev_f1_l1 = f1_score(y_dev, y_dev_pred_l1, average='weighted')

print("Logistic Regression (L1 Regularization)")
print(f"F1 score on Training set: {train_f1_l1}")
print(f"F1 score on Development set: {dev_f1_l1}")

Logistic Regression (L1 Regularization)
F1 score on Training set: 0.8544795846242272
F1 score on Development set: 0.782350148570447


In [864]:
# L2 Regularization (which is the default) 
logreg_l2 = LogisticRegression(penalty='l2', solver='lbfgs', max_iter=1000)
logreg_l2.fit(X_train, y_train)

y_train_pred_l2 = logreg_l2.predict(X_train)
y_dev_pred_l2 = logreg_l2.predict(X_dev)

train_f1_l2 = f1_score(y_train, y_train_pred_l2, average='weighted')
dev_f1_l2 = f1_score(y_dev, y_dev_pred_l2, average='weighted')

print("Logistic Regression (L2 Regularization)")
print(f"F1 score on Training set: {train_f1_l2}")
print(f"F1 score on Development set: {dev_f1_l2}")

Logistic Regression (L2 Regularization)
F1 score on Training set: 0.8734678718889471
F1 score on Development set: 0.7825975823699194


In [865]:
# the best performance observed was with L1 Regularization, I observed overfitting without regularizing the data, but once regularized but L1 and L2 gave similar results 

In [866]:
# Logistic Regression with L2 penalty
LR = LogisticRegression(penalty="l2", solver='liblinear', max_iter=2500)

LR.fit(X_train, y_train)

coeff = LR.coef_[0]

words = vectorizer.get_feature_names_out()

min_length = min(len(words), len(coeff))
words = words[:min_length]
coeff = coeff[:min_length]

df = pd.DataFrame({'word': words, 'weight': coeff})

top_words = df.sort_values(by='weight', ascending=False)

# TOP 10 WORDS
print(top_words.head(10))

            word    weight
502   earthquake  2.262849
441   derailment  1.979377
1437       spill  1.974917
1488     suicide  1.858534
1573     tornado  1.843853
389         crew  1.839026
1703    wildfire  1.810833
147       atomic  1.776742
949     massacre  1.764226
490      drought  1.761261


In [867]:
# NAIVE BAYES methods 
def compute_psis_and_phis(X, y, alpha=1.0):
    K = len(np.unique(y))
    n, d = X.shape

    # compute class priors
    phis = np.array([(y == k).mean() for k in np.unique(y)])

    # compute conditional probabilities
    psis = np.zeros((K, d))
    for k in range(K):
        X_class = X[y == k]
        psis[k, :] = (X_class.sum(axis=0) + alpha) / (X_class.shape[0] + 2 * alpha)

    return psis, phis

psis, phis = compute_psis_and_phis(X_train, y_train)

def nb_predictions(x, psis, phis):
    n, d = x.shape
    K = psis.shape[0]

    psis = psis.clip(1e-14, 1 - 1e-14)

    logpy = np.log(phis).reshape([K, 1])

    logpxy = x @ np.log(psis.T) + (1 - x) @ np.log(1 - psis.T)
    logpyx = logpxy + logpy.T

    return logpyx.argmax(axis=1), logpyx

idx_train, logpyx = nb_predictions(X_train.toarray(), psis, phis)


In [868]:
# F1 Scores for Naive Bayes
train_f1_bayes = f1_score(y_train, y_train_pred, average='weighted')
dev_f1_bayes = f1_score(y_dev, y_dev_pred, average='weighted')

print(f"F1 score on the training set: {train_f1_bayes}")
print(f"F1 score on the development set: {dev_f1_bayes}")

F1 score on the training set: 0.9396677033765817
F1 score on the development set: 0.7171230564353913


In [869]:
M=5
n_gram_vectorizer = CountVectorizer(ngram_range=(2, 2), binary=True, min_df=2)

X_train_2gram = n_gram_vectorizer.fit_transform(train_set['cleaned_tweet'])

X_dev_2gram = n_gram_vectorizer.transform(dev_set['cleaned_tweet'])

vocab_size = len(vectorizer.get_feature_names_out())
print(f"Total number of 2-grams in the vocabulary: {vocab_size}")

# 10 GRAMS
sample_ngrams = vectorizer.get_feature_names_out()[:10]
print("10 sample 2-grams:", sample_ngrams)
print(f"Test: {f1_score(y_dev, y_dev_pred)}")

Total number of 2-grams in the vocabulary: 1757
10 sample 2-grams: ['05' '06' '10' '100' '1000' '101' '11' '11yearold' '12' '12000']
Test: 0.6730575176589304


In [870]:
print("Prior probabilities (phis):", phis)


Prior probabilities (phis): [0.56746106 0.43253894]


In [871]:
# LOGISTIC REGRESSION
logreg = LogisticRegression(penalty='l2', solver='lbfgs', max_iter=1000)
logreg.fit(X_train_2gram, y_train)

y_train_pred = logreg.predict(X_train_2gram)
y_dev_pred = logreg.predict(X_dev_2gram)

train_f1 = f1_score(y_train, y_train_pred, average='weighted')
dev_f1 = f1_score(y_dev, y_dev_pred, average='weighted')

# Print F1-scores
print(f"F1 score on training set (Logistic Regression, 2-grams): {train_f1}")
print(f"F1 score on development set (Logistic Regression, 2-grams): {dev_f1}")

F1 score on training set (Logistic Regression, 2-grams): 0.7976137365267141
F1 score on development set (Logistic Regression, 2-grams): 0.7003067404182036


In [872]:
# NAIVE BAYES
psis_2gram, phis_2gram = compute_psis_and_phis(X_train_2gram.toarray(), y_train)
y_train_pred_2gram, _ = nb_predictions(X_train_2gram.toarray(), psis_2gram, phis_2gram)

y_dev_pred_2gram, _ = nb_predictions(X_dev_2gram.toarray(), psis_2gram, phis_2gram)

train_f1_2gram = f1_score(y_train, y_train_pred_2gram, average='weighted')
print(f"F1 score on training set (2-gram, Naive Bayes): {train_f1_2gram}")

dev_f1_2gram = f1_score(y_dev, y_dev_pred_2gram, average='weighted')
print(f"F1 score on development set (2-gram, Naive Bayes): {dev_f1_2gram}")

F1 score on training set (2-gram, Naive Bayes): 0.730673181325753
F1 score on development set (2-gram, Naive Bayes): 0.6627108142989363


In [873]:
# KAGGLE
combined_text = pd.concat([train_set['cleaned_tweet'], dev_set['cleaned_tweet']])

vectorizer = CountVectorizer(ngram_range=(2, 2), binary=True, min_df=M)
X_full = vectorizer.fit_transform(combined_text)  

X_test_kaggle = vectorizer.transform(test['text'])
logreg_full = LogisticRegression(penalty='l2', solver='lbfgs', max_iter=1000)
logreg_full.fit(X_full, y_full) 
y_test_pred = logreg_full.predict(X_test_kaggle)

In [874]:
submission = pd.DataFrame({
    'id': test['id'],
    'target': y_test_pred
})

submission.to_csv('kaggle_submission.csv', index=False)