In [12]:
import pandas as pd
import numpy as np

# Load the competition's train and test splits, using the `id` column as the index.
# Label encoding of `target`: real disaster = 1, not a disaster = 0
df_train = pd.read_csv(filepath_or_buffer='./nlp-getting-started/train.csv', index_col='id')
df_test = pd.read_csv(filepath_or_buffer='./nlp-getting-started/test.csv', index_col='id')

In [13]:
# Preview the first rows of the training frame (keyword, location, text, target)
df_train.head()

Unnamed: 0_level_0,keyword,location,text,target
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,,,Our Deeds are the Reason of this #earthquake M...,1
4,,,Forest fire near La Ronge Sask. Canada,1
5,,,All residents asked to 'shelter in place' are ...,1
6,,,"13,000 people receive #wildfires evacuation or...",1
7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [14]:
# (rows, columns) of the training frame; the `id` index is not counted as a column
df_train.shape

(7613, 4)

In [15]:
# Majority-class baseline: predict the same label for every tweet, choosing 0 when
# fewer than half of the training targets are 1 and choosing 1 otherwise.
target = df_train['target']
avg_target = sum(target) / len(target)
print(avg_target)

majority_label = 0 if avg_target < 0.5 else 1
df_train['baseline_pred'] = majority_label

# Accuracy of the constant prediction = share of rows where it equals the target
baseline_hits = df_train.loc[df_train['target'] == df_train['baseline_pred']]
train_baseline_accuracy = len(baseline_hits) / len(df_train)
train_baseline_accuracy

0.4296597924602653


0.5703402075397347

In [16]:
# baseline is 57% accuracy

# Data Exploration

In [17]:
# Tokenise every training tweet on whitespace and summarise tweet length in words
df_train['split_text'] = df_train['text'].apply(lambda x: x.split())

# Compute the per-tweet word counts once and reuse them for every statistic
words_per_tweet = df_train['split_text'].apply(len)
avg_words_per_tweet = np.mean(words_per_tweet)
min_tweet_len = np.min(words_per_tweet)
max_tweet_len = np.max(words_per_tweet)

# These statistics describe df_train, so the label says "training" (it used to say "test")
print(f"-- Tweet Stats --\nNumber of tweets in training dataset = {len(df_train)}\nAverage words per tweet = {avg_words_per_tweet.round(2)}\nRanging between {min_tweet_len} words and {max_tweet_len} words")

-- Tweet Stats --
Number of tweets in test dataset = 7613
Average words per tweet = 14.9
Ranging between 1 words and 31 words


## Rough plan of attack:
Clean text

TFIDF (generate features)

ML Models:

1). Random forest

2). Deep NN

In [18]:
# Build a TF-IDF feature matrix straight from the raw (uncleaned) tweet text
from sklearn.feature_extraction.text import TfidfVectorizer as tfidf

vectorizer = tfidf()
X = vectorizer.fit_transform(df_train['text'])

# Show the learned vocabulary followed by the (n_tweets, n_terms) matrix shape
print(vectorizer.get_feature_names_out(), X.shape, sep="\n")

['00' '000' '0000' ... 'ûónegligence' 'ûótech' 'ûówe']
(7613, 21637)


### Without cleaning the tweets first we are left with a giant sparse matrix with a shape of (7613, 21637)

In [19]:
# Download the NLTK corpora separately due to SSL errors
import nltk
import ssl

# Workaround for certificate failures during nltk.download(): swap the default HTTPS
# context factory for the unverified one when this Python build exposes it.
# NOTE(security): this turns off HTTPS certificate verification for every later
# request made by this process, not only the NLTK downloads.
try:
    _create_unverified_https_context = ssl._create_unverified_context
except AttributeError:
    # No unverified-context factory available; keep the default (verified) context
    pass
else:
    ssl._create_default_https_context = _create_unverified_https_context

nltk.download('stopwords')
# WordNetLemmatizer reads the 'wordnet' corpus, so fetch it along with omw-1.4
nltk.download('wordnet')
nltk.download('omw-1.4')

[nltk_data] Downloading package stopwords to /Users/petr/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /Users/petr/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [20]:
import re
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer


def preprocess_tweets(df, stem=False, lemmatize=False):
    """
    Clean the tweets found in the 'text' column of a kaggle provided tweet dataset.

    Cleaning steps, in order:
    1). Removes website links and every character that is not a letter, digit,
        space or hashtag (#); also removes a lowercase "rt" at the very start of
        the tweet
    2). Sets all characters to lowercase
    3). Removes common english stopwords (NLTK list)
    4). OPTIONAL (stem=True): stems each word with the Porter stemmer
    5). OPTIONAL (lemmatize=True): lemmatizes each word with the WordNet
        lemmatizer; when both options are set, the stemmed words are lemmatized

    PARAMETERS:
    df        -- object whose df['text'] iterates over the tweet strings
    stem      -- apply step 4 when truthy
    lemmatize -- apply step 5 when truthy

    RETURNS:
    A list of preprocessed tweets (one space-joined string per input tweet,
    in input order)
    """
    raw_tweets = df['text']

    # Raw string literal so the regex escapes (\w, \S) are not parsed as invalid
    # string escapes; the pattern itself is unchanged. The substitution runs
    # before lowercasing, so "^rt" and "http" only match lowercase input.
    cleanup_pattern = re.compile(r"[^#a-zA-Z 0-9]+|(\w+:\/\/\S+)|^rt|http.+?")
    cleaned = [cleanup_pattern.sub("", tweet).lower() for tweet in raw_tweets]

    # Remove stopwords (a set gives constant-time membership checks)
    stop = set(stopwords.words('english'))
    tokenized = [[word for word in tweet.split() if word not in stop] for tweet in cleaned]

    # Stemming
    if stem:
        stemmer = PorterStemmer()
        tokenized = [[stemmer.stem(word) for word in words] for words in tokenized]

    # Lemmatization: runs only when requested (the check used to be inverted, which
    # made lemmatize=True fail and lemmatize=False do the work and discard it)
    if lemmatize:
        lemmatizer = WordNetLemmatizer()
        tokenized = [[lemmatizer.lemmatize(word) for word in words] for words in tokenized]

    return [" ".join(words) for words in tokenized]

### The stemmer and lemmatizer appear to cut off more characters than they should, so for now I will proceed with the modelling using text02 (all preprocessing except stemming and lemmatization)

In [21]:
# TF-IDF features built from the cleaned tweets instead of the raw text.
# `vectorizer` is re-created here, so from this cell on it holds the vocabulary
# learned from the preprocessed tweets.
vectorizer = tfidf()
text02 = preprocess_tweets(df_train)
X2 = vectorizer.fit_transform(text02)

# Show the learned vocabulary followed by the (n_tweets, n_terms) matrix shape
print(vectorizer.get_feature_names_out(), X2.shape, sep="\n")

# Testing to see how big the vocab would be with stemming and lemmatization

# X3 = vectorizer.fit_transform(text03)
# print(vectorizer.get_feature_names_out())
# print(f"Shape of tfidf matrix with de-stemmed tweets = {X3.shape}")

# X4 = vectorizer.fit_transform(text04)
# print(vectorizer.get_feature_names_out())
# print(f"Shape of tfidf matrix with de-lemmatized tweets = {X4.shape}")


['0011' '001116' '0025' ... 'zurich' 'zxathetis' 'zzzz']
(7613, 17808)


### Results: preprocessing the tweets cut the vocabulary size from ~21.6K to ~17.8K
### Stemmed vocab size = 14.8K, lemmatized vocab size = 16.6K

## Modelling time!

In [22]:
# Creating validation set
from sklearn.model_selection import train_test_split

y = df_train['target']
# Hold out 20% of the rows for validation. The fixed random_state makes the split,
# and therefore every accuracy reported afterwards, reproducible across runs.
X_train, X_val, y_train, y_val = train_test_split(X2, y, test_size=0.2, random_state=42)
# Raw test tweets only: these are plain strings, not TF-IDF features
X_test = df_test['text']

In [92]:
# Random Forest classifier
from sklearn.ensemble import RandomForestClassifier as rf
from sklearn.metrics import accuracy_score

# Seed the forest so the fitted model and the accuracies below are reproducible
clf = rf(random_state=42)
clf.fit(X_train, y_train)

# Accuracy on the data the model was fitted on
y_train_pred = clf.predict(X_train)
y_train_acc = accuracy_score(y_train, y_train_pred)

# Accuracy on the held-out validation rows
y_val_pred = clf.predict(X_val)
y_val_acc = accuracy_score(y_val, y_val_pred)

print(F"-- EXPERIMENT RESULTS --\nTRAIN BASELINE ACCURACY = {train_baseline_accuracy}\nRF TRAINING SET ACCURACY = {y_train_acc}"
    F"\nVALIDATION SET ACCURACY = {y_val_acc}")

-- EXPERIMENT RESULTS --
TRAIN BASELINE ACCURACY = 0.5703402075397347
RF TRAINING SET ACCURACY = 0.9883415435139573
VALIDATION SET ACCURACY = 0.7708470124753776


In [26]:
# Display the repr of the training feature matrix (dimensions, dtype, storage format)
X_train

<6090x17808 sparse matrix of type '<class 'numpy.float64'>'
	with 54849 stored elements in Compressed Sparse Row format>

In [34]:
# XGBoost: gradient-boosted trees fitted on the same train/validation split
from xgboost import XGBClassifier
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score

model = XGBClassifier(max_depth=5, n_estimators=300)
model.fit(X_train, y_train)

# Predict on both splits first, then score each set of predictions
y_train_pred = model.predict(X_train)
y_val_pred = model.predict(X_val)

y_train_acc = accuracy_score(y_train, y_train_pred)
y_val_acc = accuracy_score(y_val, y_val_pred)

print(
    F"-- EXPERIMENT RESULTS --\nTRAIN BASELINE ACCURACY = {train_baseline_accuracy}\nXGB TRAINING SET ACCURACY = {y_train_acc}"
    F"\nXGB VALIDATION SET ACCURACY = {y_val_acc}"
)



-- EXPERIMENT RESULTS --
TRAIN BASELINE ACCURACY = 0.5703402075397347
XGB TRAINING SET ACCURACY = 0.9022988505747126
XGB VALIDATION SET ACCURACY = 0.7636244254760342


In [96]:
# Build the first Kaggle submission from the random forest (`clf`) predictions

# Clean the test tweets with the same preprocessing used for training, then map
# them onto the vocabulary already held by `vectorizer` (transform only, no refit)
X_test_cleaned = preprocess_tweets(df_test)
X_test = vectorizer.transform(X_test_cleaned)

# Predict, then drop the predictions into the sample submission's `target` column
y_test_pred = clf.predict(X_test)
sample_sub = pd.read_csv("./nlp-getting-started/sample_submission.csv", index_col='id')
sample_sub['target'] = y_test_pred
sample_sub.head()

In [98]:
# Saving my first kaggle submission
import os.path
from os import path

def save_submission(pred, directory=".", max_files=100):
    """
    Save a submission to the first unused 'kaggle_submission_NNN.csv' file.

    Candidate names are tried in order (kaggle_submission_000.csv,
    kaggle_submission_001.csv, ...) and the first one that does not exist yet is
    written, so earlier submissions are never overwritten. The name that is
    checked for existence is the same name that gets written.

    PARAMETERS:
    pred      -- object exposing to_csv(path), e.g. the submission DataFrame
    directory -- folder to write into (defaults to the current directory)
    max_files -- how many numbered names to try before giving up

    RETURNS:
    The path of the file that was written

    RAISES:
    FileExistsError when all max_files candidate names are already taken
    """
    for counter in range(max_files):
        target = path.join(directory, f"kaggle_submission_{counter:03d}.csv")
        if not path.exists(target):
            pred.to_csv(target)
            return target
    # Refuse to overwrite an existing submission once every slot is used
    raise FileExistsError(
        f"all {max_files} kaggle_submission_NNN.csv names already exist in {directory!r}"
    )

In [100]:
save_submission(sample_sub)