In [8]:
import pandas as pd
import numpy as np

# Load both CSV splits, using the 'id' column as the row index.
df_train, df_test = (
    pd.read_csv(csv_path, index_col='id')
    for csv_path in (
        './nlp-getting-started/train.csv',
        './nlp-getting-started/test.csv',
    )
)

# Target encoding: real disaster = 1, not a disaster = 0

In [9]:
# Preview the first rows of the training frame (columns: keyword, location, text, target).
df_train.head()

Unnamed: 0_level_0,keyword,location,text,target
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,,,Our Deeds are the Reason of this #earthquake M...,1
4,,,Forest fire near La Ronge Sask. Canada,1
5,,,All residents asked to 'shelter in place' are ...,1
6,,,"13,000 people receive #wildfires evacuation or...",1
7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [71]:
# (rows, columns) of the training frame.
# NOTE(review): the recorded output (7613, 6) carries execution count 71, i.e. this cell was
# last run after later cells had added 'baseline_pred' and 'split_text' to df_train. On a
# fresh top-to-bottom run the column count shown here should be smaller (presumably 4).
df_train.shape

(7613, 6)

In [10]:
# Majority-class baseline: give every tweet the same label and measure training accuracy.

labels = df_train['target']
avg_target = sum(labels) / len(labels)  # mean of the 0/1 target column
print(avg_target)

# Predict 0 for every row when the mean target is below one half, otherwise predict 1.
if avg_target < 0.5:
    majority_label = 0
else:
    majority_label = 1
df_train['baseline_pred'] = majority_label

# Accuracy = rows where the constant prediction equals the target / all rows.
is_hit = df_train['target'] == df_train['baseline_pred']
train_baseline_accuracy = len(df_train.loc[is_hit]) / len(df_train)
train_baseline_accuracy

0.4296597924602653


0.5703402075397347

In [11]:
# baseline is 57% accuracy

# Data Exploration

In [73]:
# Tokenise each tweet on whitespace and summarise tweet lengths (in words).
df_train['split_text'] = df_train['text'].str.split()

# Compute the per-tweet word counts once and reuse them for every statistic.
words_per_tweet = df_train['split_text'].apply(len)
avg_words_per_tweet = words_per_tweet.mean()
min_tweet_len = words_per_tweet.min()
max_tweet_len = words_per_tweet.max()

# These figures describe df_train, so the label says "train" (it previously said "test").
print(
    "-- Tweet Stats --\n"
    f"Number of tweets in train dataset = {len(df_train)}\n"
    f"Average words per tweet = {round(avg_words_per_tweet, 2)}\n"
    f"Ranging between {min_tweet_len} words and {max_tweet_len} words"
)

-- Tweet Stats --
Number of tweets in test dataset = 7613
Average words per tweet = 14.9
Ranging between 1 words and 31 words


## Rough plan of attack:
Clean text

TFIDF (generate features)

ML Models:

1). Random forest

2). Deep NN

In [25]:
# Build TF-IDF features straight from the raw (uncleaned) tweet text.
from sklearn.feature_extraction.text import TfidfVectorizer as tfidf

raw_tweets = df_train['text']
vectorizer = tfidf()
X = vectorizer.fit_transform(raw_tweets)

# Show the learned vocabulary, then the (n_tweets, n_terms) shape of the matrix.
raw_vocab = vectorizer.get_feature_names_out()
print(raw_vocab)
print(X.shape)

['00' '000' '0000' ... 'ûónegligence' 'ûótech' 'ûówe']
(7613, 21637)


### Without cleaning the tweets first we are left with a giant sparse matrix with a shape of (7613, 21637)

In [62]:
# Download the NLTK corpora separately, working around SSL errors on this machine.
import nltk
import ssl

# WARNING: the patch below disables HTTPS certificate verification for every default
# HTTPS context created in this kernel, not only for the NLTK downloads. Prefer fixing
# the local certificate store; keep this strictly as a local workaround.
try:
    _create_unverified_https_context = ssl._create_unverified_context
except AttributeError:
    # This Python build does not expose the hook; leave the default context untouched.
    pass
else:
    ssl._create_default_https_context = _create_unverified_https_context

nltk.download('stopwords')  # English stopword list used during cleaning
nltk.download('wordnet')    # required by WordNetLemmatizer; without it lemmatize() raises LookupError
nltk.download('omw-1.4')

[nltk_data] Downloading package stopwords to /Users/petr/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /Users/petr/nltk_data...
[nltk_data]   Unzipping corpora/omw-1.4.zip.


True

In [63]:
import re
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer

textRaw = df_train['text']

# Step 1 - regex cleaning, then lowercasing.
# Alternatives are tried left to right at each position:
#   [^#a-zA-Z 0-9]+   runs of anything that is not an ASCII letter, digit, space or '#'
#   (\w+:\/\/\S+)     scheme://... links
#   ^rt               a literal lowercase "rt" at the very start of the tweet
#   http.+?           "http" plus the single character after it
# The pattern is a raw string so the backslashes reach the regex engine without relying on
# Python's invalid-escape fallback (which emits a warning on recent versions); the pattern
# text itself is unchanged.
# NOTE(review): `^rt` is applied before .lower(), so an uppercase "RT" prefix is not removed.
clean_pattern = re.compile(r"[^#a-zA-Z 0-9]+|(\w+:\/\/\S+)|^rt|http.+?")
text01 = [clean_pattern.sub("", tweet).lower() for tweet in textRaw]

# Step 2 - remove stopwords.
# `stop` stays a list (as before); membership is tested against a set so each lookup is O(1).
# NOTE(review): punctuation was already stripped in step 1, so contractions lose their
# apostrophe (e.g. "im" survives in the recorded output) and no longer match the stopword list.
stop = stopwords.words('english')
stop_lookup = set(stop)
text02 = [
    " ".join(word for word in tweet.split() if word not in stop_lookup)
    for tweet in text01
]

# Step 3a - stemming (applied to the stopword-filtered text).
stemmer = PorterStemmer()
text03 = [" ".join(stemmer.stem(word) for word in tweet.split()) for tweet in text02]

# Step 3b - lemmatization, an alternative to stemming: it also starts from text02, not text03.
lemmatizer = WordNetLemmatizer()
text04 = [" ".join(lemmatizer.lemmatize(word) for word in tweet.split()) for tweet in text02]

# Preview only the first tweets instead of dumping the whole list into the notebook.
text04[:20]

['deed reason #earthquake may allah forgive u',
 'forest fire near la ronge sask canada',
 'resident asked shelter place notified officer evacuation shelter place order expected',
 '13000 people receive #wildfires evacuation order california',
 'got sent photo ruby #alaska smoke #wildfires pours school',
 '#rockyfire update california hwy 20 closed direction due lake county fire #cafire #wildfires',
 '#flood #disaster heavy rain cause flash flooding street manitou colorado spring area',
 'im top hill see fire wood',
 'there emergency evacuation happening building across street',
 'im afraid tornado coming area',
 'three people died heat wave far',
 'haha south tampa getting flooded hah wait second live south tampa gonna gonna fvck #flooding',
 '#raining #flooding #florida #tampabay #tampa 18 19 day ive lost count',
 '#flood bago myanmar #we arrived bago',
 'damage school bus 80 multi car crash #breaking',
 'whats man',
 'love fruit',
 'summer lovely',
 'car fast',
 'goooooooaaaaaal',
 

### It seems like the stemmer and lemmatizer are cutting off more characters than they should, so for now I will proceed with the modelling using text02 (all preprocessing except stemming and lemmatization)

In [74]:
# Generating tfidf feature matrix using preprocessed words

# `vectorizer` is fitted on text02 only and is NOT re-fitted below, so its vocabulary keeps
# matching the columns of X2 (the matrix used for modelling) after this cell has run.
vectorizer = tfidf()
X2 = vectorizer.fit_transform(text02)
print(vectorizer.get_feature_names_out())
print(X2.shape)

# Testing to see how big the vocab would be with stemming and lemmatization.
# Each variant gets its own vectorizer so the comparison does not overwrite the fitted one above.

vectorizer_stem = tfidf()
X3 = vectorizer_stem.fit_transform(text03)
print(vectorizer_stem.get_feature_names_out())
print(f"Shape of tfidf matrix with stemmed tweets = {X3.shape}")

vectorizer_lemma = tfidf()
X4 = vectorizer_lemma.fit_transform(text04)
print(vectorizer_lemma.get_feature_names_out())
print(f"Shape of tfidf matrix with lemmatized tweets = {X4.shape}")


['0011' '001116' '0025' ... 'zurich' 'zxathetis' 'zzzz']
(7613, 17808)
['0011' '001116' '0025' ... 'zurich' 'zxatheti' 'zzzz']
Shape of tfidf matrix with de-stemmed tweets = (7613, 14797)
['0011' '001116' '0025' ... 'zurich' 'zxathetis' 'zzzz']
Shape of tfidf matrix with de-lemmatized tweets = (7613, 16611)


### Results: preprocessing the tweets cut down the size of the vocabulary from ~21.6K to 17.8K
### Stemmed vocab size = 14.8K, lemmatized vocab size = 16.6K