# task-04 tweet disaster prediction
# EDA

In [None]:
import os
import re
import string
from typing import Tuple
from collections import Counter
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import nltk

# matplotlib 3.6 renamed the bundled seaborn styles to 'seaborn-v0_8-*' and
# later releases dropped the old names, so prefer the new name when it exists.
_style_name = 'seaborn-v0_8-deep' if 'seaborn-v0_8-deep' in plt.style.available else 'seaborn-deep'
plt.style.use(_style_name)

In [None]:
# Print the full path of every file found under the local input directory.
for dirpath, _dirnames, filenames in os.walk('./input/'):
    for filename in filenames:
        print(os.path.join(dirpath, filename))

## Data load

In [None]:
# Local runs: download into the virtualenv's data directory instead.
# nltk_data = os.path.join('..', 'venv_jupyter', 'nltk_data')
# for _corpus in ('stopwords', 'wordnet'):
#     nltk.download(_corpus, download_dir=nltk_data, quiet=True)

# Kaggle runs: download to NLTK's default location.
# NOTE(review): later cells also call word_tokenize, pos_tag and ne_chunk,
# which need further NLTK resources - presumably preinstalled; TODO confirm.
for _corpus in ('stopwords', 'wordnet'):
    nltk.download(_corpus, quiet=True)

This dataset is small enough to fit in memory, so specifying column types will not be necessary.

In [None]:
# Load the training set. The Kaggle path is active; the commented line is the
# local-run alternative.
# df_train_raw = pd.read_csv('input/train.csv')
df_train_raw = pd.read_csv('/kaggle/input/nlp-getting-started/train.csv')

In [None]:
# Column summary, with memory usage measured in 'deep' mode.
df_train_raw.info(memory_usage='deep')

## Fixing column names
*Cleaning before the cleaning*

In [None]:
def fix_col_names(_df: pd.DataFrame, silent: bool = False) -> pd.DataFrame:
    ''' Strips and lowercases DataFrame columns

    Works on a copy, so ``_df`` itself is left untouched. Labels that are not
    strings are kept as they are, and a frame without columns is accepted.

    Parameters
    ----------
    _df : DataFrame whose column labels should be normalised.
    silent : when False, prints one ``original -> new`` line per column.

    Returns
    -------
    A copy of ``_df`` carrying the cleaned column labels.
    '''
    df = _df.copy()
    og_cols = df.columns.to_list()

    new_cols = [
        col.lower().strip() if isinstance(col, str) else col
        for col in og_cols
    ]
    df.columns = pd.Index(new_cols, name=df.columns.name)

    if not silent:
        # default=0 keeps max() from raising on a frame with no columns
        width = max((len(str(col)) for col in og_cols), default=0) + 5
        for og, new in zip(og_cols, new_cols):
            print(f'{str(og): <{width}} -> {new}')

    return df

In [None]:
# Normalise the training frame's column names (prints the old -> new mapping).
df = fix_col_names(df_train_raw)

## Shapes, sizes and *nulls*

### Shape

In [None]:
# Row and column counts of the training frame.
print(f'{df.shape[0]} rows, {df.shape[1]} columns')

### Missing values

In [None]:
# Percentage of null entries per column, reshaped into a two-column frame.
_null_share = df.isnull().sum() / df.shape[0] * 100
_missing_values = _null_share.reset_index().rename(
    columns={'index': 'Column', 0: 'Missing percentage'}
)

In [None]:
# Columns with the largest share of missing values first.
_missing_values.sort_values('Missing percentage', ascending=False)

`location` has about `33%` of its data missing

`keyword` has a much lower percentage, only `0.8%` is missing.

### Unique data

In [None]:
# Number of distinct values per column as a percentage of the row count,
# reshaped into a two-column frame.
_distinct_share = df.nunique() / df.shape[0] * 100
_unique_values = _distinct_share.reset_index().rename(
    columns={'index': 'Column', 0: 'Unique percentage'}
)

In [None]:
# Columns with the largest share of distinct values first.
_unique_values.sort_values('Unique percentage', ascending=False)

`text` contains over `98%` unique values, which is to be expected

`location` unique values account for almost `44%` of the total

`keyword`, on the other hand, has a much lower `3%` of its values being unique

### Target balance

In [None]:
# Share of rows per target value, in percent.
(df['target'].value_counts() / df.shape[0] * 100)

A `57%/43%` ratio is balanced enough for this kind of work.

---

## Exploration

### Length

In [None]:
# Work on a copy so the engineered features do not touch `df`.
df_stat = df.copy()
# Tweet length in characters.
df_stat['length'] = df_stat['text'].str.len()

In [None]:
# Mean length per target value.
df_stat.groupby('target')['length'].mean()

In [None]:
# Length histograms, one hue per target value.
g = sns.FacetGrid(data=df_stat, hue='target', height=7, aspect=16/9)
g.map(sns.histplot, 'length')
g.add_legend()

### Unique
*Are there more unique words in disaster or non-disaster?*

In [None]:
# Number of distinct whitespace-separated tokens per tweet.
df_stat['unique_count'] = df_stat['text'].apply(lambda x: len(set(x.split())))

In [None]:
# Mean distinct-token count per target value.
df_stat.groupby('target')['unique_count'].mean()

In [None]:
# Distinct-token histograms, one hue per target value.
g = sns.FacetGrid(data=df_stat, hue='target', height=7, aspect=16/9)
g.map(sns.histplot, 'unique_count')
g.add_legend()

Almost a perfect normal distribution, with non-disaster tweets having slightly ```(4%)``` fewer unique words.

### Stopwords
*Do disaster tweets contain more stopwords?*

In [None]:
# NLTK's English stop word list.
stop_words = nltk.corpus.stopwords.words('english')

In [None]:
def get_series_stop_words_count(series: pd.Series, stop_words: list = None) -> pd.Series:
    ''' Counts whole-word stop word occurrences in every string of a Series

    Matching is case-sensitive.

    Parameters
    ----------
    series : Series of strings to scan.
    stop_words : words to count; when None, NLTK's English stop word list
        is loaded.

    Returns
    -------
    Series with one count per row of ``series``.
    '''
    if stop_words is None:
        # resolved at call time, so defining this function needs no NLTK data
        stop_words = nltk.corpus.stopwords.words('english')
    if not stop_words:
        # an empty alternation would match at every word boundary
        return pd.Series(0, index=series.index)

    # escape each word so regex metacharacters are matched literally
    pattern_stop_words = r'\b(?:{})\b'.format('|'.join(map(re.escape, stop_words)))
    return series.str.count(pattern_stop_words)
        
# Read the text from df_stat, like every other engineered feature does.
df_stat['stopword_count'] = get_series_stop_words_count(df_stat['text'])

In [None]:
# Mean stop word count per target value.
df_stat.groupby('target')['stopword_count'].mean()

In [None]:
# Stop word count histograms, one hue per target value.
g = sns.FacetGrid(data=df_stat, hue='target', height=7, aspect=16/9)
g.map(sns.histplot, 'stopword_count')
g.add_legend()

Non-disaster tweets contain ```10%``` more stopwords on average.

### Hyperlinks
*Do disaster tweets contain more hyperlinks?*

In [None]:
def get_series_hyperlink_count(series: pd.Series) -> pd.Series:
    ''' Counts 'http://' and 'https://' occurrences in every string of a Series '''
    # (http) is mandatory, (s) is optional, (://) is mandatory
    return series.str.count(r'(http)(s)?(://)')

# Number of hyperlink prefixes found in each tweet.
df_stat['hyperlink_count'] = get_series_hyperlink_count(df_stat['text'])

In [None]:
# Mean hyperlink count per target value.
df_stat.groupby('target')['hyperlink_count'].mean()

In [None]:
# Hyperlink count histograms, one hue per target value.
g = sns.FacetGrid(data=df_stat, hue='target', height=7, aspect=16/9)
g.map(sns.histplot, 'hyperlink_count')
g.add_legend()

Yep, disaster tweets contain more than ```50%``` more hyperlinks than non-disaster tweets.

### Emoji/emoticons
*Do disaster tweets contain more or less emojis than non-disaster ones?*

In [None]:
def get_series_emoji_count(series: pd.Series) -> pd.Series:
    ''' Counts, per row, the characters that fall in a fixed set of Unicode ranges

    NOTE(review): the last range spans U+24C2..U+1F251, which also contains
    many non-emoji characters (e.g. CJK ideographs) - confirm this is intended.
    '''
    emoji_ranges = (
        '[\U0001F600-\U0001F64F]',
        '[\U0001F300-\U0001F5FF]',
        '[\U0001F680-\U0001F6FF]',
        '[\U0001F1E0-\U0001F1FF]',
        '[\U00002702-\U000027B0]',
        '[\U000024C2-\U0001F251]',
    )

    # one capture group holding the alternation of all ranges
    pattern_emoji = '(' + '|'.join(emoji_ranges) + ')'
    return series.str.count(pattern_emoji, flags=re.UNICODE)

# Total number of matches over all tweets.
get_series_emoji_count(df_stat['text']).sum()

Apparently, there are no emojis in this dataset.

### Hashtags and mentions
*What about hashtags and mentions?*

In [None]:
def get_mention_hashtags_count(series: pd.Series) -> Tuple[pd.Series, pd.Series]:
    ''' Counts '#' and '@' characters in every string of a Series

    Returns
    -------
    ``(hashtags_count, mentions_count)`` - in that order, which is the order
    in which the callers unpack the result. The previous order
    (mentions first) silently swapped the two feature columns.
    '''
    hashtags_count = series.str.count('#')
    mentions_count = series.str.count('@')

    return hashtags_count, mentions_count
    
# The first returned Series is stored as hashtags_count, the second as mentions_count.
df_stat['hashtags_count'], df_stat['mentions_count'] = get_mention_hashtags_count(df_stat['text'])

In [None]:
# Mean of the hashtags_count column per target value.
df_stat.groupby('target')['hashtags_count'].mean()

In [None]:
# Mean of the mentions_count column per target value.
df_stat.groupby('target')['mentions_count'].mean()

   - Non-disaster tweets have, on average, ```54%``` more hashtags than disaster tweets.
   - On mentions however, disaster tweets show a substantially higher (```32%```) count.

### Part of speech (POS)
*Does the POS distribution vary between disaster and non-disaster?*

In [None]:
def get_series_part_of_speech(series: pd.Series) -> pd.DataFrame:
    ''' Gets count for each type of POS

    Punctuation is replaced by spaces, the text is tokenized and tagged with
    NLTK, and the tags of every row are tallied.

    Parameters
    ----------
    series : Series of strings to tag.

    Returns
    -------
    DataFrame with one column per POS tag seen and one row per input row.
    '''
    pattern_punctuation = r'[^\w\s]'
    # regex=True must be explicit: pandas >= 2.0 treats the pattern as a
    # literal string by default, which would leave all punctuation in place
    series = series.str.replace(pattern_punctuation, ' ', regex=True)
    tagged = series.apply(nltk.word_tokenize).apply(nltk.pos_tag)
    # each tagged token is a (word, tag) pair; only the tag is counted
    pos_df = pd.json_normalize(tagged.apply(lambda pairs: Counter(pair[1] for pair in pairs)))

    return pos_df

In [None]:
# POS tag counts per tweet, with the label attached for grouping.
pos_df = get_series_part_of_speech(df_stat['text'])
pos_df['target'] = df_stat['target']

In [None]:
# Mean count of each POS tag per target value, in long format for plotting.
melt_pos_df = pd.melt(
    pos_df.groupby('target').mean().reset_index(),
    id_vars=['target']
)
melt_pos_df

In [None]:
# Earlier FacetGrid attempt, kept for reference:
# g = sns.FacetGrid(data=melt_pos_df, height=7, aspect=16/9)
# g.map(sns.barplot, 'variable', 'value', 'target')
# g.add_legend()
# Mean count per POS tag, one bar per target value.
ax: plt.Axes = sns.barplot(x='variable', y='value', hue='target', data=melt_pos_df)
ax.figure.set_size_inches(14, 7)

There seems to be a higher occurrence of NN (singular noun) and NNP (singular proper noun) in disaster tweets.

In [None]:
# Add the POS count columns to the feature frame (the label is already there).
df_stat = df_stat.join(pos_df.drop('target', axis=1))

### Named entity recognition (NER)
*Is there a difference between NER counts in disaster and non-disaster tweets?*

> Note: NLTK doesn't do a good job here. It considers almost anything that starts with an uppercase letter as an entity, but I'll add it anyway.

In [None]:
def get_named_entities_count(quote: str):
    ''' Counts the distinct named entities NLTK finds in quote '''
    tagged_tokens = nltk.pos_tag(nltk.word_tokenize(quote))
    chunks = nltk.ne_chunk(tagged_tokens, binary=True)

    entities = set()
    for chunk in chunks:
        # elements without a label(), or with a label other than 'NE', are skipped
        if hasattr(chunk, 'label') and chunk.label() == 'NE':
            entities.add(' '.join(leaf[0] for leaf in chunk))
    return len(entities)

In [None]:
%%time
# Distinct named-entity count per tweet, then its mean per target value.
df_stat['ner_count'] = df_stat['text'].apply(lambda x: get_named_entities_count(x))
df_stat.groupby('target')['ner_count'].mean()

In [None]:
# Named-entity count histograms, one hue per target value.
g = sns.FacetGrid(data=df_stat, hue='target', height=7, aspect=16/9)
g.map(sns.histplot, 'ner_count')
g.add_legend()

Even with NLTK's difficulty, we can still see a clear trend here:
   - Almost all tweets have at least one named entity;
   - Disaster tweets have almost ```50%``` more of them.

### Stems
*Does stemming the tokens change the trend observed in unique count?*

In [None]:
# Text of the row labelled 0; not referenced again in the visible cells.
_0 = df_stat['text'][0]

In [None]:
def get_stemm_count(series: pd.Series) -> pd.Series:
    ''' Counts the distinct Snowball stems of the tokens in every string

    Parameters
    ----------
    series : Series of strings to tokenize and stem.

    Returns
    -------
    Series with the number of unique stems per row.
    '''
    stemmer = nltk.stem.snowball.SnowballStemmer('english')
    series = series.apply(nltk.word_tokenize)
    # a set is needed here: the length of a plain list of stems is just the
    # token count, no matter what the stemmer does
    series = series.apply(lambda tokens: len({stemmer.stem(w) for w in tokens}))

    return series

In [None]:
%%time
# Stem-based count per tweet.
df_stat['stemm_count'] = get_stemm_count(df_stat['text'])

In [None]:
# Mean stemm_count per target value.
df_stat.groupby('target')['stemm_count'].mean()

In [None]:
# stemm_count histograms, one hue per target value.
g = sns.FacetGrid(data=df_stat, hue='target', height=7, aspect=16/9)
g.map(sns.histplot, 'stemm_count')
g.add_legend()

Nope, trend here is the same as unique word count.

### Lemmas
*What about lemmas?*

In [None]:
def get_lemma_count(series: pd.Series) -> pd.Series:
    ''' Counts the distinct WordNet lemmas of the tokens in every string

    Parameters
    ----------
    series : Series of strings to tokenize and lemmatize.

    Returns
    -------
    Series with the number of unique lemmas per row.
    '''
    lemmatizer = nltk.stem.WordNetLemmatizer()
    series = series.apply(nltk.word_tokenize)
    # a set is needed here: the length of a plain list of lemmas is just the
    # token count, no matter what the lemmatizer does
    series = series.apply(lambda tokens: len({lemmatizer.lemmatize(w) for w in tokens}))

    return series

In [None]:
%%time
# Lemma-based count per tweet.
df_stat['lemma_count'] = get_lemma_count(df_stat['text'])

In [None]:
# Mean lemma_count per target value.
df_stat.groupby('target')['lemma_count'].mean()

In [None]:
# lemma_count histograms, one hue per target value.
g = sns.FacetGrid(data=df_stat, hue='target', height=7, aspect=16/9)
g.map(sns.histplot, 'lemma_count')
g.add_legend()

Identical to stemm count.

---

## DataFrame after exploration

In [None]:
# Feature frame after all exploration steps.
df_stat

In [None]:
# Names of all columns now present in the feature frame.
df_stat.columns

```44``` columns were added, containing basic numerical data about the text itself.

Let's see if those are enough to get some predictions.

---

# Basic models

## Auxiliary functions

In [None]:
from pprint import pprint
from sklearn.metrics import (
    ConfusionMatrixDisplay,
    accuracy_score,
    balanced_accuracy_score,
    precision_score,
    recall_score, 
    f1_score
)
from datetime import datetime

# Registry of every evaluated model's metrics, keyed by model name.
all_scores = {}

def show_scores(scores: dict = None):
    ''' Displays the collected model scores as a DataFrame, one row per model

    Parameters
    ----------
    scores : mapping of model name -> dict of metrics; when None, the
        module-level ``all_scores`` registry is shown.
    '''
    if scores is None:
        scores = all_scores
    # the argument is honoured now; it used to be ignored in favour of all_scores
    display(pd.DataFrame(scores.values()))

def calculate_scores(
    model_name: str,
    y_test: pd.Series, pred: np.ndarray,
    pos_label: 'str | int' = 1,
    score_list: dict = None,
    display_confusion_matrix: bool = True,
    display_scores: bool = True
    ):
    ''' Computes classification metrics for one model and records them

    Parameters
    ----------
    model_name : key under which the metrics are stored.
    y_test : true labels.
    pred : predicted labels.
    pos_label : label treated as the positive class by precision, recall
        and f-score.
    score_list : dict that receives the metrics under ``model_name``; when
        None, the module-level ``all_scores`` registry is used.
    display_confusion_matrix : plot and display the confusion matrix.
    display_scores : display the table of all scores recorded so far.

    Returns
    -------
    dict with the keys model_name, acc, bacc, precision, recall and fscore.
    '''
    if score_list is None:
        score_list = all_scores

    model_scores = {
        'model_name': model_name,
        'acc': accuracy_score(y_test, pred),
        'bacc': balanced_accuracy_score(y_test, pred),
        'precision': precision_score(y_test, pred, pos_label=pos_label),
        'recall': recall_score(y_test, pred, pos_label=pos_label),
        # same positive class as precision and recall
        'fscore': f1_score(y_test, pred, pos_label=pos_label),
    }
    score_list[model_name] = model_scores

    if display_confusion_matrix:
        # from_predictions draws the plot itself, so it is only called on request
        display(ConfusionMatrixDisplay.from_predictions(y_test, pred))
    # test the flag, not the show_scores function object (which is always truthy)
    if display_scores:
        show_scores(score_list)
    return model_scores

def export_pred(model_name: str, raw_submission_set: pd.DataFrame, predictions):
    ''' Writes an id/target CSV named after the model and the current local time '''
    # predictions = np.vectorize({0: 'L', 1: 'W'}.get)(predictions)  # encode back

    timestamp = datetime.now().strftime('%d-%m-%y_%H%M')
    # local-run alternative: f'./output/{model_name}_{timestamp}.csv'
    output_path = f'/kaggle/working/{model_name}_{timestamp}.csv'

    pd.DataFrame({
        'id': raw_submission_set['id'],
        'target': predictions
    }).to_csv(output_path, index=False)

## 1. SVC

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC

In [None]:
# Features: everything in df_stat except the label and the raw text columns.
# NOTE(review): 'id' is not in the drop list, so if df_stat has an 'id'
# column it stays in x as a feature - confirm this is intended.
x = df_stat.copy().drop(['target', 'text', 'keyword', 'location'], axis=1)
y = df_stat.copy()['target']

In [None]:
# Replace missing feature values with 0.
x = x.fillna(0)

In [None]:
# 80/20 train/test split with a fixed seed.
train_x, test_x, train_y, test_y = train_test_split(x, y, test_size = 0.2, random_state=22)

In [None]:
%%time
# Polynomial-kernel (degree 6) support vector classifier.
basic_svc = SVC(kernel='poly', degree=6)
basic_svc.fit(train_x, train_y)

In [None]:
# Predictions on the held-out split.
_preds = basic_svc.predict(test_x)

In [None]:
# Record and display the metrics under the name 'basic_svc'.
calculate_scores('basic_svc', test_y, _preds)

## 2. Gaussian Naive Bayes

In [None]:
from sklearn.naive_bayes import GaussianNB

In [None]:
# Same feature/label construction as for the SVC model.
x = df_stat.copy().drop(['target', 'text', 'keyword', 'location'], axis=1)
y = df_stat.copy()['target']

In [None]:
# Replace missing feature values with 0.
x = x.fillna(0)

In [None]:
# Same split parameters (and seed) as before.
train_x, test_x, train_y, test_y = train_test_split(x, y, test_size = 0.2, random_state=22)

In [None]:
# Gaussian Naive Bayes with default parameters.
basic_gaussiannb = GaussianNB()
basic_gaussiannb.fit(train_x, train_y)

In [None]:
# Predictions on the held-out split.
_preds = basic_gaussiannb.predict(test_x)

In [None]:
# Record and display the metrics under the name 'basic_gaussiannb'.
calculate_scores('basic_gaussiannb', test_y, _preds)

## 3. Random Forest Classifier

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
# Same feature/label construction as for the previous models.
x = df_stat.copy().drop(['target', 'text', 'keyword', 'location'], axis=1)
y = df_stat.copy()['target']

In [None]:
# Replace missing feature values with 0.
x = x.fillna(0)

In [None]:
# Same split parameters (and seed) as before.
train_x, test_x, train_y, test_y = train_test_split(x, y, test_size = 0.2, random_state=22)

In [None]:
# Random forest with default parameters (no random_state is set).
basic_rfc = RandomForestClassifier()
basic_rfc.fit(train_x, train_y)

In [None]:
# Predictions on the held-out split.
_preds = basic_rfc.predict(test_x)

In [None]:
# Record and display the metrics under the name 'basic_rfc'.
calculate_scores('basic_rfc', test_y, _preds)

### 3.1 More trees

In [None]:
# Random forest with 1000 trees, fitted on the same split.
bigger_rfc = RandomForestClassifier(n_estimators=1000)
bigger_rfc.fit(train_x, train_y)

In [None]:
# Predictions on the held-out split.
_preds = bigger_rfc.predict(test_x)

In [None]:
# Record and display the metrics under the name 'bigger_rfc'.
calculate_scores('bigger_rfc', test_y, _preds)

### 3.2 Some random parameters

In [None]:
# Random forest with 1000 trees, entropy criterion and bootstrapping disabled.
params_rfc = RandomForestClassifier(
    n_estimators=1000,
    criterion='entropy',
    bootstrap=False
)
params_rfc.fit(train_x, train_y)

In [None]:
# Predictions on the held-out split.
_preds = params_rfc.predict(test_x)

In [None]:
# Record and display the metrics under the name 'params_rfc'.
calculate_scores('params_rfc', test_y, _preds)

## 4. XGBoost

In [None]:
from xgboost import XGBClassifier

In [None]:
# Same feature/label construction as for the previous models.
x = df_stat.copy().drop(['target', 'text', 'keyword', 'location'], axis=1)
y = df_stat.copy()['target']

In [None]:
# Replace missing feature values with 0.
x = x.fillna(0)

In [None]:
# Same split parameters (and seed) as before.
train_x, test_x, train_y, test_y = train_test_split(x, y, test_size = 0.2, random_state=22)

In [None]:
# XGBoost classifier with default parameters.
basic_xgb = XGBClassifier()
basic_xgb.fit(train_x, train_y)

In [None]:
# Predictions on the held-out split.
_preds = basic_xgb.predict(test_x)

In [None]:
# Record and display the metrics under the name 'basic_xgb'.
calculate_scores('basic_xgb', test_y, _preds)

### 4.1. More boosted trees

In [None]:
# XGBoost with 1000 estimators, learning rate 0.01 and n_jobs=-1.
bigger_xgb = XGBClassifier(
    n_estimators=1000,
    learning_rate=0.01,
    n_jobs=-1
    
)
bigger_xgb.fit(train_x, train_y)

In [None]:
# Predictions on the held-out split.
_preds = bigger_xgb.predict(test_x)

In [None]:
# Record and display the metrics under the name 'bigger_xgb'.
calculate_scores('bigger_xgb', test_y, _preds)

#### 4.1.1 Feature importances

In [None]:
# One {'feature', 'importance'} record per training column.
_feature_importances = [
    {'feature': column, 'importance': importance}
    for column, importance in zip(train_x.columns, bigger_xgb.feature_importances_)
]

In [None]:
# Most important features first.
pd.DataFrame(_feature_importances).sort_values('importance', ascending=False)

### 4.2 XGBoost submission

In [None]:
# Recombine both splits so the submission model is fitted on all labelled rows.
full_x = pd.concat([train_x, test_x])
full_y = pd.concat([train_y, test_y])

In [None]:
# Rebinds bigger_xgb: same parameters as before, now fitted on the full data.
bigger_xgb = XGBClassifier(
    n_estimators=1000,
    learning_rate=0.01,
    n_jobs=-1
    
)
bigger_xgb.fit(full_x, full_y)

In [None]:
%%time
# Load the submission set. The Kaggle path is active; the commented line is
# the local-run alternative.
# df_submission_raw = pd.read_csv('input/test.csv')
df_submission_raw = pd.read_csv('/kaggle/input/nlp-getting-started/test.csv')

df_submission = fix_col_names(df_submission_raw)

# Rebuild the features that were engineered for training, on the submission text.
# length, unique, stopword, hyperlink, hashtags and mentions count
df_submission['length'] = df_submission['text'].str.len()
df_submission['unique_count'] = df_submission['text'].apply(lambda x: len(set(x.split())))
# must read the submission text; this used to read the training frame `df`,
# which filled the column with the training rows' counts
df_submission['stopword_count'] = get_series_stop_words_count(df_submission['text'])
df_submission['hyperlink_count'] = get_series_hyperlink_count(df_submission['text'])
# same unpacking as in the training feature cell
df_submission['hashtags_count'], df_submission['mentions_count'] = get_mention_hashtags_count(df_submission['text'])

# pos tagging
df_submission = df_submission.join(get_series_part_of_speech(df_submission['text']))

# ner, stemm and lemma count
df_submission['ner_count'] = df_submission['text'].apply(get_named_entities_count)
df_submission['stemm_count'] = get_stemm_count(df_submission['text'])
df_submission['lemma_count'] = get_lemma_count(df_submission['text'])

# drop the raw text columns, as was done for the training features
df_submission = df_submission.drop(['text', 'keyword', 'location'], axis=1)

Some of the POS tags are not present in the submission set, so I'll manually add them in for the sake of running the model.

In [None]:
# Give the submission frame exactly the training columns, in the training
# order: columns missing here are added as 0, columns the model never saw are
# dropped, and remaining NaNs become 0 as they did for the training features.
df_submission = df_submission.reindex(columns=full_x.columns, fill_value=0).fillna(0)

In [None]:
# Predict with the model fitted on the full labelled data.
sub_preds = bigger_xgb.predict(df_submission)

In [None]:
# Write the id/target CSV; export_pred reads the 'id' column of df_submission.
export_pred('bigger_xgb', df_submission, sub_preds)

# Conclusion

This submission got a score of ```0.66258``` on the public leaderboard. Not great, but ok-ish for a purely quantitative model with almost no data cleaning.

Next steps include implementing a data cleaning pipeline, more feature extraction and testing more complex models.

---