# NLP Disaster Tweets - EDA

Subject: Performing EDA for Twitter Disaster Tweets Dataset

Data: Twitter Disaster Tweets Dataset (NLP getting started dataset) via Kaggle
(https://www.kaggle.com/competitions/nlp-getting-started/data)

Procedure:
- Load files, replace NaNs
- Analyze and discard keyword and location features
- Display Target Variable Balance
- Analyze Text Length (Characters, Words, Punctuation, Stopwords)
- Analyze most frequent words
- Analyze bigrams and trigrams 

Others:
- Compatible with Google Colab and Kaggle as runtime

Sources used:
- https://www.kaggle.com/code/colearninglounge/nlp-data-preprocessing-and-cleaning/notebook?scriptVersionId=48903343
- https://www.kaggle.com/code/yakinoki/natural-language-processing-with-disaster-tweets
- https://www.kaggle.com/code/gunesevitan/nlp-with-disaster-tweets-eda-cleaning-and-bert/notebook

# Import Libraries

In [None]:
import os
import torch
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f'Running on {DEVICE}')

# Select BASE_PATH (the directory that contains the dataset folder)
# depending on the runtime the notebook is executed in.

# running in google colab
if 'google.colab' in str(get_ipython()):
    BASE_PATH = './drive/MyDrive/Colab/data/'
    from google.colab import drive
    drive.mount('/content/drive')
    import nltk
    nltk.download('stopwords')

# running in kaggle (interactive session or background job):
# detected via the input directory itself instead of shell/kernel internals,
# because e.g. 'SHLVL' is also set in ordinary local shells
elif os.path.isdir('/kaggle/input'):
    BASE_PATH = '/kaggle/input/'

# anything else, e.g. a local Jupyter installation
else:
    BASE_PATH = '../data/'

In [None]:
import random
import pprint
import string
from collections import Counter, defaultdict
import locale
# Adopt the user's default locale so that the 'n' format type
# (e.g. print(f'{value:n}')) can print numbers with a thousands separator.
locale.setlocale(locale.LC_ALL, locale='')

import pandas as pd
import torch
import seaborn as sns
import matplotlib.pyplot as plt
plt.style.use('ggplot')
import matplotlib.ticker
from matplotlib.axes._axes import Axes
from wordcloud import WordCloud
import nltk

# One fixed seed for Python's and PyTorch's random number generators.
my_seed = 42
random.seed(my_seed)
# NOTE(review): the trailing semicolon presumably just suppresses the cell output
torch.manual_seed(my_seed);

# Load Data

In [None]:
# Read the competition's train and test CSV files from the dataset folder
# below BASE_PATH and display the training frame.
train_csv_path = BASE_PATH + 'nlp-getting-started/train.csv'
test_csv_path = BASE_PATH + 'nlp-getting-started/test.csv'

df_train_source = pd.read_csv(train_csv_path)
df_test_source = pd.read_csv(test_csv_path)
df_train_source

In [None]:
# (number of rows, number of columns) of the raw training data
df_train_source.shape

In [None]:
# Memory footprint of the training frame as reported by pandas,
# converted from bytes to MB.
total_bytes = df_train_source.memory_usage().sum()
memory_usage = total_bytes / 1024**2
print(f'Training DataFrame Memory Usage = {memory_usage :.2f} MB')

# Missing Data

In [None]:
# Number of missing values per column
df_train_source.isnull().sum()

In [None]:
# Bar chart with the percentage of missing values per column.
missing_per_column = df_train_source.isnull().sum()
missing_share = missing_per_column.values / len(df_train_source) * 100

fig = plt.figure(figsize=(17, 4))
ax = plt.gca()  # axes of the figure that was just created

sns.barplot(x=missing_per_column.index,
            y=missing_share,
            ax=ax)

ax.set_ylabel('Missing Values Percentage')
ax.yaxis.set_major_formatter(matplotlib.ticker.PercentFormatter())
ax.tick_params(axis='x')
ax.tick_params(axis='y')
ax.set_title('Missing Values')

plt.show()

## Replace NaN

In [None]:
def replace_nan(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of *df* whose 'keyword' and 'location' NaNs are empty strings.

    The frame passed in is not modified.
    """
    filled = df.copy()
    for column in ('keyword', 'location'):
        filled[column] = filled[column].fillna('')
    return filled

# Working copy of the training data with NaNs in 'keyword' and 'location'
# replaced; df_train_source itself is left as loaded.
df_train = replace_nan(df_train_source)
df_train

# Target Variable

### Target Labels Balance

In [None]:
# Distinct values of the target column (index of the value counts)
ser = df_train['target']
ser.value_counts().index

In [None]:
# Class balance of the target label, as pie chart and as count plot.
# 0 is negative
# 1 is positive, i.e. real disaster
target_counts = df_train['target'].value_counts()
print(target_counts)

fig, (ax1, ax2) = plt.subplots(nrows=1,
                               ncols=2,
                               figsize=(15, 5))
ax1.pie(x=target_counts.values,
        labels=target_counts.index,
        colors=sns.color_palette('pastel'),
        autopct='%.0f%%')

sns.countplot(x=df_train['target'],
              ax=ax2,
              palette=sns.color_palette('pastel'))

# display absolute values: every bar is labelled with its own height.
# Taking the labels from value_counts() would attach them in frequency
# order, which only matches the bar order while label 0 is the majority.
for container in ax2.containers:
    ax2.bar_label(container=container, fmt='%d')

plt.suptitle('Target Variable Distribution')
plt.show()

# Keyword and Location Features

## Distribution of Keywords

In [None]:
# Split the training data by label and collect, per label, the 25 most
# frequent keywords as (keyword, count) tuples in descending order.
is_positive_row = df_train['target'] == 1
is_negative_row = df_train['target'] == 0
df_positive = df_train[is_positive_row]
df_negative = df_train[is_negative_row]

keywords_count_positive: list[tuple[str, int]] = list(
    df_positive['keyword'].value_counts().head(25).to_dict().items())
keywords_count_negative: list[tuple[str, int]] = list(
    df_negative['keyword'].value_counts().head(25).to_dict().items())

In [None]:
def plot_frequency(count_positive: list[tuple[str, int]],
                   count_negative: list[tuple[str, int]],
                   title = ''):
    """Draw two horizontal bar charts side by side: positive left, negative right.

    Both arguments are lists of (item, count) tuples; the items become the
    bar labels on the y-axis and the counts the bar lengths. *title* is used
    as the title of the whole figure.
    """
    fig, (left_ax, right_ax) = plt.subplots(nrows=1,
                                            ncols=2,
                                            figsize=(15, 5))
    panels = (
        (left_ax, count_positive, "#A7C7E7", 'Positive'),   # blue
        (right_ax, count_negative, "#FF6961", 'Negative'),  # red
    )
    for axis, pairs, bar_color, heading in panels:
        items = [item for item, _ in pairs]
        counts = [count for _, count in pairs]
        bars = sns.barplot(x=counts, y=items, ax=axis, color=bar_color)
        bars.set(title=heading)

    fig.suptitle(title)
    plt.show()

# Compare the most frequent keywords of both labels
plot_frequency(keywords_count_positive, keywords_count_negative, 'Keywords')

## Distribution of Location

In [None]:
# The 25 most frequent locations per label as (location, count) tuples,
# plotted side by side.
location_count_positive: list[tuple[str, int]] = list(
    df_positive['location'].value_counts().head(25).to_dict().items())
location_count_negative: list[tuple[str, int]] = list(
    df_negative['location'].value_counts().head(25).to_dict().items())

plot_frequency(count_positive=location_count_positive,
               count_negative=location_count_negative,
               title='Locations')

## Interim Conclusion
- Keyword seems to be one of the criteria for data selection. That makes it a very dubious feature. We'll probably skip it.
- The Location variable is probably too skewed and has too much missing data to gain any relevant insight. 

# Distribution of Text Length

In [None]:
# Lowercased tweet texts, separated by label (1 = positive, 0 = negative)
ser_positive = df_train.loc[df_train['target'] == 1, 'text'].str.lower()
ser_negative = df_train.loc[df_train['target'] == 0, 'text'].str.lower()

In [None]:
def plot_count_by_label(ser_count_positive: pd.Series, 
                        ser_count_negative: pd.Series,
                        xlabel = '',
                        title = ''):
    """Plot the distribution of a per-tweet measure for both labels.

    Draws one 50-bin histogram with KDE curve per label (positive on the
    left, negative on the right). *xlabel* labels the x-axis of both
    histograms, *title* is the title of the whole figure.
    """
    fig, axes = plt.subplots(nrows=1,
                             ncols=2,
                             figsize=(15, 5))
    panels = (
        (ser_count_positive, "#A7C7E7", 'Positive Label'),  # blue
        (ser_count_negative, "#FF6961", 'Negative Label'),  # red
    )
    for axis, (values, hist_color, heading) in zip(axes, panels):
        histogram = sns.histplot(values,
                                 ax=axis,
                                 bins=50,
                                 kde=True,
                                 color=hist_color)
        histogram.set(xlabel=xlabel,
                      ylabel="Tweets",
                      title=heading)

    fig.suptitle(title)
    plt.show()

## Number of Characters

In [None]:
# Tweet length in characters, per label
len_positive = ser_positive.map(len)
len_negative = ser_negative.map(len)

plot_count_by_label(ser_count_positive=len_positive,
                    ser_count_negative=len_negative,
                    xlabel='Number of Characters',
                    title='Text Length (Characters) by Label')

## Number of Words

In [None]:
# An approximate word count is good enough here, so the tweets are simply
# split on whitespace instead of being run through a real tokenizer.

numwords_positive = ser_positive.map(lambda tweet: len(tweet.split()))
numwords_negative = ser_negative.map(lambda tweet: len(tweet.split()))

plot_count_by_label(ser_count_positive=numwords_positive,
                    ser_count_negative=numwords_negative,
                    xlabel='Number of Words',
                    title='Text Length (Words) by Label')

## Number of Punctuation Characters

In [None]:
# Show which characters count as punctuation in the next cell
# (the ASCII punctuation characters of the string module)
string.punctuation

In [None]:
# Number of punctuation characters per tweet, per label.
# A set gives constant-time membership tests for every character.
punctuation_chars = frozenset(string.punctuation)

punct_positive = ser_positive.apply(lambda z: sum(c in punctuation_chars for c in z))
punct_negative = ser_negative.apply(lambda z: sum(c in punctuation_chars for c in z))

plot_count_by_label(punct_positive, punct_negative, 'Punctuation Characters', 'Number of Punctuation Characters by Label')

## Percentage of Stopwords

In [None]:
# Stop words: NLTK's English list (e.g. "doing", "them", "while") plus a few
# tweet-specific tokens. The custom entries are lowercase because the tweet
# series they are compared with (ser_positive / ser_negative) are lowercased.
custom_stop_words = {'http', 'https', '-', 'i', 'the', 'a', '...', '&amp;'}
try:
    nltk_stop_words = nltk.corpus.stopwords.words('english')
except LookupError:
    # the stopwords corpus is not installed in this runtime yet
    nltk.download('stopwords')
    nltk_stop_words = nltk.corpus.stopwords.words('english')
stop_words = set(nltk_stop_words) | custom_stop_words

In [None]:
def compute_stopwords_percentage(tweet: str) -> float:
    """Return the share of whitespace-separated tokens of *tweet* that are stop words.

    The result is a fraction between 0.0 and 1.0 (it is not scaled to 100).
    A tweet without any token (empty or whitespace only) yields 0.0 instead
    of raising ZeroDivisionError.
    """
    tokenized = tweet.split()
    if not tokenized:
        return 0.0
    return sum(w in stop_words for w in tokenized) / len(tokenized)
                                

# Share of stop words per tweet, per label
stopw_positive = ser_positive.map(compute_stopwords_percentage)
stopw_negative = ser_negative.map(compute_stopwords_percentage)

plot_count_by_label(ser_count_positive=stopw_positive,
                    ser_count_negative=stopw_negative,
                    xlabel='Percentage of Stopwords',
                    title='Percentage of Stopwords by Label')

## Number of URLs

In [None]:
# Number of whitespace-separated tokens per tweet that contain a URL scheme.
# Testing for 'http' is sufficient: every token containing 'https' contains
# 'http' as well.
urls_positive = ser_positive.apply(lambda words: sum('http' in w for w in words.split()))
urls_negative = ser_negative.apply(lambda words: sum('http' in w for w in words.split()))

plot_count_by_label(urls_positive, urls_negative, 'Number of URLs', 'Number of URLs by Label')

## Number of Hashtags (#)

In [None]:
# Number of '#' characters per tweet, per label
hashtags_positive = ser_positive.apply(lambda tweet: tweet.count('#'))
hashtags_negative = ser_negative.apply(lambda tweet: tweet.count('#'))

plot_count_by_label(ser_count_positive=hashtags_positive,
                    ser_count_negative=hashtags_negative,
                    xlabel='Number of Hashtags',
                    title='Number of Hashtags by Label')

# n-gram Analysis

## Most frequent Words

In [None]:
def get_most_frequent_words(ser: pd.Series,
                            top_n: int = 30,
                            stopwords: "set[str] | None" = None) -> list[tuple[str, int]]:
    """Return the *top_n* most frequent non-stop-words of all texts in *ser*.

    The texts are lowercased and split on whitespace. Lowercasing happens
    before the stop-word check, so capitalised stop words ("The") are
    filtered as well.

    Args:
        ser: Series of texts.
        top_n: Number of (word, count) tuples to return.
        stopwords: Words to ignore; defaults to the notebook-wide stop_words.

    Returns:
        (word, count) tuples in descending order of count.
    """
    ignored = stop_words if stopwords is None else stopwords

    # count every remaining word of the whole series
    counter = Counter(
        word
        for tokenized in ser.str.lower().str.split()
        for word in tokenized
        if word not in ignored
    )
    return counter.most_common(top_n)

# Most frequent words per label; print the top five of each as a quick check
most_common_positive = get_most_frequent_words(ser_positive)
most_common_negative = get_most_frequent_words(ser_negative)

for top_words in (most_common_positive, most_common_negative):
    print(top_words[:5])

In [None]:
# Compare the most frequent words of both labels
plot_frequency(most_common_positive, most_common_negative, 'Most frequent Words')

### WordCloud
Looks nice, but basically useless. Still, we plot some WordClouds...

In [None]:
# wordcloud.WordCloud Visualization
def display_cloud(ser_positive: pd.Series, ser_negative: pd.Series):
    """Show one word cloud per label: positive on the left, negative on the right.

    All texts of a series are joined with blanks and handed to WordCloud,
    which is configured with the notebook's stop words and seed.
    """
    fig, axes = plt.subplots(nrows=1,
                             ncols=2,
                             figsize=(15, 5))
    panels = (
        (ser_positive, 'Positive Label'),
        (ser_negative, 'Negative Label'),
    )
    for axis, (texts, heading) in zip(axes, panels):
        cloud = WordCloud(stopwords=stop_words,
                          background_color="white",
                          random_state=my_seed)
        cloud.generate(' '.join(texts))
        axis.imshow(cloud, interpolation="bilinear")
        axis.axis('off')
        axis.set(title=heading)

    plt.show()

# Word clouds are built from the original (not lowercased) tweet texts
text_by_label = {label: df_train.loc[df_train['target'] == label, 'text']
                 for label in (1, 0)}
display_cloud(ser_positive=text_by_label[1],
              ser_negative=text_by_label[0])

## 2-gram

In [None]:
def gram_analysis(review: str,
                  n_gram: int,
                  stopwords: "set[str] | None" = None) -> list[str]:
    """Return all n-grams of *review* as blank-joined strings.

    The text is lowercased and split on any whitespace (blanks, tabs,
    newlines), stop words are dropped, and every run of *n_gram* consecutive
    remaining tokens forms one n-gram.

    Args:
        review: Text to analyse.
        n_gram: Number of tokens per n-gram.
        stopwords: Tokens to ignore; defaults to the notebook-wide stop_words.

    Returns:
        The n-grams in order of appearance; empty if the text has fewer than
        *n_gram* remaining tokens.
    """
    ignored = stop_words if stopwords is None else stopwords
    tokens = [t for t in review.lower().split() if t not in ignored]
    # zipping the token list with its shifted copies yields sliding windows
    ngrams = zip(*[tokens[i:] for i in range(n_gram)])
    return [" ".join(z) for z in ngrams]

# Example:
# gram_analysis(review='Why can\'t a movie be rated a zero? Or even a negative number? Some movie rated 1 is 
# so bad they\'re fun to watch.', n_gram=2)
# --> ['can't movie', 'movie rated', 'rated zero?', 'zero? even', 'even negative', 'negative number?', 
#      'number? movie', 'movie rated', 'rated 1', '1 bad', "bad they're", "they're fun", 'fun watch.']

In [None]:
# Count the n-grams of a whole series of texts
def get_frequency_dict(ser_reviews: pd.Series,
                n_gram: int) -> dict[str, int]:
    """Map every n-gram found in *ser_reviews* to its number of occurrences.

    Each text is turned into n-grams by gram_analysis; the keys of the
    returned dict appear in the order in which the n-grams were first seen.
    """
    gram_counts = Counter(
        gram
        for sentence in ser_reviews
        for gram in gram_analysis(sentence, n_gram)
    )
    return dict(gram_counts)

# Example:
# ser = pd.Series(['Why can\'t a movie be rated a zero? Or even a negative number?',
#                  'Some movie rated 1 is so bad they\'re fun to watch.'])
# get_frequency_dict(ser, n_gram=2)
# --> {'can't movie': 1, 'movie rated': 2, 'rated zero?': 1, 'zero? even': 1,
#      'even negative': 1, 'negative number?': 1, 'rated 1': 1, '1 bad': 1, "bad they're": 1, "they're fun": 1, 'fun watch.': 1}

In [None]:
# Bigram counts per label, plus the number of distinct bigrams found
bigram_frequency_positive = get_frequency_dict(ser_positive, n_gram=2)
bigram_frequency_negative = get_frequency_dict(ser_negative, n_gram=2)

distinct_bigrams_positive = len(bigram_frequency_positive)
distinct_bigrams_negative = len(bigram_frequency_negative)
print(f'Found a total of {distinct_bigrams_positive :n} distinct bigrams in positive reviews.')
print(f'Found a total of {distinct_bigrams_negative :n} distinct bigrams in negative reviews.')

In [None]:
def barplot_on_axes(frequency_dict: dict[str, int], 
                    title: str,
                    ax: Axes,
                    color: str):
    """Draw the 25 most frequent n-grams of *frequency_dict* as bars on *ax*.

    Args:
        frequency_dict: Maps an n-gram (as string) to its count.
        title: Title of the axes.
        ax: Axes to draw on.
        color: Bar color.
    """
    # (gram, count) pairs in descending order of count; the sort is stable,
    # so equally frequent n-grams keep their dict order
    top_grams = sorted(frequency_dict.items(), key=lambda pair: -pair[1])[:25]
    grams = [gram for gram, _ in top_grams]
    counts = [count for _, count in top_grams]

    barplot = sns.barplot(x=grams, y=counts, ax=ax, color=color)

    # n-grams are long labels, so print them vertically
    ax.tick_params(axis='x', rotation=90)
    barplot.set(xlabel='n-grams',
                ylabel="Tweets",
                title=title)

In [None]:
# Most frequent bigrams: positive label on the left, negative on the right
fig, (ax1, ax2) = plt.subplots(nrows=1, ncols=2, figsize=(15, 5))
bigram_panels = (
    (bigram_frequency_positive, 'Positive Label', ax1, "#A7C7E7"),  # blue
    (bigram_frequency_negative, 'Negative Label', ax2, "#FF6961"),  # red
)
for frequencies, heading, axis, bar_color in bigram_panels:
    barplot_on_axes(frequencies, heading, axis, color=bar_color)
fig.suptitle('Most frequent 2-grams')

plt.show()

## 3-gram

In [None]:
# Trigram counts per label and a plot of the most frequent ones:
# positive label on the left, negative on the right
trigram_frequency_positive = get_frequency_dict(ser_positive, n_gram=3)
trigram_frequency_negative = get_frequency_dict(ser_negative, n_gram=3)

fig, (ax1, ax2) = plt.subplots(nrows=1, ncols=2, figsize=(15, 5))
trigram_panels = (
    (trigram_frequency_positive, 'Positive Label', ax1, "#A7C7E7"),  # blue
    (trigram_frequency_negative, 'Negative Label', ax2, "#FF6961"),  # red
)
for frequencies, heading, axis, bar_color in trigram_panels:
    barplot_on_axes(frequencies, heading, axis, color=bar_color)
fig.suptitle('Most frequent 3-grams')

plt.show()