## Imports

In [None]:
import pandas as pd
import numpy as np

pd.set_option('display.max_colwidth', None)

import re

from wordcloud import WordCloud
import contractions

import matplotlib.pyplot as plt
import seaborn as sns
plt.style.use('ggplot')
plt.rcParams['font.size'] = 15

import nltk
from nltk.stem.porter import PorterStemmer
from nltk import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
STOPWORDS = set(stopwords.words('english'))

## Data Load

In [None]:
# Read the training CSV of the disaster-tweet dataset and preview its first 20 rows.
df_train = pd.read_csv(filepath_or_buffer='../Datasets/disaster_tweet/train.csv')
df_train.head(n=20)

In [None]:
# Preview the last 20 rows as well, to see tweets from the end of the file.
df_train.tail(n=20)

### Observation

1. Mixed case
2. Contractions
3. Hashtags and mentions
4. Incorrect spellings
5. Punctuation
6. Websites and URLs

## Functions

In [None]:
# Concatenate every tweet into one space-separated string for quick substring probes.
all_text = ' '.join(df_train['text'])

def check_texts(check_item, all_text):
    ''' Tell whether ``check_item`` occurs anywhere inside ``all_text``.

    Returns the result of the membership test ``check_item in all_text``.
    '''
    is_present = check_item in all_text
    return is_present

In [None]:
# Probe the tweets for a few HTML tag openers; one True/False line is printed per marker.
for html_marker in ('<a', '<div', '<p'):
    print(check_texts(html_marker, all_text))

In [None]:
# Probe for two emoticons and a plain word (presumably a control check -- confirm intent);
# one True/False line is printed per probe.
for probe in (':)', '<3', 'heard'):
    print(check_texts(probe, all_text))

In [None]:
def remove_urls(text):
    ''' Remove URLs and website links from ``text``, if any.

    A link is a run of URL characters that starts with ``www.``,
    ``http://`` or ``https://``. Each link is replaced by the empty
    string; the surrounding whitespace is left untouched.

    Parameters
    ----------
    text : str
        The text to clean.

    Returns
    -------
    str
        ``text`` with every matched link removed.
    '''
    # The dot after "www" is escaped so that only a literal "www." opens a
    # match. Unescaped, "." matched any character, so ordinary tokens that
    # merely start with "www" were deleted as if they were links.
    url_pattern = r'(?:www\.|https?://)(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'
    return re.sub(url_pattern, '', text)

def remove_html_entities(text):
    ''' Remove HTML tags and HTML character entities from ``text``.

    Three kinds of markup are deleted:

    * tags, i.e. the shortest ``<...>`` span on a single line;
    * named entities such as ``&amp;`` or ``&Eacute;``;
    * numeric entities in decimal (``&#8217;``) or hex (``&#x1F4A9;``) form.

    Parameters
    ----------
    text : str
        The text to clean.

    Returns
    -------
    str
        ``text`` with tags and entities removed.
    '''
    # Entity names and hex digits are matched in both cases: this function
    # may run on text that has not been lower-cased yet, and a lowercase-only
    # pattern left entities such as "&Eacute;" or "&#x1F4A9;" in place.
    html_entities = r'<.*?>|&(?:[a-zA-Z0-9]+|#[0-9]{1,6}|#[xX][0-9a-fA-F]{1,6});'
    return re.sub(html_entities, '', text)

def convert_lower_case(text):
    ''' Return a lower-cased copy of ``text``.'''
    lowered = text.lower()
    return lowered

def detect_news(text):
    ''' Append the token ' news' to ``text`` when it contains the substring 'news'.

    The check is a case-sensitive substring test, so it also fires inside
    longer words. Text without the substring is returned unchanged.
    '''
    return text + ' news' if 'news' in text else text

def remove_social_media_tags(text):
    ''' Remove @mentions and the '#' marker of hashtags from ``text``.

    A mention (``@`` followed by letters, digits or underscores) is deleted
    entirely. For hashtags only the ``#`` character is dropped, so the tag
    word itself stays in the text.

    Parameters
    ----------
    text : str
        The text to clean.

    Returns
    -------
    str
        ``text`` without mentions and without '#' characters.
    '''
    # Handles may contain underscores and upper-case letters; the previous
    # character class [a-z0-9] stopped at the first such character and left
    # the rest of the handle (e.g. "_doe" from "@john_doe") in the text.
    tag_pattern = r'@[A-Za-z0-9_]+|#'
    return re.sub(tag_pattern, '', text)

# Count it before I remove them altogether
def count_punctuations(text):
    ''' Count the punctuation characters in ``text``.

    Every single occurrence of one of these characters counts as one:
    period, question mark, double quote, apostrophe, backtick, comma,
    hyphen, exclamation mark, colon, semicolon, round and square brackets,
    forward slash and the curly double quotes.

    Parameters
    ----------
    text : str
        The text to inspect.

    Returns
    -------
    int
        Number of punctuation characters found.
    '''
    # Raw string: the former plain literal relied on invalid escape sequences
    # such as "\," and "\!", which newer Python versions warn about. The set of
    # matched characters is unchanged (note: the backslash itself is NOT in it).
    # The lazy "+?" quantifier was dropped because it always matched exactly
    # one character anyway.
    punctuation_pattern = r'[.?"\'`,\-!:;()\[\]/“”]'
    return len(re.findall(punctuation_pattern, text))


def preprocess_text(x):
    ''' Normalise one tweet into a space-separated string of unique stems.

    Steps, in order:

    1. expand contractions with ``contractions.fix``;
    2. drop every character that is not a letter, digit or whitespace;
    3. lower-case and split on any whitespace;
    4. remove English stop words (module-level ``STOPWORDS``);
    5. stem each remaining word with the module-level ``porter_stemmer``;
    6. drop duplicate stems, keeping the first occurrence.

    Relies on ``STOPWORDS`` and ``porter_stemmer`` being defined at module
    level before the first call.

    Parameters
    ----------
    x : str
        The text of one tweet.

    Returns
    -------
    str
        The unique stems joined by single spaces, in order of first
        appearance.
    '''
    # Contractions must be expanded while the apostrophes are still present;
    # stripping punctuation first turned "can't" into "cant", which the
    # contractions library cannot expand any more.
    expanded_text = contractions.fix(x)
    cleaned_text = re.sub(r'[^a-zA-Z\d\s]+', '', expanded_text).lower()

    # split() without an argument also handles newlines and repeated spaces,
    # so no empty tokens are produced.
    # Stop words are filtered BEFORE stemming: the stemmer rewrites many of
    # them (e.g. "this", "was") into forms that are absent from STOPWORDS.
    content_words = [word for word in cleaned_text.split() if word not in STOPWORDS]
    stems = [porter_stemmer.stem(word) for word in content_words]

    # dict.fromkeys removes duplicates like set() did, but keeps insertion
    # order, so the result is identical on every run.
    return " ".join(dict.fromkeys(stems))

In [None]:
porter_stemmer = PorterStemmer()

# Keep an untouched copy of the tweets the first time this cell runs, and
# always start from it. Without this, re-running the cell would clean the
# already-cleaned 'text' column again and, in particular, recompute
# 'punctuation_count' on text whose punctuation has already been stripped.
if 'text_raw' not in df_train.columns:
    df_train['text_raw'] = df_train['text']

# Light-weight cleaning steps, applied in this order to every tweet.
text_clean = df_train['text_raw']
for cleaning_step in (remove_urls,
                      remove_html_entities,
                      convert_lower_case,
                      detect_news,
                      remove_social_media_tags):
    text_clean = text_clean.apply(cleaning_step)

# Punctuation is counted after links, markup and tags are gone but before
# preprocess_text removes all non-alphanumeric characters.
df_train['punctuation_count'] = text_clean.apply(count_punctuations)
df_train['text'] = text_clean.apply(preprocess_text)

df_train['text_tokenized'] = df_train['text'].apply(word_tokenize)
df_train['words_per_tweet'] = df_train['text_tokenized'].apply(len)

In [None]:
# Display the frame to inspect the cleaned 'text' column and the derived
# 'punctuation_count', 'text_tokenized' and 'words_per_tweet' columns.
df_train

## Tweet Length Analysis

In [None]:
# Histogram (with KDE overlay) of the number of tokens per processed tweet,
# coloured by the 'target' label.
sns.histplot(data=df_train, x='words_per_tweet', hue='target', kde=True)
# Titled like the word-cloud figures so the plot can be read on its own.
plt.title("Words per Tweet by Target")
plt.show()

## Punctuation Analysis

In [None]:
# Number of tweets per 'target' label, broken down by punctuation count.
sns.countplot(data=df_train, x='target', hue='punctuation_count')
# Replace the automatic legend with an empty one (one entry per distinct
# punctuation count would otherwise be listed).
plt.legend([])
# Titled like the word-cloud figures so the plot can be read on its own.
plt.title("Punctuation Count per Tweet by Target")
plt.show()

## Tweet Text Analysis using WordCloud

In [None]:
# One long string holding the text of every tweet with target == 1.
real_disaster_tweets = ' '.join(df_train.loc[df_train['target'] == 1, 'text'])

In [None]:
# One long string holding the text of every tweet with target == 0.
non_real_disaster_tweets = ' '.join(df_train.loc[df_train['target'] == 0, 'text'])

In [None]:
# Configure the cloud first, then fill it from the target == 1 tweet text.
# random_state is fixed so the layout is the same on every run.
wc = WordCloud(background_color="black", max_words=100,
               width=1000, height=600, random_state=1)
wc.generate(real_disaster_tweets)

# Render the cloud as an image on a large square figure without axes.
plt.figure(figsize=(15, 15))
plt.imshow(wc)
plt.axis("off")
plt.title("Wordcloud of Tweets about Real Disasters")
plt.show()

In [None]:
# Configure the cloud first, then fill it from the target == 0 tweet text.
# random_state is fixed so the layout is the same on every run.
wc = WordCloud(background_color="black", max_words=100,
               width=1000, height=600, font_step=1, random_state=1)
wc.generate(non_real_disaster_tweets)

# Render the cloud as an image on a large square figure without axes.
plt.figure(figsize=(15, 15))
plt.imshow(wc)
plt.axis("off")
plt.title("Wordcloud of Tweets NOT about Real Disasters")
plt.show()