In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found anywhere under /kaggle/input.
for folder, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns


# Load the tweets CSV into a DataFrame.
# NOTE(review): the name `input` shadows Python's built-in input() for the rest
# of the notebook; every later cell reads this variable, so it is left as-is here.

input = pd.read_csv("../input/disaster-tweets/tweets.csv")


# Print the frame's shape tuple as a quick sanity check on what was loaded.
print(input.shape)

In [None]:
# Exploration: preview the first five rows of the loaded frame.

input.head(5)

In [None]:
# Count rows that are exact duplicates of an earlier row (all columns equal).

input.duplicated().sum()

There are no duplicated rows

In [None]:
# Number of missing (NaN) values in each column.

input.isna().sum()

In [None]:
#Class distribution

plt.figure(figsize=(6, 7))
# Pass the column as `x=`. Seaborn 0.11 warns about positional data arguments and
# from 0.12 the only positional parameter is `data`, so a bare positional Series
# is no longer interpreted as the x variable.
plot = sns.countplot(x=input["target"],
                     order=input["target"].value_counts().index)

# Write each bar's height just above the bar (10 points above its top edge).
for p in plot.patches:
    plot.annotate(format(p.get_height(), '.2f'), (p.get_x() + p.get_width() / 2., p.get_height()), ha = 'center', va = 'center', xytext = (0, 10), textcoords = 'offset points')

The data contains more non-disaster tweets than disaster tweets, so the two classes are imbalanced.

In [None]:
# Keywords: how many distinct values exist, and which ten occur most often.

print(input["keyword"].nunique())

plt.figure(figsize=(9, 6))
sns.countplot(
    y=input["keyword"],
    order=input["keyword"].value_counts().head(10).index,
)
plt.title('Top 10 Keywords')
plt.show()

Let's explore whether the top keywords differ between disaster and non-disaster tweets.

In [None]:
# Split the tweets by class: rows with target == 1 go to `disaster`,
# rows with target == 0 go to `nondisaster`.
disaster = input[input.target == 1]
print(disaster.shape)

nondisaster = input[input.target ==0]
print(nondisaster.shape)

# Ten most frequent keywords in each class (index: keyword, values: counts).
kw_dis = disaster.keyword.value_counts().head(10)
kw_nondis = nondisaster.keyword.value_counts().head(10)

plt.figure(figsize=(18,6))

# x/y must be passed by keyword: seaborn >= 0.12 accepts only `data` positionally,
# so barplot(counts, labels) raises TypeError there.
plt.subplot(121)
sns.barplot(x=kw_dis, y=kw_dis.index, color = 'g')
plt.title('Top Keywords for Disaster tweets')

plt.subplot(122)
sns.barplot(x=kw_nondis, y=kw_nondis.index, color = 'b')
plt.title('Top Keywords for Non-Disaster tweets')

# Render the figure explicitly so the cell does not echo the Text object
# returned by the last plt.title call.
plt.show()

We see that the top 10 keywords are not the same for the two classes.

In [None]:
# Location data: the ten locations that account for the most tweets.

plt.figure(figsize=(8, 6))
sns.countplot(
    y=input["location"],
    order=input["location"].value_counts().head(10).index,
)
plt.title("Top 10 Locations based on No. of Tweets")

plt.show()

We see that the 'Location' data is not clean: countries and cities are mixed together, and there is no standardization (for example, both 'USA' and 'United States' appear).

In [None]:
#Top locations with highest percentage of disaster tweets

# Keep only the tweets that come from the ten most frequent locations.
top_loc = list(input.location.value_counts().iloc[:10].index)
top_loc_tweets = input[input.location.isin(top_loc)]

# Mean of `target` per location, highest first.
# Select 'target' BEFORE aggregating: calling .mean() on the whole grouped frame
# also tries to average the text columns, which raises TypeError on pandas >= 2.0
# (and silently dropped them, with a warning, on later 1.x releases).
loc_dis = top_loc_tweets.groupby('location')['target'].mean().sort_values(ascending = False)

plt.figure(figsize = (10,6))
sns.barplot(x= loc_dis.index, y = loc_dis)
plt.title('% of disaster tweets for top 10 locations')
plt.xticks(rotation = 85)
# Horizontal reference line: mean of `target` over the whole dataset.
plt.axhline(np.mean(input.target))
plt.show()




UK, US and India (along with variants of their names) are the locations with the highest percentage of disaster tweets.

Clean up the text column by removing links and extra whitespace, storing the result in a new 'clean_text' column. Create separate columns for hashtags and mentions.



In [None]:
import re

def text_clean(text):
    """Return `text` with links removed and whitespace normalised.

    Steps, in order: drop every http:// or https:// link, turn line breaks
    into spaces, collapse each run of whitespace into one space, and trim
    leading/trailing whitespace.
    """
    without_links = re.sub(r'https?://\S+', '', text)
    single_line = re.sub(r'\n', ' ', without_links)
    collapsed = re.sub(r'\s+', ' ', single_line)
    return collapsed.strip()

In [None]:
def hash_tags(text):
    """Return the hashtags in `text`, without the '#', joined by single spaces.

    Returns the placeholder string 'no' when the text contains no hashtag.
    """
    tags = [tag[1:] for tag in re.findall(r"#\w+", text)]
    if not tags:
        return 'no'
    return " ".join(tags)
                    
def mentions(text):
    """Return the @-mentions in `text`, without the '@', joined by single spaces.

    Returns the placeholder string 'no' when the text contains no mention.
    """
    handles = [handle[1:] for handle in re.findall(r"@\w+", text)]
    if not handles:
        return 'no'
    return " ".join(handles)
                    

def tweet_process(df):
    """Add 'clean_text', 'hashtags' and 'mentions' columns derived from df['text'].

    The columns are written onto `df` itself (the frame is modified in place)
    and the same frame is returned.
    """
    raw_text = df['text']
    derived_columns = (
        ('clean_text', text_clean),
        ('hashtags', hash_tags),
        ('mentions', mentions),
    )
    for column, extractor in derived_columns:
        df[column] = raw_text.apply(extractor)
    return df
                    
# Add the derived columns; tweet_process writes them onto the frame it is given
# and returns that same frame.
input = tweet_process(input)
# Plain-text preview of the first five rows, now including the new columns.
print(input.head(5))

Let's find the top hashtags, along with the percentage of disaster tweets for each of them.

In [None]:
# Top 10 hashtags
# Each value in the 'hashtags' column is the space-joined list of hashtags of one
# tweet, or the placeholder 'no' when the tweet has none (see hash_tags).

# Drop the placeholder by label instead of assuming it is the first row of
# value_counts(), then keep the ten most frequent values. Computed once and
# reused for both figures.
top_hashtags = input.hashtags.value_counts().drop('no', errors='ignore').head(10).index

plt.figure(figsize=(10,6))
sns.countplot(y=input.hashtags, order=top_hashtags)
plt.title("Top 10 hashtags")

plt.show()


#Percentage of disaster tweets

ht_tweets = input[input.hashtags.isin(top_hashtags)]
# Select 'target' BEFORE aggregating: .mean() on the whole grouped frame also
# tries to average the text columns, which raises TypeError on pandas >= 2.0.
top_ht = ht_tweets.groupby('hashtags')['target'].mean().sort_values(ascending=False)


plt.figure(figsize = (10,6))
sns.barplot(x= top_ht.index, y = top_ht)
plt.title('% of disaster tweets for top 10 hashtags')
plt.xticks(rotation=85)
plt.show()


Finding the most frequent words and bi-grams

In [None]:
from nltk import word_tokenize, FreqDist
from wordcloud import STOPWORDS

# Copy wordcloud's STOPWORDS into a set named `stopwords`, used by the token filters in later cells.
stopwords = set(STOPWORDS)




In [None]:

# Frequency of every lower-cased, purely alphabetic, non-stopword token
# across all cleaned tweets (joined into one string before tokenising).
word_freq = FreqDist(
    token
    for token in word_tokenize(' '.join(input['clean_text']).lower())
    if token not in stopwords and token.isalpha()
)

# One row per word with a single 'count' column; keep the 20 largest counts.
df_freq = pd.DataFrame.from_dict(word_freq, orient='index', columns=['count'])
top20w = df_freq.sort_values(by='count', ascending=False).head(20)
print(top20w)

We can see that the list is not clean.

In [None]:
# Horizontal bar chart of the 20 most frequent words and their counts.
plt.figure(figsize=(10, 6))
sns.barplot(x=top20w['count'], y=top20w.index)
plt.title("Most frequent words")
plt.show()

We try to see how the most frequent words change for disaster and non-disaster tweets

In [None]:
def _word_count_table(texts):
    """Count the words in `texts` and tabulate them.

    The texts are joined with spaces, lower-cased and tokenised with
    word_tokenize; tokens that are in `stopwords` or are not purely
    alphabetic are dropped.

    Returns a (FreqDist, DataFrame) pair; the frame has one row per word
    and a single 'count' column.
    """
    tokens = word_tokenize(' '.join(texts).lower())
    freq = FreqDist(w for w in tokens if w not in stopwords and w.isalpha())
    return freq, pd.DataFrame.from_dict(freq, orient='index', columns=['count'])


plt.figure(figsize = (18,6))

# Left panel: tweets with target == 1.
plt.subplot(121)
word_freq1, df_freq1 = _word_count_table(input.loc[input.target==1, 'clean_text'])
top20w1 = df_freq1.sort_values('count',ascending=False).head(20)
sns.barplot(y=top20w1.index, x= top20w1['count'])
plt.title("Most frequent words for disaster tweets")

# Right panel: tweets with target == 0.
plt.subplot(122)
word_freq2, df_freq2 = _word_count_table(input.loc[input.target==0, 'clean_text'])
top20w2 = df_freq2.sort_values('count',ascending=False).head(20)
sns.barplot(y=top20w2.index, x= top20w2['count'])
plt.title("Most frequent words for non-disaster tweets")

plt.show()

We can see that words such as 'fire', 'volcano' and 'thunderstorm' appear in disaster tweets, while non-disaster tweets show no such pattern. Let's now check whether bigrams show clearer patterns.

In [None]:
from nltk import bigrams

def _tweet_bigrams(texts):
    """Return the list of word bigrams found in `texts`, built one tweet at a time.

    Each text is lower-cased and tokenised with word_tokenize; tokens that are
    in `stopwords` or are not purely alphabetic are removed before pairing.

    Pairing is done per tweet on purpose: joining all tweets into one string
    first would pair the last word of one tweet with the first word of the
    next and count bigrams that never occur in any tweet.
    """
    pairs = []
    for text in texts:
        words = [w for w in word_tokenize(text.lower())
                 if w not in stopwords and w.isalpha()]
        pairs.extend(bigrams(words))
    return pairs


plt.figure(figsize = (22,10))

# Left panel: tweets with target == 1.
plt.subplot(121)

bigram_1 = _tweet_bigrams(input.loc[input.target==1, 'clean_text'])

word_freq_b1 = FreqDist(bigram_1)
df_freq_b1 = pd.DataFrame.from_dict(word_freq_b1, orient = 'index', columns = ['count'])
# The keys are (word, word) tuples; turn each into a "word word" label for the axis.
df_freq_b1.index = df_freq_b1.index.map(lambda x: ' '.join(x))

top20w_b1 = df_freq_b1.sort_values('count',ascending=False).head(20)
sns.barplot(y=top20w_b1.index, x= top20w_b1['count'])
plt.title("Most frequent bigrams for disaster tweets")


# Right panel: tweets with target == 0.
plt.subplot(122)

bigram_2 = _tweet_bigrams(input.loc[input.target==0, 'clean_text'])

word_freq_b2 = FreqDist(bigram_2)
df_freq_b2 = pd.DataFrame.from_dict(word_freq_b2, orient = 'index', columns = ['count'])
df_freq_b2.index = df_freq_b2.index.map(lambda x: ' '.join(x))

top20w_b2 = df_freq_b2.sort_values('count',ascending=False).head(20)
sns.barplot(y=top20w_b2.index, x= top20w_b2['count'])
plt.title("Most frequent bigrams for non-disaster tweets")


plt.show()

Most bigrams in the disaster tweets are in line with expectations, with 'taal volcano' and 'severe thunderstorms' being the most frequent. There is no clear pattern in the non-disaster tweets, although we do find some disaster-related bigrams such as 'taal volcano', 'heat wave' and 'bush fires'; they are not at the top of the list.