This particular challenge is perfect for data scientists looking to get started with Natural Language Processing. The competition dataset is not too big, and even if you don’t have much personal computing power, you can do all of the work in our free, no-setup, Jupyter Notebooks environment called Kaggle Notebooks.

Competition Description
Twitter has become an important communication channel in times of emergency.
The ubiquity of smartphones enables people to announce an emergency they’re observing in real time. Because of this, more agencies (e.g., disaster relief organizations and news agencies) are interested in programmatically monitoring Twitter.

But, it’s not always clear whether a person’s words are actually announcing a disaster.

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input tree and print the full path of every file found
for folder, _, names_in_folder in os.walk('/kaggle/input'):
    for file_name in names_in_folder:
        full_path = os.path.join(folder, file_name)
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Importing libraries

In [None]:

import pandas as pd ## Data Processing CSV file I/O

import numpy as np ## Linear Algebra

import matplotlib.pyplot as plt ## Visualization

import seaborn as sns ## Visualization

import tensorflow as tf

import re

# Reading the data with pandas

In [None]:
## Load the train and test CSV files, in that order, into two DataFrames
df_train, df_test = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../input/nlp-getting-started/train.csv',
        '../input/nlp-getting-started/test.csv',
    )
)

# Taking a quick peek at the data

In [None]:
## Show the first five rows of the training frame
df_train.head(n=5)

# Let's visualize some data xD

In [None]:
## Create an 8x5 inch figure and keep a handle on its axes
target_fig, target_ax = plt.subplots(figsize=(8, 5))

## Count the rows of df_train for each value of the `target` column
sns.countplot(
    data=df_train,
    x='target',
    palette='dark',
    linewidth=5,
    ax=target_ax,
)

## Render the figure
plt.show()

In [None]:
## The 20 most frequent keywords, most frequent first
keyword_counts = df_train['keyword'].value_counts().sort_values(ascending=False)
top_keywords = keyword_counts.iloc[0:20].index

## Horizontal count plot restricted to (and ordered by) those keywords
plt.figure(figsize=(9, 6))
sns.countplot(y=df_train.keyword, order=top_keywords)
plt.title("Keywords Count")

In [None]:
## Mean of `target` per keyword, computed once and reused for both rankings
keyword_target_mean = df_train.groupby('keyword')['target'].mean()

## The 15 keywords with the highest mean target
dis_tweet = keyword_target_mean.sort_values(ascending = False).head(15)

## The 15 keywords with the lowest mean target
non_dis_tweet = keyword_target_mean.sort_values().head(15)

## Draw the two rankings side by side. Plotting both on the same axes made
## the bars overlap and let the second title overwrite the first one.
fig, (ax_dis, ax_non_dis) = plt.subplots(1, 2, figsize = (18, 6))

## x/y are passed by keyword: recent seaborn releases no longer accept them
## as positional arguments.
## for disaster keywords
sns.barplot(x = dis_tweet.values, y = dis_tweet.index, color = 'red', ax = ax_dis)
ax_dis.set_title('Words with highest % of disaster')

## for non disaster keywords
sns.barplot(x = non_dis_tweet.values, y = non_dis_tweet.index, color = 'blue', ax = ax_non_dis)
ax_non_dis.set_title('Words with lowest % disaster')

## Keep the keyword labels of the right panel from colliding with the left one
fig.tight_layout()

plt.show()

In [None]:
## Locations that appear in at least 10 tweets
tweet_loc = df_train['location'].value_counts()
top_loc_dis = tweet_loc[tweet_loc.ge(10)].index.tolist()

## Rows whose location is one of those frequent locations
top_dis = df_train.loc[df_train['location'].isin(top_loc_dis)]

## Mean target per frequent location, highest first
top_loc = (
    top_dis.groupby('location')['target']
    .mean()
    .sort_values(ascending=False)
)

## One bar per location, with vertical tick labels so the names stay readable
plt.figure(figsize=(14, 7))
sns.barplot(y=top_loc, x=top_loc.index)
plt.xticks(rotation=90)

plt.show()


# Data Cleaning

In [None]:
## Replace missing keywords and locations with the literal string 'None'.
## The result is assigned back to the column rather than calling
## fillna(inplace=True) on `df_train.<column>`: that chained in-place call
## acts on an intermediate Series, triggers a FutureWarning in recent pandas
## and leaves df_train unchanged when Copy-on-Write is enabled.
df_train['keyword'] = df_train['keyword'].fillna('None')

df_train['location'] = df_train['location'].fillna('None')

## Number of missing values left in each column
df_train.isnull().sum()

In [None]:
# (pattern, replacement) pairs applied in order by `deconstruction`.
# All patterns are raw strings so regex escapes such as \S, \w and \d are not
# treated as (invalid) string escape sequences.
_DECONSTRUCTION_RULES = (
    # Irregular contractions first, before the generic "n't" rule below.
    (r"won't", "will not"),
    (r"can't", "can not"),
    # Generic contractions.
    (r"n't", " not"),
    (r"'re", " are"),
    (r"'s", " is"),
    (r"'d", " would"),
    (r"'ll", " will"),
    (r"'t", " not"),
    (r"'ve", " have"),
    (r"'m", " am"),
    # Noise removal: bracketed text, URLs, angle-bracket tags, newlines and
    # any word that contains a digit are each replaced by a single space.
    (r"\[.*?\]", " "),
    (r"https?://\S+|www\.\S+", " "),
    (r"<.*?>+", " "),
    (r"\n", " "),
    (r"\w*\d\w*", " "),
)


def deconstruction(phrase):
    """Expand English contractions and strip noise from a piece of text.

    The text is lower-cased, common contractions are expanded
    ("won't" -> "will not", "i'm" -> "i am", ...), and bracketed text, URLs,
    angle-bracket tags, newlines and words containing digits are replaced by
    a space.

    Args:
        phrase: The text to clean, as a ``str``.

    Returns:
        The cleaned, lower-cased text. Runs of spaces left behind by the
        removals are not collapsed.
    """
    # Lower-case first so the rules also match capitalised input such as
    # "Won't", "CAN'T" or "HTTP://...".
    phrase = phrase.lower()

    for pattern, replacement in _DECONSTRUCTION_RULES:
        phrase = re.sub(pattern, replacement, phrase)

    return phrase

## Run every training tweet through `deconstruction` and store the result
df_train['text'] = df_train['text'].map(deconstruction)
    
    

In [None]:
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.stem import PorterStemmer

## Shared stemmer and lemmatizer instances
ps = PorterStemmer()

lemmatizer = WordNetLemmatizer()

In [None]:
## Fetch the NLTK resources by name, one after another
for nltk_resource in ('stopwords', 'punkt', 'wordnet'):
    nltk.download(nltk_resource)