# Natural Language Processing with Disaster Tweets

In [30]:
# importing all required packages

import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import time, re
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder
from sklearn import feature_extraction, linear_model
import seaborn as sns
from sklearn.model_selection import train_test_split

## EDA

In [45]:
# reading training data

train_df = pd.read_csv('nlp-getting-started/train.csv')
print('Number of data points : ', train_df.shape[0])
print('Number of features : ', train_df.shape[1])
print('Features : ', train_df.columns.values)
train_df

Number of data points :  7613
Number of features :  5
Features :  ['id' 'keyword' 'location' 'text' 'target']


Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1
...,...,...,...,...,...
7608,10869,,,Two giant cranes holding a bridge collapse int...,1
7609,10870,,,@aria_ahrary @TheTawniest The out of control w...,1
7610,10871,,,M1.94 [01:04 UTC]?5km S of Volcano Hawaii. htt...,1
7611,10872,,,Police investigating after an e-bike collided ...,1


In [32]:
# reading test data

test_df = pd.read_csv('nlp-getting-started/test.csv')
print('Number of data points : ', test_df.shape[0])
print('Number of features : ', test_df.shape[1])
print('Features : ', test_df.columns.values)
test_df

Number of data points :  3263
Number of features :  4
Features :  ['id' 'keyword' 'location' 'text']


Unnamed: 0,id,keyword,location,text
0,0,,,Just happened a terrible car crash
1,2,,,"Heard about #earthquake is different cities, s..."
2,3,,,"there is a forest fire at spot pond, geese are..."
3,9,,,Apocalypse lighting. #Spokane #wildfires
4,11,,,Typhoon Soudelor kills 28 in China and Taiwan
...,...,...,...,...
3258,10861,,,EARTHQUAKE SAFETY LOS ANGELES ÛÒ SAFETY FASTE...
3259,10865,,,Storm in RI worse than last hurricane. My city...
3260,10868,,,Green Line derailment in Chicago http://t.co/U...
3261,10874,,,MEG issues Hazardous Weather Outlook (HWO) htt...


In [33]:
# checking the details of features provided

train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7613 entries, 0 to 7612
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   id        7613 non-null   int64 
 1   keyword   7552 non-null   object
 2   location  5080 non-null   object
 3   text      7613 non-null   object
 4   target    7613 non-null   int64 
dtypes: int64(2), object(3)
memory usage: 297.5+ KB


In [34]:
train_df['target'].value_counts()

0    4342
1    3271
Name: target, dtype: int64

In [35]:
train_df.nunique()

id          7613
keyword      221
location    3341
text        7503
target         2
dtype: int64

#### Observations:

1. Training data is balanced.
2. There are no tweets which has empty text feature
3. There are few tweets which have duplicate texts
4. There is a big chunk of data where location of the user is not mentioned


## Text pre-processing

In [36]:
train_df['text']=train_df['text'].apply(lambda x: x.lower())

In [37]:
train_df['text']=train_df['text'].apply(lambda x: x.replace("#", ""))

#### Removing URLs

In [38]:
# function for url removal

def remove_URL(text):
    url = re.compile(r'https?://\S+|www\.\S+')
    return url.sub(r'',text)
train_df['text']=train_df['text'].apply(lambda x : remove_URL(x))

In [46]:
def remove_mention(text):
    return re.sub("@[A-Za-z0-9_]+","", text)
train_df['text']=train_df['text'].apply(lambda x : remove_mention(x))

In [47]:
train_df

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1
...,...,...,...,...,...
7608,10869,,,Two giant cranes holding a bridge collapse int...,1
7609,10870,,,The out of control wild fires in California ...,1
7610,10871,,,M1.94 [01:04 UTC]?5km S of Volcano Hawaii. htt...,1
7611,10872,,,Police investigating after an e-bike collided ...,1


#### Stemming

In [41]:
# train_df['text'] = train_df['text'].apply(lambda x: [stemmer.stem(y) for y in x.split()])

In [42]:
# function for stemming
import nltk
stemmer = nltk.porter.PorterStemmer()
def get_stem(df):
    for ind in df.index:
        df['text'][ind] = ' '.join([stemmer.stem(word) for word in df['text'][ind].split()])
    return df

In [43]:
get_stem(train_df)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['text'][ind] = ' '.join([stemmer.stem(word) for word in df['text'][ind].split()])


Unnamed: 0,id,keyword,location,text,target
0,1,,,our deed are the reason of thi earthquak may a...,1
1,4,,,forest fire near la rong sask. canada,1
2,5,,,all resid ask to 'shelter in place' are be not...,1
3,6,,,"13,000 peopl receiv wildfir evacu order in cal...",1
4,7,,,just got sent thi photo from rubi alaska as sm...,1
...,...,...,...,...,...
7608,10869,,,two giant crane hold a bridg collaps into near...,1
7609,10870,,,_ahrari the out of control wild fire in califo...,1
7610,10871,,,m1.94 [01:04 utc]?5km s of volcano hawaii.,1
7611,10872,,,polic investig after an e-bik collid with a ca...,1


In [44]:
train_df

Unnamed: 0,id,keyword,location,text,target
0,1,,,our deed are the reason of thi earthquak may a...,1
1,4,,,forest fire near la rong sask. canada,1
2,5,,,all resid ask to 'shelter in place' are be not...,1
3,6,,,"13,000 peopl receiv wildfir evacu order in cal...",1
4,7,,,just got sent thi photo from rubi alaska as sm...,1
...,...,...,...,...,...
7608,10869,,,two giant crane hold a bridg collaps into near...,1
7609,10870,,,_ahrari the out of control wild fire in califo...,1
7610,10871,,,m1.94 [01:04 utc]?5km s of volcano hawaii.,1
7611,10872,,,polic investig after an e-bik collid with a ca...,1


#### Remove identifier

#### Remove duplicates from our dataset

In [14]:
indices = []
for i,row in train_df.iterrows():
    indices.append(i)

In [15]:
import itertools
stage1_dedupe_id = []
i = 0
j = 0
num_data_points = train_df.shape[0]
while i < num_data_points and j < num_data_points:
    
    previous_i = i

    # store the list of words of ith string in a, ex: a = ['tokidoki', 'The', 'Queen', 'of', 'Diamonds', 'Women's', 'Shirt', 'X-Large']
    a = train_df['text'].loc[indices[i]].split()

    # search for the similar products sequentially 
    j = i+1
    while j < num_data_points:

        # store the list of words of jth string in b, ex: b = ['tokidoki', 'The', 'Queen', 'of', 'Diamonds', 'Women's', 'Shirt', 'Small']
        b = train_df['text'].loc[indices[j]].split()

        # store the maximum length of two strings
        length = max(len(a), len(b))

        # count is used to store the number of words that are matched in both strings
        count  = 0

        # itertools.zip_longest(a,b): will map the corresponding words in both strings, it will appened None in case of unequal strings
        # example: a =['a', 'b', 'c', 'd']
        # b = ['a', 'b', 'd']
        # itertools.zip_longest(a,b): will give [('a','a'), ('b','b'), ('c','d'), ('d', None)]
        for k in itertools.zip_longest(a,b): 
            if (k[0] == k[1]):
                count += 1

        # if the number of words in which both strings differ are > 2 , we are considering it as those two apperals are different
        # if the number of words in which both strings differ are < 2 , we are considering it as those two apperals are same, hence we are ignoring them
        if (length - count) > 2: # number of words in which both sensences differ
            # if both strings are differ by more than 2 words we include the 1st string index
            stage1_dedupe_id.append(train_df['id'].loc[indices[i]])


            # start searching for similar apperals corresponds 2nd string
            i = j
            break
        else:
            print('1st text : ','i ',i, train_df['text'][i])
            print('2nd text : ','j ',j, train_df['text'][j])
            j += 1
    if previous_i == i:
        break

1st text :  i  114 320 [ir] icemoon [aftershock] | | @djicemoon | dubstep trapmus dnb edm danc icesû_
2nd text :  j  115 320 [ir] icemoon [aftershock] | | @djicemoon | dubstep trapmus dnb edm danc icesû_
1st text :  i  114 320 [ir] icemoon [aftershock] | | @djicemoon | dubstep trapmus dnb edm danc icesû_
2nd text :  j  116 320 [ir] icemoon [aftershock] | | @djicemoon | dubstep trapmus dnb edm danc icesû_
1st text :  i  118 320 [ir] icemoon [aftershock] | | @djicemoon | dubstep trapmus dnb edm danc icesû_
2nd text :  j  119 320 [ir] icemoon [aftershock] | | @djicemoon | dubstep trapmus dnb edm danc icesû_
1st text :  i  190 twelv fear kill in pakistani air ambul helicopt crash
2nd text :  j  191 twelv fear kill in pakistani air ambul helicopt crash
1st text :  i  192 ambul sprinter automat frontlin vehicl choic of 14 lez compliant | ebay
2nd text :  j  193 ambul sprinter automat frontlin vehicl choic of 14 lez compliant | ebay
1st text :  i  201 twelv fear kill in pakistani air am

1st text :  i  5299 famili to sue over legionnaires: more than 40 famili affect by the fatal outbreak of legionnaires' disea...
2nd text :  j  5300 famili to sue over legionnaires: more than 40 famili affect by the fatal outbreak of legionnaires' disea...
1st text :  i  5304 famili to sue over legionnaires: more than 40 famili affect by the fatal outbreak of legionnaires' disea...
2nd text :  j  5305 famili to sue over legionnaires: more than 40 famili affect by the fatal outbreak of legionnaires' disea...
1st text :  i  5304 famili to sue over legionnaires: more than 40 famili affect by the fatal outbreak of legionnaires' disea...
2nd text :  j  5306 famili to sue over legionnaires: more than 40 famili affect by the fatal outbreak of legionnaires' disea...
1st text :  i  5304 famili to sue over legionnaires: more than 40 famili affect by the fatal outbreak of legionnaires' disea...
2nd text :  j  5307 famili to sue over legionnaires: more than 40 famili affect by the fatal outbreak of

In [16]:
train_df_dedupe = train_df.loc[train_df['id'].isin(stage1_dedupe_id)]
train_df_dedupe

Unnamed: 0,id,keyword,location,text,target
0,1,,,our deed are the reason of thi earthquak may a...,1
1,4,,,forest fire near la rong sask. canada,1
2,5,,,all resid ask to 'shelter in place' are be not...,1
3,6,,,"13,000 peopl receiv wildfir evacu order in cal...",1
4,7,,,just got sent thi photo from rubi alaska as sm...,1
...,...,...,...,...,...
7607,10867,,,stormchas violent record break ef-5 el reno ok...,1
7608,10869,,,two giant crane hold a bridg collaps into near...,1
7609,10870,,,@aria_ahrari @thetawniest the out of control w...,1
7610,10871,,,m1.94 [01:04 utc]?5km s of volcano hawaii.,1


In [17]:
train_df_dedupe.shape

(7441, 5)

#### Remove Stopwords

In [18]:
# removing stopwords
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
print(ENGLISH_STOP_WORDS)

def nlp_preprocessing(total_text, index, column):
    if type(total_text) is not int:
        string = ""
        for words in total_text.split():
            # remove the special chars in review like '"#$@!%^&*()_+-~?>< etc.
            word = ("".join(e for e in words if e.isalnum()))
            # Conver all letters to lower-case
            word = word.lower()
            # stop-word removal
            if not word in ENGLISH_STOP_WORDS:
                string += word + " "
        train_df_dedupe[column][index] = string

frozenset({'ltd', 'full', 'whom', 'with', 'thick', 'else', 'rather', 'then', 'un', 'fire', 'others', 'nor', 'wherein', 'front', 'alone', 'same', 'again', 'become', 'nine', 'cant', 'nowhere', 'former', 'three', 'within', 'four', 'put', 'be', 'who', 'now', 'other', 'therefore', 'becoming', 'afterwards', 'above', 'beyond', 'seem', 'whole', 'eight', 'themselves', 'something', 'find', 'herself', 'it', 'the', 'her', 'enough', 'so', 'i', 'after', 'am', 'latter', 'thru', 'five', 'us', 'when', 'somewhere', 'before', 'thereafter', 'could', 'either', 'yourself', 'system', 'but', 'well', 'empty', 'than', 'yet', 'however', 'what', 're', 'do', 'whether', 'wherever', 'sometime', 'name', 'own', 'or', 'anything', 'will', 'whereafter', 'myself', 'take', 'formerly', 'part', 'another', 'none', 'fill', 'also', 'is', 'twelve', 'whenever', 'which', 'me', 'whence', 'any', 'toward', 'seeming', 'everything', 'its', 'etc', 'whatever', 'every', 'ours', 'this', 'go', 'both', 'neither', 'on', 'due', 'describe', 'th

In [19]:
start_time = time.time()
# we take each title and we text-preprocess it.
for index, row in train_df_dedupe.iterrows():
    nlp_preprocessing(row['text'], index, 'text')
# we print the time it took to preprocess whole titles 
print('Removed stopwords in',time.time() - start_time, "seconds")

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_df_dedupe[column][index] = string


Removed stopwords in 2.33223557472229 seconds


In [20]:
train_df_dedupe.head(10)

Unnamed: 0,id,keyword,location,text,target
0,1,,,deed reason thi earthquak allah forgiv,1
1,4,,,forest near la rong sask canada,1
2,5,,,resid ask shelter place notifi officers evacu ...,1
3,6,,,13000 peopl receiv wildfir evacu order califor...,1
4,7,,,just got sent thi photo rubi alaska smoke wild...,1
5,8,,,rockyfir updat california hwy 20 close direct...,1
6,10,,,flood disast heavi rain caus flash flood stree...,1
7,13,,,im hill woods,1
8,14,,,emerg evacu happen build street,1
9,15,,,im afraid tornado come area,1


In [21]:
count_vectorizer = feature_extraction.text.CountVectorizer()

In [22]:
train_vectors = count_vectorizer.fit_transform(train_df_dedupe["text"])

In [23]:
train_vectors

<7441x15949 sparse matrix of type '<class 'numpy.int64'>'
	with 64642 stored elements in Compressed Sparse Row format>

In [24]:
test_vectors = count_vectorizer.transform(test_df["text"])

In [25]:
clf = linear_model.RidgeClassifier()

### Train Test Set Split

In [26]:
# targets

Y_train = train_df_dedupe['target']
Y_train.head()

0    1
1    1
2    1
3    1
4    1
Name: target, dtype: int64

In [27]:
# Splitting and training
train_inputs, test_inputs, train_targets, test_targets = train_test_split(X_train_temp, Y_train_temp, test_size=0.25, random_state=42)

NameError: name 'X_train_temp' is not defined