In [1]:
import pandas as pd

In [2]:
import nltk

In [3]:
from nltk.tokenize import word_tokenize

In [4]:
import re

In [5]:
train_data = pd.read_csv("./nlp-getting-started/train.csv")

In [6]:
train_data.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


# Number of records
Total records: 7613  
Null keyword: 61  (0.8% is missing)  
Null location: 2533 (33.3% is missing)

In [7]:
n_records = train_data.shape[0]
n_records

7613

In [8]:
n_miss_key = train_data[train_data['keyword'].isnull()].shape[0]
print(f"Missing keyword records: {n_miss_key}, proportion:{n_miss_key / n_records * 100:.1f} %")

Missing keyword records: 61, proportion:0.8 %


In [9]:
n_miss_loc = train_data[train_data['location'].isnull()].shape[0]
print(f"Missing keyword records: {n_miss_loc}, proportion:{n_miss_loc / n_records * 100:.1f} %")

Missing keyword records: 2533, proportion:33.3 %


## Duplicated data
61 tweets are duplicated  

In [10]:
train_data[train_data[['keyword','location','text']].duplicated()]

Unnamed: 0,id,keyword,location,text,target
48,68,ablaze,Live On Webcam,Check these out: http://t.co/rOI2NSmEJJ http:/...,0
115,165,aftershock,US,320 [IR] ICEMOON [AFTERSHOCK] | http://t.co/vA...,0
119,172,aftershock,Switzerland,320 [IR] ICEMOON [AFTERSHOCK] | http://t.co/TH...,0
164,238,airplane%20accident,,Experts in France begin examining airplane deb...,1
624,898,bioterrorism,,To fight bioterrorism sir.,0
...,...,...,...,...,...
6377,9113,suicide%20bomb,Nigeria,#Bestnaijamade: 16yr old PKK suicide bomber wh...,1
6378,9114,suicide%20bomb,Nigeria,#Bestnaijamade: 16yr old PKK suicide bomber wh...,1
6392,9135,suicide%20bomb,Nigeria,#Bestnaijamade: 16yr old PKK suicide bomber wh...,1
6449,9225,suicide%20bombing,,'Suicide bombing at [location named]...' #prem...,1


remove duplicated data

In [11]:
train_data = train_data[~train_data[['keyword','location','text']].duplicated()]

In [12]:
train_data.shape

(7552, 5)

# Keyword

222 types of keywords exists  
some keywords has % but otherwise, they seems to be proper data.

In [13]:
train_data['keyword'].unique(), len(train_data['keyword'].unique())

(array([nan, 'ablaze', 'accident', 'aftershock', 'airplane%20accident',
        'ambulance', 'annihilated', 'annihilation', 'apocalypse',
        'armageddon', 'army', 'arson', 'arsonist', 'attack', 'attacked',
        'avalanche', 'battle', 'bioterror', 'bioterrorism', 'blaze',
        'blazing', 'bleeding', 'blew%20up', 'blight', 'blizzard', 'blood',
        'bloody', 'blown%20up', 'body%20bag', 'body%20bagging',
        'body%20bags', 'bomb', 'bombed', 'bombing', 'bridge%20collapse',
        'buildings%20burning', 'buildings%20on%20fire', 'burned',
        'burning', 'burning%20buildings', 'bush%20fires', 'casualties',
        'casualty', 'catastrophe', 'catastrophic', 'chemical%20emergency',
        'cliff%20fall', 'collapse', 'collapsed', 'collide', 'collided',
        'collision', 'crash', 'crashed', 'crush', 'crushed', 'curfew',
        'cyclone', 'damage', 'danger', 'dead', 'death', 'deaths', 'debris',
        'deluge', 'deluged', 'demolish', 'demolished', 'demolition',
       

To reduce duplicated data, let's remitize the keywords

In [14]:
from nltk.stem import WordNetLemmatizer

In [15]:
lemmatizer = WordNetLemmatizer()

In [16]:
train_data['keyword'] = train_data['keyword'].fillna('')

In [17]:
train_data['keyword'] = train_data['keyword'].str.replace('%', ' ')

remove digit from keywords

In [18]:
train_data['keyword'] = train_data['keyword'].str.replace('\d', '', regex=True)

Lemmatize the keywords

In [19]:
train_data['keyword_lem'] = train_data['keyword'].apply(lambda x: lemmatizer.lemmatize(x))

In [20]:
train_data['keyword_lem'].unique(), len(train_data['keyword_lem'].unique())

(array(['', 'ablaze', 'accident', 'aftershock', 'airplane accident',
        'ambulance', 'annihilated', 'annihilation', 'apocalypse',
        'armageddon', 'army', 'arson', 'arsonist', 'attack', 'attacked',
        'avalanche', 'battle', 'bioterror', 'bioterrorism', 'blaze',
        'blazing', 'bleeding', 'blew up', 'blight', 'blizzard', 'blood',
        'bloody', 'blown up', 'body bag', 'body bagging', 'body bags',
        'bomb', 'bombed', 'bombing', 'bridge collapse',
        'buildings burning', 'buildings on fire', 'burned', 'burning',
        'burning buildings', 'bush fires', 'casualty', 'catastrophe',
        'catastrophic', 'chemical emergency', 'cliff fall', 'collapse',
        'collapsed', 'collide', 'collided', 'collision', 'crash',
        'crashed', 'crush', 'crushed', 'curfew', 'cyclone', 'damage',
        'danger', 'dead', 'death', 'debris', 'deluge', 'deluged',
        'demolish', 'demolished', 'demolition', 'derail', 'derailed',
        'derailment', 'desolate', 'des

In [187]:
key_pivot = pd.pivot_table(train_data[['keyword_lem','id','target']], index='keyword_lem', columns='target', aggfunc='count')
key_pivot = key_pivot.fillna(0)

In [189]:
key_pivot.sort_values(by=('id',0), ascending=False)

Unnamed: 0_level_0,id,id
target,0,1
keyword_lem,Unnamed: 1_level_2,Unnamed: 2_level_2
siren,57.0,12.0
injury,50.0,18.0
weapon,47.0,31.0
fatality,46.0,36.0
body bags,40.0,1.0
...,...,...
suicide bomber,1.0,30.0
suicide bombing,1.0,31.0
debris,0.0,37.0
wreckage,0.0,39.0


Target can be separated by keyword  
Some keywords such as debris, wreckage indicates that the tweet is about disaster.  

# Location

3342 locations exists

In [21]:
train_data['location'].unique(), len(train_data['location'].unique())

(array([nan, 'Birmingham', 'Est. September 2012 - Bristol', ...,
        'Vancouver, Canada', 'London ', 'Lincoln'], dtype=object),
 3342)

In [22]:
train_data[['location']].sample(10)

Unnamed: 0,location
1992,"Columbia, SC"
5637,Skyhold
3008,SD |Norway| KSA
2407,The Desert of the Real
546,"Danville, VA"
6120,
1086,
5631,Melbourne Australia
2232,World
3468,New York


In [23]:
remove_special = re.compile("[.;:!\'?,\"()\d\[\]<>%#\^&=@\+\-\$\#\\/*]")

In [24]:
import numpy as np

In [25]:
import math

In [26]:
def loc_prep(data):
    data = data.lower()
    data = remove_special.sub(' ', data)
    data = re.sub(' +', ' ', data)
    return data

In [27]:
train_data['location'] = train_data['location'].fillna(' ')

In [28]:
train_data['location_prep'] = train_data['location'].apply(loc_prep)

In [29]:
train_data['location_prep'].unique(), len(train_data['location_prep'].unique())

(array([' ', 'birmingham', 'est september bristol', ..., 'denton texas',
        ' newcastleupontyne uk', 'lincoln'], dtype=object),
 3075)

location data is not accurate.  
some location data such as jupiter, everywhere seems no meaning.  
Also there are many records that has no location.  
It would be better to focus on keywords and text  

In [30]:
train_data[['location_prep']].sample(10)

Unnamed: 0,location_prep
3534,south africa
6693,
1717,
1252,concord nh
2824,new york city
6935,north carolina
2853,
559,dallas tx
1773,denver colorado
2614,


In [31]:
loc_pivot = pd.pivot_table(train_data[['location_prep','id','target']], index='location_prep', columns='target', aggfunc='count')
loc_pivot = loc_pivot.fillna(0)

location does not show big difference to identify target

In [32]:
loc_pivot.sort_values(by=('id',0), ascending=False).loc[loc_pivot[('id',0)]>=10]

Unnamed: 0_level_0,id,id
target,0,1
location_prep,Unnamed: 1_level_2,Unnamed: 2_level_2
,1496.0,1088.0
new york,58.0,17.0
usa,37.0,67.0
london,32.0,17.0
united states,23.0,27.0
los angeles ca,20.0,8.0
canada,16.0,17.0
kenya,16.0,5.0
everywhere,14.0,5.0
uk,13.0,17.0


# Text
I referenced 'https://www.kaggle.com/code/rohitgarud/all-almost-data-preprocessing-techniques-for-nlp' to preprocessed text data.

In [33]:
pd.set_option('display.max_colwidth', None)

Needs to be cleaned up   
* 'urls'
* 'mention'
* 'tags', 
* abbreviation
* HTML

In [34]:
train_data[['text']].sample(10)

Unnamed: 0,text
3835,As firefighters make gains on #RockyFire Jerry Brown is heading to the area to meet with first responders tomorrow morning
7017,#Breaking144 Obama Declares Disaster for Typhoon-Devastated Saipan: Obama signs disaster declarat... http://t.co/M8CIKs60BX #AceNewsDesk
7366,When I breathe it sounds like a windstorm. Haha cool
2440,Service on the Green Line has resumed after an earlier derailment near Garfield with residual delays.
5909,I can't listen to Darude Sandstorm without expecting airhorns now
4115,Gotta love #summer in #Calgary. #yyc #hailstorm #crazyweather http://t.co/xQbWnLBBIu
5486,Yet another company trying to censor the Internet. Reddit has started to quarantine their content: http://t.co/pG4y3I5ciu #cc
738,@burberryant bleeding on the brain don't know the cause
2598,I always felt like the Namekians were black people and felt played when they died and the planet got destroyed ??
4122,IG: http://t.co/2WBiVKzJIP 'It's hailing again! #abstorm #yyc #hail #hailstorm #haildamage #yycweather #calgary #captureyyc #alberta #stoÛ_


### lower case

In [35]:
train_data['text_lower'] = train_data['text'].str.lower()

### contractions  
expand contractions such as can't -> can not

In [36]:
import contractions

In [38]:
train_data['text_noContractions'] = train_data['text_lower'].apply(contractions.fix)

In [40]:
train_data.loc[train_data['text_lower'].str.contains("can't"), ['text_lower', 'text_noContractions']].sample(3)

Unnamed: 0,text_lower,text_noContractions
6244,@cacheadvance besides your nasty thunderstorm or snowstorm nah. can't say that i have.,@cacheadvance besides your nasty thunderstorm or snowstorm nah. cannot say that i have.
6893,@malabamiandsons she's proper traumatised that pepper is 'dead' i can't wait to see her face,@malabamiandsons she is proper traumatised that pepper is 'dead' i cannot wait to see her face
4961,@nprfreshair i really can't believe he is skipping out before the republican meltdown...i mean 'debate'.,@nprfreshair i really cannot believe he is skipping out before the republican meltdown...i mean 'debate'.


### urls  

In [41]:
def rem_urls(data):
    regex = re.compile(f"https?://(www\.)?(\w+)(\.\w+)(/\w*)?")
    data = re.sub(regex, "", data)
    return data

In [42]:
train_data['text_noURLs'] = train_data['text_noContractions'].apply(rem_urls)

In [43]:
train_data.loc[train_data['text_noContractions'].str.contains('http'), ['text_noContractions', 'text_noURLs']].sample(3)

Unnamed: 0,text_noContractions,text_noURLs
522,i bet you did not know i kick box too! https://t.co/rbrw8pwipj,i bet you did not know i kick box too!
397,mourning notices for stabbing arson victims stir û÷politics of griefûª in israel: posters for shira banki and a... http://t.co/3gz5zqqthe,mourning notices for stabbing arson victims stir û÷politics of griefûª in israel: posters for shira banki and a...
3618,let us fraction the vital need for our fatalities. how would you break it down in #education #econom http://t.co/zsqm8ihe1k,let us fraction the vital need for our fatalities. how would you break it down in #education #econom


### emails
one records in train dataset has email

In [90]:
train_data[train_data['text_noURLs'].str.contains("([A-Za-z0-9]+[.-_])*[A-Za-z0-9]+@[A-Za-z0-9-]+(\.[A-Z|a-z]{2,})+", regex=True)]

  train_data[train_data['text_noURLs'].str.contains("([A-Za-z0-9]+[.-_])*[A-Za-z0-9]+@[A-Za-z0-9-]+(\.[A-Z|a-z]{2,})+", regex=True)]


Unnamed: 0,id,keyword,location,text,target,keyword_lem,location_prep,text_lower,text_noContractions,text_noURLs,text_noEmail
2254,3229,deluged,Wellington,@TheSewphist whoever holds the address 'fuckface@wineisdumb.com' is going to be deluged in spam meant for me,0,deluged,wellington,@thesewphist whoever holds the address 'fuckface@wineisdumb.com' is going to be deluged in spam meant for me,@thesewphist whoever holds the address 'fuckface@wineisdumb.com' is going to be deluged in spam meant for me,@thesewphist whoever holds the address 'fuckface@wineisdumb.com' is going to be deluged in spam meant for me,@thesewphist whoever holds the address '' is going to be deluged in spam meant for me


In [91]:
def rem_emails(data):
    regex = re.compile("([A-Za-z0-9]+[.-_])*[A-Za-z0-9]+@[A-Za-z0-9-]+(\.[A-Z|a-z]{2,})+")
    data = re.sub(regex, "", data)
    return data

In [92]:
train_data['text_noEmail'] = train_data['text_noURLs'].apply(rem_emails)

In [94]:
train_data.loc[2254, ['text_noURLs', 'text_noEmail']]

text_noURLs     @thesewphist whoever holds the address 'fuckface@wineisdumb.com' is going to be deluged in spam meant for me
text_noEmail                           @thesewphist whoever holds the address '' is going to be deluged in spam meant for me
Name: 2254, dtype: object

### HTML

In [95]:
from bs4 import BeautifulSoup

In [96]:
train_data['text_noHTML'] = train_data['text_noEmail'].apply(lambda x: BeautifulSoup(x).get_text())



In [97]:
train_data.loc[train_data['text_noEmail'].str.contains('&gt'), ['text_noEmail', 'text_noHTML']].sample(3)

Unnamed: 0,text_noEmail,text_noHTML
2268,they have come back!! &gt;&gt; flying ant day: capital deluged by annual swarm of winged insects,they have come back!! >> flying ant day: capital deluged by annual swarm of winged insects
6555,today your life could change forever - #chronicillness cannot be avoided - it can be survived\n\njoin #mylifestory &gt;&gt;&gt;,today your life could change forever - #chronicillness cannot be avoided - it can be survived\n\njoin #mylifestory >>>
6661,@kinggerudo_ to the largest moblin's he would leaving the biggest one for red and fired. with one strike already the threat was reduced --&gt;,@kinggerudo_ to the largest moblin's he would leaving the biggest one for red and fired. with one strike already the threat was reduced -->


### Mentions (@)
Tweeter uses @ to call naems of people, so it is not helpful to classify disaster  
It should be done before removing @ alone.

In [98]:
def rem_mention(data):
    regex = re.compile('@\w+')
    data = re.sub(regex, '', data)
    return data

In [99]:
train_data['text_noMention'] = train_data['text_noHTML'].apply(rem_mention)

In [100]:
train_data.loc[train_data['text_noHTML'].str.contains('@\w+', regex=True), ['text_noHTML', 'text_noMention']].sample(3)

Unnamed: 0,text_noHTML,text_noMention
5156,@klavierstuk does not so lvg is forced into the market. may beat spurs and smaller teams with blind lcb. top 4/ cl teams will obliterate us.,does not so lvg is forced into the market. may beat spurs and smaller teams with blind lcb. top 4/ cl teams will obliterate us.
2490,@corleonedaboss there is not anything there its desolate because of its nature. the significance is that we were the first country to do it,there is not anything there its desolate because of its nature. the significance is that we were the first country to do it
1843,heard #skh on the radio for the first time. almost crashed the car. @5sos @ashton5sos @luke5sos @michael5sos @calum5sos,heard #skh on the radio for the first time. almost crashed the car.


### Emojis
reference says this should be discussed, and I would like to apply some emojis conversion  

In [146]:
train_data.loc[train_data['text_noMention'].str.contains(r'\;\)', regex=True)].sample(3)

Unnamed: 0,id,keyword,location,text,target,keyword_lem,location_prep,text_lower,text_noContractions,text_noURLs,text_noEmail,text_noHTML,text_noMention,text_noEmoji
5395,7696,panicking,"Petaluma, CA",@QuotesTTG Save the panicking for when you get to Helios. ;),0,panicking,petaluma ca,@quotesttg save the panicking for when you get to helios. ;),@quotesttg save the panicking for when you get to helios. ;),@quotesttg save the panicking for when you get to helios. ;),@quotesttg save the panicking for when you get to helios. ;),@quotesttg save the panicking for when you get to helios. ;),save the panicking for when you get to helios. ;),save the panicking for when you get to helios. happiness
3753,5332,fire,"St.Cloud, MN",Dear @CanonUSAimaging I brought it ;) #CanonBringIt #Fire #CanonTattoo #MN #TheresMoreWhereThatCameFrom http://t.co/tCXxHdJAs6,0,fire,st cloud mn,dear @canonusaimaging i brought it ;) #canonbringit #fire #canontattoo #mn #theresmorewherethatcamefrom http://t.co/tcxxhdjas6,dear @canonusaimaging i brought it ;) #canonbringit #fire #canontattoo #mn #theresmorewherethatcamefrom http://t.co/tcxxhdjas6,dear @canonusaimaging i brought it ;) #canonbringit #fire #canontattoo #mn #theresmorewherethatcamefrom,dear @canonusaimaging i brought it ;) #canonbringit #fire #canontattoo #mn #theresmorewherethatcamefrom,dear @canonusaimaging i brought it ;) #canonbringit #fire #canontattoo #mn #theresmorewherethatcamefrom,dear i brought it ;) #canonbringit #fire #canontattoo #mn #theresmorewherethatcamefrom,dear i brought it happiness #canonbringit #fire #canontattoo #mn #theresmorewherethatcamefrom
21,32,,,London is cool ;),0,,,london is cool ;),london is cool ;),london is cool ;),london is cool ;),london is cool ;),london is cool ;),london is cool happiness


In [137]:
train_data['text_noEmoji'] = train_data['text_noMention'].str.replace(':\(', 'sadness ')

  train_data['text_noEmoji'] = train_data['text_noMention'].str.replace(':\(', 'sadness ')


In [138]:
train_data['text_noEmoji'] = train_data['text_noEmoji'].str.replace(r':\)[$|\s]*', 'happiness ')

  train_data['text_noEmoji'] = train_data['text_noEmoji'].str.replace(r':\)[$|\s]*', 'happiness ')


In [145]:
train_data['text_noEmoji'] = train_data['text_noEmoji'].str.replace(r'\;\)[$|\s]*', 'happiness ')

  train_data['text_noEmoji'] = train_data['text_noEmoji'].str.replace(r'\;\)[$|\s]*', 'happiness ')


### Accent

In [148]:
from unidecode import unidecode

In [149]:
def rem_accent(data):
    data = unidecode(data)
    return data

In [150]:
train_data['text_noAccent'] = train_data['text_noEmoji'].apply(rem_accent)

### Unicode Characters
If tweet includes other languages, this approach might be wrong.

In [151]:
def rem_unicode(data):
    data = data.encode("ascii", "ignore").decode()
    return data

In [152]:
train_data['text_noUnicode'] = train_data['text_noAccent'].apply(rem_unicode)

### Punctuations

In [153]:
import string
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [154]:
def rem_punc(data):
    data = re.sub(f"[{string.punctuation}]", " ", data)
    return data

In [155]:
train_data['text_noPunct'] = train_data['text_noUnicode'].apply(rem_punc)

In [156]:
train_data.head()

Unnamed: 0,id,keyword,location,text,target,keyword_lem,location_prep,text_lower,text_noContractions,text_noURLs,text_noEmail,text_noHTML,text_noMention,text_noEmoji,text_noAccent,text_noUnicode,text_noPunct
0,1,,,Our Deeds are the Reason of this #earthquake May ALLAH Forgive us all,1,,,our deeds are the reason of this #earthquake may allah forgive us all,our deeds are the reason of this #earthquake may allah forgive us all,our deeds are the reason of this #earthquake may allah forgive us all,our deeds are the reason of this #earthquake may allah forgive us all,our deeds are the reason of this #earthquake may allah forgive us all,our deeds are the reason of this #earthquake may allah forgive us all,our deeds are the reason of this #earthquake may allah forgive us all,our deeds are the reason of this #earthquake may allah forgive us all,our deeds are the reason of this #earthquake may allah forgive us all,our deeds are the reason of this earthquake may allah forgive us all
1,4,,,Forest fire near La Ronge Sask. Canada,1,,,forest fire near la ronge sask. canada,forest fire near la ronge sask. canada,forest fire near la ronge sask. canada,forest fire near la ronge sask. canada,forest fire near la ronge sask. canada,forest fire near la ronge sask. canada,forest fire near la ronge sask. canada,forest fire near la ronge sask. canada,forest fire near la ronge sask. canada,forest fire near la ronge sask canada
2,5,,,All residents asked to 'shelter in place' are being notified by officers. No other evacuation or shelter in place orders are expected,1,,,all residents asked to 'shelter in place' are being notified by officers. no other evacuation or shelter in place orders are expected,all residents asked to 'shelter in place' are being notified by officers. no other evacuation or shelter in place orders are expected,all residents asked to 'shelter in place' are being notified by officers. no other evacuation or shelter in place orders are expected,all residents asked to 'shelter in place' are being notified by officers. no other evacuation or shelter in place orders are expected,all residents asked to 'shelter in place' are being notified by officers. no other evacuation or shelter in place orders are expected,all residents asked to 'shelter in place' are being notified by officers. no other evacuation or shelter in place orders are expected,all residents asked to 'shelter in place' are being notified by officers. no other evacuation or shelter in place orders are expected,all residents asked to 'shelter in place' are being notified by officers. no other evacuation or shelter in place orders are expected,all residents asked to 'shelter in place' are being notified by officers. no other evacuation or shelter in place orders are expected,all residents asked to shelter in place are being notified by officers no other evacuation or shelter in place orders are expected
3,6,,,"13,000 people receive #wildfires evacuation orders in California",1,,,"13,000 people receive #wildfires evacuation orders in california","13,000 people receive #wildfires evacuation orders in california","13,000 people receive #wildfires evacuation orders in california","13,000 people receive #wildfires evacuation orders in california","13,000 people receive #wildfires evacuation orders in california","13,000 people receive #wildfires evacuation orders in california","13,000 people receive #wildfires evacuation orders in california","13,000 people receive #wildfires evacuation orders in california","13,000 people receive #wildfires evacuation orders in california",13 000 people receive wildfires evacuation orders in california
4,7,,,Just got sent this photo from Ruby #Alaska as smoke from #wildfires pours into a school,1,,,just got sent this photo from ruby #alaska as smoke from #wildfires pours into a school,just got sent this photo from ruby #alaska as smoke from #wildfires pours into a school,just got sent this photo from ruby #alaska as smoke from #wildfires pours into a school,just got sent this photo from ruby #alaska as smoke from #wildfires pours into a school,just got sent this photo from ruby #alaska as smoke from #wildfires pours into a school,just got sent this photo from ruby #alaska as smoke from #wildfires pours into a school,just got sent this photo from ruby #alaska as smoke from #wildfires pours into a school,just got sent this photo from ruby #alaska as smoke from #wildfires pours into a school,just got sent this photo from ruby #alaska as smoke from #wildfires pours into a school,just got sent this photo from ruby alaska as smoke from wildfires pours into a school


### Numbers
numbers mights have some meaning, so replace them with '#'

In [163]:
def clean_numbers(data):
    data = re.sub('[0-9]{5,}', '#'*5, data)
    data = re.sub('[0-9]{4}', '#'*4, data)
    data = re.sub('[0-9]{3}', '#'*3, data)
    data = re.sub('[0-9]{2}', '#'*2, data)
    return data

In [164]:
train_data['text_noDigit'] = train_data['text_noPunct'].apply(clean_numbers)

In [167]:
train_data.loc[train_data['text_noPunct'].str.contains('\d'), ['text_noPunct', 'text_noDigit']].sample(3)

Unnamed: 0,text_noPunct,text_noDigit
3383,my school just put the evacuation alarms on accidently with 2 different trial exams happening are you kidding me,my school just put the evacuation alarms on accidently with 2 different trial exams happening are you kidding me
3568,update 1 russian food crematoria provoke outrage amid crisis famine memories russian society still recal,update 1 russian food crematoria provoke outrage amid crisis famine memories russian society still recal
2164,malaysia airlines flight 370 that disappeared 17months ago debris found south of the indian ocean,malaysia airlines flight ### that disappeared ##months ago debris found south of the indian ocean


### Stopwords

In [168]:
from nltk.corpus import stopwords

In [169]:
stop_words = set(stopwords.words('english'))

In [173]:
def rem_stopwords(data):
    return " ".join([word for word in str(data).split() if word not in stop_words])

In [178]:
train_data['text_noStop'] = train_data['text_noDigit'].apply(rem_stopwords)

### Extra space

In [175]:
def rem_extra_space(data):
    data = re.sub(' +', ' ', data).strip()
    return data

In [179]:
train_data['text_noExspace'] = train_data['text_noStop'].apply(rem_extra_space)

In [180]:
train_data.head()

Unnamed: 0,id,keyword,location,text,target,keyword_lem,location_prep,text_lower,text_noContractions,text_noURLs,text_noEmail,text_noHTML,text_noMention,text_noEmoji,text_noAccent,text_noUnicode,text_noPunct,text_noDigit,text_noStop,text_noExspace
0,1,,,Our Deeds are the Reason of this #earthquake May ALLAH Forgive us all,1,,,our deeds are the reason of this #earthquake may allah forgive us all,our deeds are the reason of this #earthquake may allah forgive us all,our deeds are the reason of this #earthquake may allah forgive us all,our deeds are the reason of this #earthquake may allah forgive us all,our deeds are the reason of this #earthquake may allah forgive us all,our deeds are the reason of this #earthquake may allah forgive us all,our deeds are the reason of this #earthquake may allah forgive us all,our deeds are the reason of this #earthquake may allah forgive us all,our deeds are the reason of this #earthquake may allah forgive us all,our deeds are the reason of this earthquake may allah forgive us all,our deeds are the reason of this earthquake may allah forgive us all,deeds reason earthquake may allah forgive us,deeds reason earthquake may allah forgive us
1,4,,,Forest fire near La Ronge Sask. Canada,1,,,forest fire near la ronge sask. canada,forest fire near la ronge sask. canada,forest fire near la ronge sask. canada,forest fire near la ronge sask. canada,forest fire near la ronge sask. canada,forest fire near la ronge sask. canada,forest fire near la ronge sask. canada,forest fire near la ronge sask. canada,forest fire near la ronge sask. canada,forest fire near la ronge sask canada,forest fire near la ronge sask canada,forest fire near la ronge sask canada,forest fire near la ronge sask canada
2,5,,,All residents asked to 'shelter in place' are being notified by officers. No other evacuation or shelter in place orders are expected,1,,,all residents asked to 'shelter in place' are being notified by officers. no other evacuation or shelter in place orders are expected,all residents asked to 'shelter in place' are being notified by officers. no other evacuation or shelter in place orders are expected,all residents asked to 'shelter in place' are being notified by officers. no other evacuation or shelter in place orders are expected,all residents asked to 'shelter in place' are being notified by officers. no other evacuation or shelter in place orders are expected,all residents asked to 'shelter in place' are being notified by officers. no other evacuation or shelter in place orders are expected,all residents asked to 'shelter in place' are being notified by officers. no other evacuation or shelter in place orders are expected,all residents asked to 'shelter in place' are being notified by officers. no other evacuation or shelter in place orders are expected,all residents asked to 'shelter in place' are being notified by officers. no other evacuation or shelter in place orders are expected,all residents asked to 'shelter in place' are being notified by officers. no other evacuation or shelter in place orders are expected,all residents asked to shelter in place are being notified by officers no other evacuation or shelter in place orders are expected,all residents asked to shelter in place are being notified by officers no other evacuation or shelter in place orders are expected,residents asked shelter place notified officers evacuation shelter place orders expected,residents asked shelter place notified officers evacuation shelter place orders expected
3,6,,,"13,000 people receive #wildfires evacuation orders in California",1,,,"13,000 people receive #wildfires evacuation orders in california","13,000 people receive #wildfires evacuation orders in california","13,000 people receive #wildfires evacuation orders in california","13,000 people receive #wildfires evacuation orders in california","13,000 people receive #wildfires evacuation orders in california","13,000 people receive #wildfires evacuation orders in california","13,000 people receive #wildfires evacuation orders in california","13,000 people receive #wildfires evacuation orders in california","13,000 people receive #wildfires evacuation orders in california",13 000 people receive wildfires evacuation orders in california,## ### people receive wildfires evacuation orders in california,## ### people receive wildfires evacuation orders california,## ### people receive wildfires evacuation orders california
4,7,,,Just got sent this photo from Ruby #Alaska as smoke from #wildfires pours into a school,1,,,just got sent this photo from ruby #alaska as smoke from #wildfires pours into a school,just got sent this photo from ruby #alaska as smoke from #wildfires pours into a school,just got sent this photo from ruby #alaska as smoke from #wildfires pours into a school,just got sent this photo from ruby #alaska as smoke from #wildfires pours into a school,just got sent this photo from ruby #alaska as smoke from #wildfires pours into a school,just got sent this photo from ruby #alaska as smoke from #wildfires pours into a school,just got sent this photo from ruby #alaska as smoke from #wildfires pours into a school,just got sent this photo from ruby #alaska as smoke from #wildfires pours into a school,just got sent this photo from ruby #alaska as smoke from #wildfires pours into a school,just got sent this photo from ruby alaska as smoke from wildfires pours into a school,just got sent this photo from ruby alaska as smoke from wildfires pours into a school,got sent photo ruby alaska smoke wildfires pours school,got sent photo ruby alaska smoke wildfires pours school


### Lemmatization

In [181]:
from nltk.stem import WordNetLemmatizer

In [182]:
lemmatizer = WordNetLemmatizer()

In [183]:
def lemmatize_data(data):
    words = [lemmatizer.lemmatize(word) for word in data.split()]
    data = ' '.join(words)
    return data

In [184]:
train_data['text_Lemmatised'] = train_data['text_noExspace'].apply(lemmatize_data)

In [186]:
train_data.sample(3)

Unnamed: 0,id,keyword,location,text,target,keyword_lem,location_prep,text_lower,text_noContractions,text_noURLs,...,text_noHTML,text_noMention,text_noEmoji,text_noAccent,text_noUnicode,text_noPunct,text_noDigit,text_noStop,text_noExspace,text_Lemmatised
7322,10482,wild fires,Canada,@WBCShirl2 Yes God doessnt change he says not to rejoice over the fall of people or calamities like wild fires ect you wanna be punished?,0,wild fires,canada,@wbcshirl2 yes god doessnt change he says not to rejoice over the fall of people or calamities like wild fires ect you wanna be punished?,@wbcshirl2 yes god doessnt change he says not to rejoice over the fall of people or calamities like wild fires ect you want to be punished?,@wbcshirl2 yes god doessnt change he says not to rejoice over the fall of people or calamities like wild fires ect you want to be punished?,...,@wbcshirl2 yes god doessnt change he says not to rejoice over the fall of people or calamities like wild fires ect you want to be punished?,yes god doessnt change he says not to rejoice over the fall of people or calamities like wild fires ect you want to be punished?,yes god doessnt change he says not to rejoice over the fall of people or calamities like wild fires ect you want to be punished?,yes god doessnt change he says not to rejoice over the fall of people or calamities like wild fires ect you want to be punished?,yes god doessnt change he says not to rejoice over the fall of people or calamities like wild fires ect you want to be punished?,yes god doessnt change he says not to rejoice over the fall of people or calamities like wild fires ect you want to be punished,yes god doessnt change he says not to rejoice over the fall of people or calamities like wild fires ect you want to be punished,yes god doessnt change says rejoice fall people calamities like wild fires ect want punished,yes god doessnt change says rejoice fall people calamities like wild fires ect want punished,yes god doessnt change say rejoice fall people calamity like wild fire ect want punished
6038,8631,seismic,"Madison, Wisconsin, USA",#OilandGas Exploration Takes Seismic Shift in #Gabon to #Somalia http://t.co/oHHolJ9vEV via @business,1,seismic,madison wisconsin usa,#oilandgas exploration takes seismic shift in #gabon to #somalia http://t.co/ohholj9vev via @business,#oilandgas exploration takes seismic shift in #gabon to #somalia http://t.co/ohholj9vev via @business,#oilandgas exploration takes seismic shift in #gabon to #somalia via @business,...,#oilandgas exploration takes seismic shift in #gabon to #somalia via @business,#oilandgas exploration takes seismic shift in #gabon to #somalia via,#oilandgas exploration takes seismic shift in #gabon to #somalia via,#oilandgas exploration takes seismic shift in #gabon to #somalia via,#oilandgas exploration takes seismic shift in #gabon to #somalia via,oilandgas exploration takes seismic shift in gabon to somalia via,oilandgas exploration takes seismic shift in gabon to somalia via,oilandgas exploration takes seismic shift gabon somalia via,oilandgas exploration takes seismic shift gabon somalia via,oilandgas exploration take seismic shift gabon somalia via
372,533,army,,Beyonce Is my pick for http://t.co/nnMQlz91o9 Fan Army #Beyhive http://t.co/o91f3cYy0R 66,0,army,,beyonce is my pick for http://t.co/nnmqlz91o9 fan army #beyhive http://t.co/o91f3cyy0r 66,beyonce is my pick for http://t.co/nnmqlz91o9 fan army #beyhive http://t.co/o91f3cyy0r 66,beyonce is my pick for fan army #beyhive 66,...,beyonce is my pick for fan army #beyhive 66,beyonce is my pick for fan army #beyhive 66,beyonce is my pick for fan army #beyhive 66,beyonce is my pick for fan army #beyhive 66,beyonce is my pick for fan army #beyhive 66,beyonce is my pick for fan army beyhive 66,beyonce is my pick for fan army beyhive ##,beyonce pick fan army beyhive ##,beyonce pick fan army beyhive ##,beyonce pick fan army beyhive ##


### Spell checker

# Combine keywords and text

In the dataset module, I will follow above preprocessing process

In [191]:
train_data['tweet'] = train_data['keyword_lem'] + ' ' + train_data['text_Lemmatised']

In [202]:
train_data[['tweet']].sample(10)

Unnamed: 0,tweet
4593,injury 4 common running injury avoid
677,blaze artisteoftheweekfact say conversation coast2coastdjs agree jiwonle hiphop clubbanger
5728,rescuer rescuer searching hundred migrant mediterranean boat carrying many ### people capsized coast ofu
1377,bush fires attack woman health attack america health hillary clinton show standwithpp
1397,casualty warfighting robot could reduce civilian casualty calling
6873,trauma ## faculty member pushing university overturn ban trauma center protester
1067,bomb guy scared show real name anyway know bomb
866,blood another day another excellent porridge seriously people blood orange porridge phenomenal
3271,engulfed tube strike live latest travel update london engulfed chaos genuine baffling telegraph headline
4066,forest fires forest fire could delay official say could good thing


## Word tokens of text

In [17]:
train_data['n_tokens'] = train_data['text'].apply(lambda x: len(word_tokenize(x)))

In [19]:
train_data['n_tokens'].describe()

count    7613.000000
mean       18.914226
std         6.871321
min         1.000000
25%        14.000000
50%        19.000000
75%        24.000000
max        72.000000
Name: n_tokens, dtype: float64

In [None]:
train_data['']