In [70]:
# IPython.display is the public import path for these helpers; importing them
# from IPython.core.display is deprecated.
from IPython.display import display, HTML
# Inject CSS that stretches the notebook's .container element to the full page width.
display(HTML("<style>.container { width:100% !important; }</style>"))
import matplotlib.pyplot as plt
import os
import pandas as pd
import numpy as np
import seaborn as sns
from sklearn.feature_extraction.text import TfidfVectorizer
from collections import Counter
# Print full cell contents instead of truncating long text columns.
# pandas expects None for "no limit"; the old -1 sentinel was deprecated in
# pandas 1.0 and is rejected by later releases.
pd.set_option('display.max_colwidth', None)

In [71]:
%%javascript
// Set the output area's auto-scroll threshold to 99999 — presumably so long outputs
// are rendered in full rather than inside a scroll box (TODO confirm for the front end in use).
IPython.OutputArea.auto_scroll_threshold = 99999

<IPython.core.display.Javascript object>

In [3]:
# Load the three input files in one pass: the training frame, the test frame,
# and the sample submission template.
df_train, df_test, df_sub = (
    pd.read_csv(csv_name)
    for csv_name in ("train.csv", "test.csv", "sample_submission.csv")
)

In [4]:
# Preview the first rows of the sample submission frame read from sample_submission.csv.
df_sub.head()

Unnamed: 0,id,target
0,0,0
1,2,0
2,3,0
3,9,0
4,11,0


In [7]:
# Tally how often each label occurs in the submission's target column and
# print the labels from most to least frequent.
counts = Counter()
counts.update(df_sub["target"].values)
print(counts.most_common())

[(0, 3263)]


In [25]:
# Baseline: fill the submission with uniformly random 0/1 labels to see what
# score pure guessing achieves.
# The previous loop called random.randint, but `random` is not imported in the
# notebook's import cell, so it failed with NameError on a fresh kernel. numpy
# (already imported as np) draws the whole column in one call, and the fixed
# seed makes the generated submission reproducible across runs.
rng = np.random.default_rng(42)
# The upper bound of integers() is exclusive, so the values are 0 or 1.
tar = rng.integers(0, 2, size=df_sub.shape[0]).tolist()
df_sub["target"] = tar

In [27]:
# Preview df_sub again to check the target column after it was overwritten with random labels.
df_sub.head()

Unnamed: 0,id,target
0,0,1
1,2,1
2,3,1
3,9,0
4,11,1


In [29]:
# (rows, columns) of the submission frame.
df_sub.shape

(3263, 2)

In [30]:
# Write the randomly labelled submission to disk; index=False leaves the row index out of the file.
df_sub.to_csv("submission_random.csv", index=False)

In [None]:
#the random-target submission scored 0.51022, placing 909th out of 919

In [31]:
# Preview the first rows of the training frame.
df_train.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [32]:
# (rows, columns) of the training frame.
df_train.shape

(7613, 5)

In [33]:
# (rows, columns) of the test frame.
df_test.shape

(3263, 4)

In [34]:
# Column labels of the training frame.
df_train.columns

Index(['id', 'keyword', 'location', 'text', 'target'], dtype='object')

In [35]:
# Column labels of the test frame, for comparison with the training frame.
df_test.columns

Index(['id', 'keyword', 'location', 'text'], dtype='object')

In [36]:
# Number of missing values in each column of the training frame.
df_train.isnull().sum()

id             0
keyword       61
location    2533
text           0
target         0
dtype: int64

In [37]:
# The training frame has many missing locations and a few missing keywords;
# count missing values per column in the test frame to compare.
df_test.isnull().sum()

id             0
keyword       26
location    1105
text           0
dtype: int64

In [38]:
# Show the first 10 rows of the training frame.
df_train.head(10)

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1
5,8,,,#RockyFire Update => California Hwy. 20 closed...,1
6,10,,,#flood #disaster Heavy rain causes flash flood...,1
7,13,,,I'm on top of the hill and I can see a fire in...,1
8,14,,,There's an emergency evacuation happening now ...,1
9,15,,,I'm afraid that the tornado is coming to our a...,1


In [43]:
# Split the training frame by label so rows with target 1 and rows with
# target 0 can be examined separately.
df_train_pos = df_train.loc[df_train["target"].eq(1)]
df_train_neg = df_train.loc[df_train["target"].eq(0)]

In [44]:
# (rows, columns) of the target-1 subset.
# NOTE(review): the recorded output has 6 columns although df_train has 5 when loaded;
# presumably keyword_found had already been added when this last ran — confirm with Restart & Run All.
df_train_pos.shape

(3271, 6)

In [45]:
# (rows, columns) of the target-0 subset.
# NOTE(review): the recorded output has 6 columns although df_train has 5 when loaded;
# presumably keyword_found had already been added when this last ran — confirm with Restart & Run All.
df_train_neg.shape

(4342, 6)

In [101]:
# Hashtags used by default to decide whether a text mentions a disaster.
_DEFAULT_DISASTER_HASHTAGS = (
    "#volcano",
    "#earthquake",
    "#quake",
    "#hurricane",
    "#tornado",
    "#accident",
)


def check_keyword(text, keywords=None):
    """Return True if ``text`` contains any of ``keywords``, ignoring case.

    Matching is by substring, so "#EarthquakeNews" matches "#earthquake".

    Parameters
    ----------
    text : str
        The text to search. Any non-string value (for example a NaN from a
        missing cell) is treated as containing no keyword.
    keywords : iterable of str, optional
        Terms to look for. Defaults to the built-in disaster hashtag list.

    Returns
    -------
    bool
        True when at least one keyword occurs in ``text``, otherwise False.
    """
    # Missing values reach .apply() as float NaN; `in` on a float would raise TypeError.
    if not isinstance(text, str):
        return False
    if keywords is None:
        keywords = _DEFAULT_DISASTER_HASHTAGS
    # Lower-case the text once instead of once per keyword.
    lowered_text = text.lower()
    return any(word.lower() in lowered_text for word in keywords)

In [102]:
# Flag each training row whose text passes check_keyword and store the result
# as a new boolean column.
df_train["keyword_found"] = df_train["text"].map(check_keyword)

In [103]:
# Size of the subset where a keyword was found and the row is labelled target 1.
df_train.loc[df_train["keyword_found"] & df_train["target"].eq(1)].shape

(28, 6)

In [104]:
# Read the text of the flagged rows labelled target 1.
df_train.loc[df_train["keyword_found"] & df_train["target"].eq(1), "text"].values

array(['Our Deeds are the Reason of this #earthquake May ALLAH Forgive us all',
       '#Horrible #Accident Man Died In Wings Airplane (29-07-2015) #WatchTheVideo http://t.co/p64xRVgJIk',
       'This is unbelievably insane.\n#man #airport #airplane #aircraft #aeroplane #runway #accident #freaky\x89Û_ https://t.co/cezhq7CzLl',
       'Pilot Dies In Plane Crash At Car Festival https://t.co/kQ9aE6AP2B via @YouTube #Crash #Aircraft #Airplane #Pilot #Death #Accident #CarFest',
       "#OMG! I don't believe this. #RIP bro\n#AirPlane #Accident #JetEngine #TurboJet #Boing #G90 http://t.co/KXxnSZp6nk",
       '#Earthquake #Sismo M 1.4 - 4km E of Interlaken California: Time2015-08-06 00:52:25 UTC2015-08-05 17:52:25 -07... http://t.co/wA5C77F8vQ',
       '1.43 earthquake occurred near Mount St. Helens area Washington at 09:36 UTC! #earthquake http://t.co/2xMdiDGpnr',
       'Contruction upgrading ferries to earthquake standards in Vashon Mukilteo - Q13 FOX http://t.co/E981DgSkab #EarthquakeNews'

In [105]:
# Same check for target 0: size of the subset where a keyword was found but the row is labelled 0.
# NOTE(review): an earlier note here quoted 41 target-1 matches, which disagrees with the 28
# recorded for the target-1 cell — presumably left over from a run with a different keyword list.
df_train.loc[df_train["keyword_found"] & df_train["target"].eq(0)].shape

(2, 6)

In [84]:
# Some rows contain a keyword yet are labelled target 0; read their text to see why.
# NOTE(review): an earlier note here quoted 6 such rows while the recorded shape shows 2 —
# presumably the recorded text output predates the current keyword list; re-run to confirm.
df_train.loc[df_train["keyword_found"] & df_train["target"].eq(0), "text"].values

array(["Posted a new song: 'Earthquake' http://t.co/RfTyyZ4GwJ http://t.co/lau0Ay7ahV",
       "@AGeekyFangirl14 's things she looks in a significant other:\n1. Beautiful eyes.\n2. Humor.\n3. Farts that creates an earthquake.\n\n????????",
       'Earthquake drill ??????',
       '@freefromwolves GodsLove &amp; #thankU brother Danny for RT of NEW VIDEO http://t.co/cybKsXHF7d The Coming Apocalyptic US Earthquake &amp; Tsunami',
       '@GreenLacey GodsLove &amp; #thankU my sister for RT of NEW VIDEO http://t.co/cybKsXHF7d The Coming Apocalyptic US Earthquake &amp; Tsunami',
       'Maailiss: Diaporama : sixpenceee: Karymsky Lake is a crater lake located in the Karymsky volcanoåÊinåÊRussia. With a\x89Û_ http://t.co/4o460Fm8HN',
       'http://t.co/Ns1AgGFNxz #shoes Asics GT-II Super Red 2.0 11 Ronnie Fieg Kith Red White 3M x gel grey volcano 2 http://t.co/oD250zshFy',
       'Diaporama : sixpenceee: Karymsky Lake is a crater lake located in the Karymsky volcanoåÊinåÊRussia. With a... htt

In [50]:
#these tweets mention an earthquake but do not refer to an actual earthquake event (some are about a video,
#some about a song); let's come back to this later

In [62]:
# Collect the distinct keyword values found on target-1 rows, skipping rows with no keyword.
keywords_pos = list(set(df_train_pos["keyword"].dropna().values))

In [63]:
# Order the keywords alphabetically so the list is easy to scan, then display it.
keywords_pos = sorted(keywords_pos)
keywords_pos

['ablaze',
 'accident',
 'airplane%20accident',
 'ambulance',
 'annihilated',
 'annihilation',
 'apocalypse',
 'armageddon',
 'army',
 'arson',
 'arsonist',
 'attack',
 'attacked',
 'avalanche',
 'battle',
 'bioterror',
 'bioterrorism',
 'blaze',
 'blazing',
 'bleeding',
 'blew%20up',
 'blight',
 'blizzard',
 'blood',
 'bloody',
 'blown%20up',
 'body%20bag',
 'body%20bagging',
 'body%20bags',
 'bomb',
 'bombed',
 'bombing',
 'bridge%20collapse',
 'buildings%20burning',
 'buildings%20on%20fire',
 'burned',
 'burning',
 'burning%20buildings',
 'bush%20fires',
 'casualties',
 'casualty',
 'catastrophe',
 'catastrophic',
 'chemical%20emergency',
 'cliff%20fall',
 'collapse',
 'collapsed',
 'collide',
 'collided',
 'collision',
 'crash',
 'crashed',
 'crush',
 'crushed',
 'curfew',
 'cyclone',
 'damage',
 'danger',
 'dead',
 'death',
 'deaths',
 'debris',
 'deluge',
 'deluged',
 'demolish',
 'demolished',
 'demolition',
 'derail',
 'derailed',
 'derailment',
 'desolate',
 'desolation',
 'de

In [59]:
# "volcano" looks related to earthquakes, so it is a candidate for the keyword list.
# Before adding it, list the distinct keywords found on target-0 rows (missing
# keywords skipped), sorted alphabetically.
keywords_neg = sorted(set(df_train_neg["keyword"].dropna().values))
keywords_neg

['ablaze',
 'accident',
 'aftershock',
 'airplane%20accident',
 'ambulance',
 'annihilated',
 'annihilation',
 'apocalypse',
 'armageddon',
 'army',
 'arson',
 'arsonist',
 'attack',
 'attacked',
 'avalanche',
 'battle',
 'bioterror',
 'bioterrorism',
 'blaze',
 'blazing',
 'bleeding',
 'blew%20up',
 'blight',
 'blizzard',
 'blood',
 'bloody',
 'blown%20up',
 'body%20bag',
 'body%20bagging',
 'body%20bags',
 'bomb',
 'bombed',
 'bombing',
 'bridge%20collapse',
 'buildings%20burning',
 'buildings%20on%20fire',
 'burned',
 'burning',
 'burning%20buildings',
 'bush%20fires',
 'casualties',
 'casualty',
 'catastrophe',
 'catastrophic',
 'chemical%20emergency',
 'cliff%20fall',
 'collapse',
 'collapsed',
 'collide',
 'collided',
 'collision',
 'crash',
 'crashed',
 'crush',
 'crushed',
 'curfew',
 'cyclone',
 'damage',
 'danger',
 'dead',
 'death',
 'deaths',
 'deluge',
 'deluged',
 'demolish',
 'demolished',
 'demolition',
 'derail',
 'derailed',
 'desolate',
 'desolation',
 'destroy',
 'd

In [64]:
#keywords that appear in keywords_pos but not in keywords_neg
np.setdiff1d(keywords_pos, keywords_neg)

array(['debris', 'derailment', 'wreckage'], dtype='<U21')

In [74]:
# The keywords appear to be extracted from the tweet itself, so view text, keyword
# and location side by side for the first 20 target-1 rows that have a keyword.
df_train_pos.loc[df_train_pos["keyword"].notna(), ["text", "keyword", "location"]].head(20)

Unnamed: 0,text,keyword,location
31,@bbcmtd Wholesale Markets ablaze http://t.co/lHYXEOHY6C,ablaze,Birmingham
33,#AFRICANBAZE: Breaking news:Nigeria flag set ablaze in Aba. http://t.co/2nndBGwyEi,ablaze,AFRICA
37,INEC Office in Abia Set Ablaze - http://t.co/3ImaomknnA,ablaze,World Wide!!
38,Barbados #Bridgetown JAMAICA ÛÒ Two cars set ablaze: SANTA CRUZ ÛÓ Head of the St Elizabeth Police Superintende... http://t.co/wDUEaj8Q4J,ablaze,
46,How the West was burned: Thousands of wildfires ablaze in California alone http://t.co/vl5TBR3wbr,ablaze,"GREENSBORO,NORTH CAROLINA"
50,Deputies: Man shot before Brighton home set ablaze http://t.co/gWNRhMSO8k,ablaze,"Sheffield Township, Ohio"
51,Man wife get six years jail for setting ablaze niece\nhttp://t.co/eV1ahOUCZA,ablaze,India
53,Police: Arsonist Deliberately Set Black Church In North CarolinaåÊAblaze http://t.co/pcXarbH9An,ablaze,Anaheim
55,#Kurds trampling on Turkmen flag later set it ablaze while others vandalized offices of Turkmen Front in #Diyala http://t.co/4IzFdYC3cg,ablaze,USA
56,TRUCK ABLAZE : R21. VOORTREKKER AVE. OUTSIDE OR TAMBO INTL. CARGO SECTION. http://t.co/8kscqKfKkF,ablaze,South Africa
