In [13]:
import os 
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.stem.porter import *
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from itertools import islice
from sklearn.naive_bayes import MultinomialNB
import pickle

In [14]:
nlp=pd.read_csv("E:/datasets/nlp-getting-started/train.csv")

In [15]:
nlp

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1
...,...,...,...,...,...
7608,10869,,,Two giant cranes holding a bridge collapse int...,1
7609,10870,,,@aria_ahrary @TheTawniest The out of control w...,1
7610,10871,,,M1.94 [01:04 UTC]?5km S of Volcano Hawaii. htt...,1
7611,10872,,,Police investigating after an e-bike collided ...,1


In [16]:
nlp.shape

(7613, 5)

In [17]:
nlp.isnull().sum()

id             0
keyword       61
location    2533
text           0
target         0
dtype: int64

In [18]:
nlp.keyword.value_counts()

fatalities               45
deluge                   42
armageddon               42
harm                     41
sinking                  41
                         ..
forest%20fire            19
epicentre                12
threat                   11
inundation               10
radiation%20emergency     9
Name: keyword, Length: 221, dtype: int64

In [19]:
stops = stopwords.words('english')

In [20]:
stops[0:10]

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're"]

In [21]:
##REGEX FOR DATE REMOVAL
#import re
txt='Monday, April, 4, 2016, 10:16:06'
re1='(Monday)'	# Day Of Week 1
re2='.*?'	# Non-greedy match on filler
re3='(April)'	# Month 1
re4='.*?'	# Non-greedy match on filler
re5='(4)'	# Day 1
re6='.*?'	# Non-greedy match on filler
re7='(2016)'	# Year 1
re8='.*?'	# Non-greedy match on filler
re9='(10:16:06)'	# HourMinuteSec 1

rg = re.compile(re1+re2+re3+re4+re5+re6+re7+re8+re9,re.IGNORECASE|re.DOTALL)
m = rg.search(txt)
if m:
    dayofweek1=m.group(1)
    month1=m.group(2)
    day1=m.group(3)
    year1=m.group(4)
    time1=m.group(5)

In [22]:
#import re
re1='(b)'	# Variable Name 1
re2='.*?'	# Non-greedy match on filler
re3='(4\\/04\\/2016)'	# DDMMYYYY 1

rd = re.compile(re1+re2+re3,re.IGNORECASE|re.DOTALL)
m = rg.search(txt)
if m:
    var1=m.group(1)
    ddmmyyyy1=m.group(2)

In [23]:
#import re
txt='\\\\r\\\\r____________________________\\\\r'

re1='(\\\\\\\\r\\\\\\\\r____________________________\\\\\\\\r)'	# Windows UNC 1

rt = re.compile(re1,re.IGNORECASE|re.DOTALL)
m = rg.search(txt)
if m:
    unc1=m.group(1)

In [24]:
#import nltk
def _remove_noise(input_text):
    input_text = str(input_text).encode('ascii', 'ignore')
    input_text = str(input_text).replace(",", "")
    input_text = str(input_text).replace("\'\\", "")
    input_text = re.sub(rg, ' ', input_text)
    #input_text = re.sub([[:punct:]], '', input_text) -- this is a step in R 
    input_text = re.sub(rd, ' ', input_text)
    input_text = re.sub(rt, ' ', input_text)
    words = str(input_text).split()
    pos_words = nltk.pos_tag(words)
    noise_free_words = [i[0] for i in pos_words if i[1] in ('NN')]
    noise_free_words = [word for word in noise_free_words if word.lower() not in stops]
    return noise_free_words

In [25]:
nlp["cleaned"] = nlp.text.apply(_remove_noise)
nlp.head()

Unnamed: 0,id,keyword,location,text,target,cleaned
0,1,,,Our Deeds are the Reason of this #earthquake M...,1,"[b'Our, #earthquake]"
1,4,,,Forest fire near La Ronge Sask. Canada,1,[fire]
2,5,,,All residents asked to 'shelter in place' are ...,1,"[b""All, place', officers., evacuation, shelter..."
3,6,,,"13,000 people receive #wildfires evacuation or...",1,[b'13000]
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1,"[b'Just, photo, smoke, school]"


In [26]:
#from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()
nlp['stemmed'] = nlp.cleaned.map(lambda x: ' '.join([lemmatizer.lemmatize(y) for y in x]))
nlp.stemmed.head()

0                                  b'Our #earthquake
1                                               fire
2    b"All place' officers. evacuation shelter place
3                                            b'13000
4                          b'Just photo smoke school
Name: stemmed, dtype: object

In [27]:
cvec = CountVectorizer(stop_words= stops, min_df=1, max_df=.5, ngram_range=(1,2))


In [28]:
#from itertools import islice
cvec.fit(nlp.stemmed)

list(islice(cvec.vocabulary_.items(), 20))

[('earthquake', 11394),
 ('fire', 12691),
 ('place', 21132),
 ('officers', 20143),
 ('evacuation', 11939),
 ('shelter', 23904),
 ('place officers', 21144),
 ('officers evacuation', 20144),
 ('evacuation shelter', 11955),
 ('shelter place', 23906),
 ('13000', 132),
 ('photo', 20990),
 ('smoke', 24357),
 ('school', 23522),
 ('photo smoke', 21007),
 ('smoke school', 24371),
 ('rockyfire', 23008),
 ('cafire', 3954),
 ('wildfires', 28357),
 ('rockyfire fire', 23010)]

In [29]:
len(cvec.vocabulary_)

29564

In [30]:
cvec = CountVectorizer(stop_words=stops, min_df=.001, max_df=.99, ngram_range=(1,2))
cvec.fit(nlp.stemmed)
len(cvec.vocabulary_)

676

In [31]:
cvec_counts = cvec.transform(nlp.stemmed)
print ('sparse matrix shape:', cvec_counts.shape)
print ('nonzero count:', cvec_counts.nnz)
print ('sparsity: %.2f%%' % (100.0 * cvec_counts.nnz / (cvec_counts.shape[0] * cvec_counts.shape[1])))

sparse matrix shape: (7613, 676)
nonzero count: 23486
sparsity: 0.46%


In [32]:
transformer = TfidfTransformer()
transformed_weights = transformer.fit_transform(cvec_counts)
transformed_weights

<7613x676 sparse matrix of type '<class 'numpy.float64'>'
	with 23486 stored elements in Compressed Sparse Row format>

In [33]:
weights = np.asarray(transformed_weights.mean(axis=0)).ravel().tolist()
weights_df = pd.DataFrame({'term': cvec.get_feature_names(), 'weight': weights})
weights_df.sort_values(by='weight', ascending=False).head(20)

Unnamed: 0,term,weight
124,co,0.143565
310,http,0.14121
311,http co,0.141135
312,https,0.022024
313,https co,0.022024
235,fire,0.012091
630,video,0.008427
177,disaster,0.007631
589,time,0.0072
158,day,0.006605


In [34]:
target = nlp["target"]

In [35]:
nb = MultinomialNB()

In [36]:
nb.fit(transformed_weights, target)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [37]:
nlp1=pd.read_csv("E:/datasets/nlp-getting-started/test.csv")
nlp2=pd.read_csv("E:/datasets/nlp-getting-started/sample_submission.csv")

In [38]:
nlp3=pd.merge(nlp1,nlp2,how='inner')
nlp3.head()

Unnamed: 0,id,keyword,location,text,target
0,0,,,Just happened a terrible car crash,0
1,2,,,"Heard about #earthquake is different cities, s...",0
2,3,,,"there is a forest fire at spot pond, geese are...",0
3,9,,,Apocalypse lighting. #Spokane #wildfires,0
4,11,,,Typhoon Soudelor kills 28 in China and Taiwan,0


In [39]:
nlp3["cleaned"] = nlp3.text.apply(_remove_noise)
nlp3.head()

Unnamed: 0,id,keyword,location,text,target,cleaned
0,0,,,Just happened a terrible car crash,0,"[b'Just, car, crash']"
1,2,,,"Heard about #earthquake is different cities, s...",0,"[b'Heard, #earthquake, everyone.']"
2,3,,,"there is a forest fire at spot pond, geese are...",0,"[fire, spot, pond, geese, street]"
3,9,,,Apocalypse lighting. #Spokane #wildfires,0,"[b'Apocalypse, #wildfires']"
4,11,,,Typhoon Soudelor kills 28 in China and Taiwan,0,[b'Typhoon]


In [40]:
#from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()
nlp3['stemmed'] = nlp3.cleaned.map(lambda x: ' '.join([lemmatizer.lemmatize(y) for y in x]))
nlp3.stemmed.head()

0                 b'Just car crash'
1    b'Heard #earthquake everyone.'
2       fire spot pond goose street
3          b'Apocalypse #wildfires'
4                         b'Typhoon
Name: stemmed, dtype: object

In [41]:
cvec1 = CountVectorizer(stop_words= stops, min_df=0.001, max_df=0.9, ngram_range=(1,2))

In [42]:
cvec1.fit(nlp3.stemmed)
list(islice(cvec1.vocabulary_.items(), 20))

[('car', 98),
 ('crash', 132),
 ('earthquake', 185),
 ('everyone', 199),
 ('fire', 222),
 ('spot', 514),
 ('street', 530),
 ('apocalypse', 25),
 ('typhoon', 587),
 ('life', 320),
 ('yesterday', 641),
 ('fuck', 236),
 ('http', 285),
 ('co', 115),
 ('fire http', 223),
 ('http co', 286),
 ('ablaze', 5),
 ('crisis', 136),
 ('check', 108),
 ('world', 630)]

In [43]:
cvec1 = CountVectorizer(stop_words=stops, min_df=.001, max_df=.8, ngram_range=(1,2))
cvec1.fit(nlp3.stemmed)
len(cvec1.vocabulary_)

644

In [44]:
cvec_counts1 = cvec.transform(nlp3.stemmed)
print ('sparse matrix shape:', cvec_counts1.shape)
print ('nonzero count:', cvec_counts1.nnz)
print ('sparsity: %.2f%%' % (100.0 * cvec_counts1.nnz / (cvec_counts1.shape[0] * cvec_counts1.shape[1])))

sparse matrix shape: (3263, 676)
nonzero count: 9728
sparsity: 0.44%


In [45]:
transformer1 = TfidfTransformer()
transformed_weights1 = transformer.fit_transform(cvec_counts1)
transformed_weights1

<3263x676 sparse matrix of type '<class 'numpy.float64'>'
	with 9728 stored elements in Compressed Sparse Row format>

In [46]:
preds = nb.predict(transformed_weights1)

In [47]:
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score

In [48]:
confusion_matrix(nlp3["target"], preds)

array([[2434,  829],
       [   0,    0]], dtype=int64)

In [49]:
accuracy_score(nlp3["target"], preds, normalize=True)

0.7459393196444989