In [1]:
import numpy as np
import pandas as pd

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from nltk.corpus import stopwords
import string
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
from nltk import pos_tag
from nltk.tokenize import sent_tokenize, word_tokenize



# Preparing training data

In [3]:
df_train = pd.read_csv(r'G:\Coding ninjas\Kaggle projects & submissions\Disaster tweets\train.csv')
df_train

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1
...,...,...,...,...,...
7608,10869,,,Two giant cranes holding a bridge collapse int...,1
7609,10870,,,@aria_ahrary @TheTawniest The out of control w...,1
7610,10871,,,M1.94 [01:04 UTC]?5km S of Volcano Hawaii. htt...,1
7611,10872,,,Police investigating after an e-bike collided ...,1


In [5]:
Xtrain = df_train['text']
Ytrain = df_train['target']

In [7]:
Xd_tr = [word_tokenize(i) for i in Xtrain]
Xd_tr[0]

['Our',
 'Deeds',
 'are',
 'the',
 'Reason',
 'of',
 'this',
 '#',
 'earthquake',
 'May',
 'ALLAH',
 'Forgive',
 'us',
 'all']

In [8]:
stop = set(stopwords.words('english'))
punct = list(string.punctuation)
stop.update(punct)

In [9]:
lem = WordNetLemmatizer()


def get_simple_pos(tag):
    if tag.startswith('J'):
        return wordnet.ADJ
    
    elif tag.startswith('N'):
        return wordnet.NOUN
    
    elif tag.startswith('V'):
        return wordnet.VERB
    
    elif tag.startswith('R'):
        return wordnet.ADV
    
    else:
        return wordnet.NOUN
    
    
def clean(words):
    output = []
    
    for w in words:
        if w.lower() not in stop:
            pos = pos_tag([w])
            
            clean_word = lem.lemmatize(w, get_simple_pos(pos[0][1]))
            
            output.append(clean_word.lower())
            
    return output



In [11]:
Xc_tr = [clean(comment) for comment in Xd_tr]
Xc_tr[0]

['deeds', 'reason', 'earthquake', 'may', 'allah', 'forgive', 'u']

In [13]:
Xf_tr = [' '.join(comment) for comment in Xc_tr]
Xf_tr[0]

'deeds reason earthquake may allah forgive u'

# Preparing testing data

In [14]:
df_test = pd.read_csv(r'G:\Coding ninjas\Kaggle projects & submissions\Disaster tweets\test.csv')
df_test

Unnamed: 0,id,keyword,location,text
0,0,,,Just happened a terrible car crash
1,2,,,"Heard about #earthquake is different cities, s..."
2,3,,,"there is a forest fire at spot pond, geese are..."
3,9,,,Apocalypse lighting. #Spokane #wildfires
4,11,,,Typhoon Soudelor kills 28 in China and Taiwan
...,...,...,...,...
3258,10861,,,EARTHQUAKE SAFETY LOS ANGELES ÛÒ SAFETY FASTE...
3259,10865,,,Storm in RI worse than last hurricane. My city...
3260,10868,,,Green Line derailment in Chicago http://t.co/U...
3261,10874,,,MEG issues Hazardous Weather Outlook (HWO) htt...


In [15]:
Xtest = df_test['text']

In [16]:
ids = df_test['id']

In [17]:
Xd_te = [word_tokenize(i) for i in Xtest]

In [18]:
Xc_te = [clean(comment) for comment in Xd_te]


In [19]:
Xf_te = [' '.join(comment) for comment in Xc_te]

# Building a model

In [20]:
vec = CountVectorizer()
Xf_tr_s = vec.fit_transform(Xf_tr)

In [21]:
Xf_te_s = vec.transform(Xf_te) 

In [22]:
clf = MultinomialNB()
clf.fit(Xf_tr_s, Ytrain)

In [23]:
clf.score(Xf_tr_s, Ytrain)

0.9185603572835939

In [24]:
Ypred = clf.predict(Xf_te_s)

In [25]:
Ypred

array([1, 1, 1, ..., 1, 1, 1], dtype=int64)

In [26]:
df_pred = pd.DataFrame({'id':ids, 'target':Ypred})

In [27]:
df_pred

Unnamed: 0,id,target
0,0,1
1,2,1
2,3,1
3,9,1
4,11,1
...,...,...
3258,10861,1
3259,10865,1
3260,10868,1
3261,10874,1


In [28]:
df_pred.to_csv('Disaster_tweets_prediction.csv', index = False)