In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found beneath it.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for path in (os.path.join(root, name) for name in names):
        print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
from __future__ import print_function, division
from future.utils import iteritems
from builtins import range
import nltk
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from collections import defaultdict
from collections import  Counter
import seaborn as sns
from sklearn.utils import shuffle

from nltk.stem import WordNetLemmatizer
from sklearn.linear_model import LogisticRegression
from nltk.corpus import stopwords
import re
from wordcloud import WordCloud

# import warnings filter
from warnings import simplefilter
# ignore all future warnings
simplefilter(action='ignore', category=FutureWarning)
import math
import string
from sklearn.metrics import confusion_matrix

# Single shared lemmatizer instance; remove_stopwords() uses it to reduce tokens to their base form.
wordnet_lemmatizer = WordNetLemmatizer()

In [None]:
# Load the competition CSV files from the Kaggle input directory.
train = pd.read_csv('../input/nlp-getting-started/train.csv')
test = pd.read_csv('../input/nlp-getting-started/test.csv')
# Submission template: its 'target' column is overwritten with model predictions before export.
submission_label = pd.read_csv('../input/nlp-getting-started/sample_submission.csv')

In [None]:
# Report (rows, columns) for the train and test frames.
for template, frame in (('Trainning Set Shape={}', train), ('Testing Set Shape={}', test)):
    print(template.format(frame.shape))

## Data quality check: class balance and missing values

In [None]:
train['target'].value_counts(normalize=True) # proportion of disaster (1) and non-disaster (0) tweets in the training set

In [None]:
train.isnull().sum() * 100 / len(train) # Percentage of null values per column in the train set

In [None]:
test.isnull().sum() * 100 / len(test) # Percentage of null values per column in the test set

## Distribution

In [None]:
# Bar chart of how many training tweets fall into each target class.
x = train.target.value_counts()
# seaborn >= 0.12 only accepts the data vectors as keyword arguments; the positional form raises a TypeError.
# Plain values are passed for y so that no index alignment is involved.
sns.barplot(x=x.index, y=x.values)
plt.gca().set_ylabel('samples')

# Text Cleaning
#### This dataset requires a lot of cleaning: punctuation, URLs, special characters, stop words, etc. all have to be removed.

## Creating corpus 

In [None]:
def create_corpus(target):
    """
    Collect every whitespace-separated word, lower-cased, from the training tweets
    whose 'target' label equals `target` (0 or 1). Words are returned as one flat list,
    in tweet order, with duplicates kept.
    """
    matching_text = train.loc[train['target'] == target, 'text']
    return [token.lower() for words in matching_text.str.split() for token in words]

### Let's look at common stopwords and their frequencies

In [None]:
# English stop words from NLTK, held in a set for fast membership tests.
stop = set(stopwords.words('english'))

# Count how often each stop word occurs in the non-disaster (target = 0) tweets.
corpus = create_corpus(0)
dic = defaultdict(int)
for stop_word in filter(stop.__contains__, corpus):
    dic[stop_word] += 1

# Keep the ten most frequent stop words as (word, count) pairs, highest count first.
top = sorted(dic.items(), key=lambda pair: pair[1], reverse=True)[:10]

In [None]:
# Split the (word, count) pairs into parallel tuples and draw them on the current axes.
x, y = zip(*top)
ax = plt.gca()
ax.bar(x, y)
ax.set_xlabel('Non Disaster Stop Words')
ax.set_ylabel('Count')
ax.set_title("Count Vs Stop Words")

In [None]:
# Count how often each stop word occurs in the disaster (target = 1) tweets.
corpus = create_corpus(1)
dic = defaultdict(int)
for stop_word in filter(stop.__contains__, corpus):
    dic[stop_word] += 1

# Keep the ten most frequent stop words as (word, count) pairs, highest count first.
top = sorted(dic.items(), key=lambda pair: pair[1], reverse=True)[:10]

In [None]:
# Bar chart of the ten most frequent stop words in disaster tweets.
x,y=zip(*top)
# Draw the bars once; the duplicated plt.bar call only painted the same bars a second time.
plt.bar(x,y)
plt.xlabel('Disaster Stop Words')
plt.ylabel('Count')
plt.title("Count Vs Stop Words")

## Most Common words

In [None]:
# Top 20 most common non-stop-words for target = 0 (no disaster)
corpus = create_corpus(0)
counter = Counter(corpus)
most = counter.most_common()  # every (word, count) pair, most frequent first

# Drop stop words, then keep the first 20 survivors.
top_non_stop = [(word, count) for word, count in most if word not in stop][:20]
x = [word for word, _ in top_non_stop]
y = [count for _, count in top_non_stop]

# Horizontal bars: counts along the x axis, words along the y axis.
sns.barplot(x=y, y=x)
plt.xlabel('Count')
plt.ylabel('words')
plt.title("Words vs Count")

In [None]:
# Top 20 most common non-stop-words for target = 1 (disaster)
corpus = create_corpus(1)
counter = Counter(corpus)
most = counter.most_common()  # every (word, count) pair, most frequent first

# Drop stop words, then keep the first 20 survivors.
top_non_stop = [(word, count) for word, count in most if word not in stop][:20]
x = [word for word, _ in top_non_stop]
y = [count for _, count in top_non_stop]

# Horizontal bars: counts along the x axis, words along the y axis.
sns.barplot(x=y, y=x)
plt.xlabel('Count')
plt.ylabel('words')
plt.title("Words vs Count")

In [None]:
# Stack train on top of test so that both receive identical cleaning. Row order is train first,
# then test. The index is not reset here, so index labels repeat; later cells rely on
# positional access (iloc / array slicing) rather than labels.
tweet = pd.concat([train, test], sort=False) #Combining train and test set for cleaning purpose

In [None]:
#Removing URL
def remove_URL(text):
    """
    Strip web links from `text`: any run starting with http://, https:// or www.
    and continuing up to the next whitespace is replaced by the empty string.
    """
    return re.sub(r'https?://\S+|www\.\S+', r'', text)

#Remove Html Tags
def remove_html(text):
    """
    Delete every `<...>` tag from `text` (shortest match between '<' and '>' on a single line),
    keeping the text between tags.
    """
    return re.sub(r'<.*?>', r'', text)

#Removing Emojis
def remove_emoji(text):
    """
    Remove emoji and pictographic symbols from `text`, leaving letters of every script untouched.

    The earlier pattern included the range U+24C2-U+1F251, which spans most of the Basic
    Multilingual Plane and therefore also deleted CJK, Hangul and other ordinary text.
    That catch-all range is replaced here by the specific symbol blocks it was meant to cover.

    Parameters
    ----------
    text : str
        Input string.

    Returns
    -------
    str
        `text` with every run of the listed symbol characters removed.
    """
    emoji_pattern = re.compile("["
                           u"\U0001F600-\U0001F64F"  # emoticons
                           u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                           u"\U0001F680-\U0001F6FF"  # transport & map symbols
                           u"\U0001F000-\U0001F251"  # tiles, playing cards, enclosed characters, flags
                           u"\u2600-\u27BF"          # miscellaneous symbols and dingbats
                           u"\u24C2"                 # circled M
                           u"\u25A0-\u25FF"          # geometric shapes
                           u"\u2B00-\u2BFF"          # miscellaneous symbols and arrows
                           u"\u3030\u303D\u3297\u3299"  # wavy dash, part alternation mark, circled ideographs
                           u"\uFE00-\uFE0F"          # variation selectors that trail emoji
                           "]+", flags=re.UNICODE)
    return emoji_pattern.sub(r'', text)

#Removing punctuations
def remove_punct(text):
    """Return `text` with every ASCII punctuation character (those in string.punctuation) dropped."""
    return ''.join(ch for ch in text if ch not in string.punctuation)

#Removing stop words
def remove_stopwords(text):
    """
    Normalise a tweet and return it as a single space-joined string.

    Steps: lower-case, tokenise with NLTK, drop tokens of two characters or fewer,
    drop English stop words, lemmatise the survivors, and drop any lemma that is
    itself a stop word.

    Stop words are filtered BEFORE lemmatisation as well as after. The lemmatiser is
    called without a part-of-speech hint, so it can mangle some stop words into forms
    that are no longer in the stop list (e.g. "was" -> "wa"); filtering only after
    lemmatising let those through.

    Parameters
    ----------
    text : str
        Tweet text (already stripped of URLs, HTML, emoji and punctuation by the caller).

    Returns
    -------
    str
        Cleaned tokens joined by single spaces; empty string if nothing survives.
    """
    kept = []
    for tok in nltk.tokenize.word_tokenize(text.lower()):
        # Short tokens are probably not useful; stop words are removed in their original form.
        if len(tok.strip()) <= 2 or tok in stop:
            continue
        lemma = wordnet_lemmatizer.lemmatize(tok)  # put the word into its base form
        if lemma not in stop:
            kept.append(lemma)
    return ' '.join(kept)

In [None]:
# Run the cleaning steps over the tweet text in order; each step consumes the previous step's output.
for clean_step in (remove_URL, remove_html, remove_emoji, remove_punct, remove_stopwords):
    tweet['text'] = tweet['text'].apply(clean_step)

## Word Index Map

In [None]:
# Next free column index to hand out; advanced by wordIndexMap() each time a new word is seen.
current_index = 0
# Maps each unique token to its column index in the feature vectors; filled in place by wordIndexMap().
word_index_map = {} # dictionary to store each word and its index

In [None]:
def wordIndexMap(text, current_index):
    """
    Register every previously unseen token of `text` in the module-level `word_index_map`.

    Each new token receives the next free index, starting at `current_index`.
    Returns the next free index after processing, to be passed into the following call.
    """
    for word in nltk.tokenize.word_tokenize(text):
        if word in word_index_map:
            continue  # already has an index
        word_index_map[word] = current_index
        current_index += 1
    return current_index

In [None]:
# Build the vocabulary over every cleaned tweet (train and test rows alike, since `tweet` holds both).
text_list = tweet['text'].tolist()
for each_text in text_list:
    # Thread the running index through so that indices stay contiguous across tweets.
    current_index = wordIndexMap(each_text, current_index)

In [None]:
len(word_index_map) # vocabulary size: number of unique tokens across train and test (the author's run reported 20625; the value depends on the cleaning steps)

# Converting words to vector

### Convert Text to tokens

In [None]:
def to_tokens(text):
    """Split a sentence into a list of tokens using NLTK's word tokenizer."""
    return nltk.tokenize.word_tokenize(text)

In [None]:
# Replace each cleaned tweet string with its list of tokens.
tweet['text'] = tweet['text'].apply(to_tokens)

In [None]:
def words_to_vector(tokens, i=None, index_map=None):
    """
    Turn a tokenised tweet into a term-frequency vector.

    The vector has one entry per vocabulary word. Each entry is
    (occurrences of that word in the tweet) / (number of in-vocabulary tokens in the tweet).

    Parameters
    ----------
    tokens : iterable of str
        Tokens of one tweet. Tokens missing from the vocabulary are ignored.
    i : any, optional
        Unused. Kept so existing calls of the form words_to_vector(tokens, i) keep working;
        the original body overwrote it immediately.
    index_map : dict, optional
        Mapping token -> vector position. Defaults to the module-level `word_index_map`.

    Returns
    -------
    numpy.ndarray
        1-D float vector of length len(index_map). All zeros when no token is in the
        vocabulary (e.g. the tweet became empty after cleaning); otherwise it sums to 1.
    """
    if index_map is None:
        index_map = word_index_map
    vector = np.zeros(len(index_map))
    for token in tokens:
        position = index_map.get(token)
        if position is not None:
            vector[position] += 1
    total = vector.sum()
    if total == 0:
        # Nothing to normalise: avoid dividing by zero and return the zero vector as is.
        return vector
    return vector / total

In [None]:
# Dense feature matrix used as model input: one row per tweet (train rows first, then test),
# one column per vocabulary word.
data = np.zeros((len(tweet), len(word_index_map)))
# Iterate the column directly: tweet.iloc[i]['text'] would build a full row Series on every pass.
for row, tokens in enumerate(tweet['text']):
    data[row, :] = words_to_vector(tokens, row)  # numeric term-frequency vector for this tweet

In [None]:
# `tweet` was built as train rows followed by test rows, so the split point is the number
# of training rows. It is derived from the data instead of hard-coding the row count.
n_train = len(train)
Xtrain = data[:n_train]  # feature rows for the labelled training tweets
Xtest = data[n_train:]   # feature rows for the test tweets
Ytrain = np.array(tweet.iloc[:n_train]['target'])  # labels (target values) of the training tweets

# Model Building

In [None]:
# Create a logistic-regression classifier with scikit-learn's default settings.
model = LogisticRegression()
# Fit it on the training feature vectors and their labels.
model.fit(Xtrain, Ytrain)
# Accuracy on the same rows the model was fitted on, so this is an optimistic figure,
# not an estimate of performance on unseen tweets.
print("Train accuracy:", model.score(Xtrain, Ytrain))

In [None]:
# The test set has no labels, and the 'target' column of sample_submission.csv is only a
# submission template (NOTE(review): assumed to hold placeholder values, not ground truth;
# confirm on the competition data page). Scoring Xtest against it does not measure accuracy,
# so generalisation is estimated on a held-out 20% of the labelled training tweets instead.
holdout_order = shuffle(np.arange(len(Ytrain)), random_state=42)  # reproducible row permutation
n_holdout = len(Ytrain) // 5
holdout_idx, fit_idx = holdout_order[:n_holdout], holdout_order[n_holdout:]
# Separate estimator so that `model`, fitted on all training rows, is left untouched for prediction.
holdout_model = LogisticRegression()
holdout_model.fit(Xtrain[fit_idx], Ytrain[fit_idx])
print("Validation accuracy:", holdout_model.score(Xtrain[holdout_idx], Ytrain[holdout_idx]))

In [None]:
# Overwrite the template's 'target' column with the model's predicted class for each test tweet.
submission_label['target'] = model.predict(Xtest)

In [None]:

# Store the predicted labels as integers in the submission frame.
submission_label['target'] = submission_label['target'].astype('int64')

In [None]:
# Write the predictions to submission.csv in the current working directory, without the index column.
submission_label.to_csv('submission.csv',index=False)