In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn.feature_extraction.text import CountVectorizer
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, Embedding, LSTM, SpatialDropout1D
from sklearn.model_selection import train_test_split
from keras.utils.np_utils import to_categorical
import re

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the Kaggle input directory.
input_file_paths = (
    os.path.join(folder, name)
    for folder, _subfolders, names in os.walk('/kaggle/input')
    for name in names
)
for input_path in input_file_paths:
    print(input_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Load the training tweets and keep only the text and its label.
df = pd.read_csv("/kaggle/input/nlp-getting-started/train.csv")
# .copy() makes the two-column frame independent of the full one, so the
# in-place column assignment done later cannot trip pandas' chained-assignment
# (SettingWithCopy) check.
df = df[['text', 'target']].copy()
# Number of missing values per column.
print(df.isna().sum())
import seaborn as sns
# Class balance of the label column. The column must be passed as `x=`:
# seaborn deprecated positional data arguments in 0.11 and rejects them
# from 0.12 on ("got multiple values for argument 'data'").
sns.countplot(x="target", data=df);

### Data cleaning

In [None]:
# Contraction -> expansion table used by more_preprocess.
# Keys are lowercase and use the ASCII apostrophe; more_preprocess lowercases
# the text and normalises typographic apostrophes before looking keys up.
# Punctuation and whitespace need no entries here: more_preprocess removes
# non-word characters and collapses whitespace with regular expressions.
replace_list = {r"i'm": 'i am',
                r"'re": ' are',
                r"let's": 'let us',
                r"'s":  ' is',
                r"'ve": ' have',
                r"can't": 'can not',
                r"cannot": 'can not',
                r"shan't": 'shall not',
                r"n't": ' not',
                r"'d": ' would',
                r"'ll": ' will',
                r"'scuse": 'excuse'}

# Maps the left/right single quotation marks to the ASCII apostrophe so that
# "let’s" and "let's" hit the same table key.
_TYPOGRAPHIC_APOSTROPHES = str.maketrans({'\u2018': "'", '\u2019': "'"})

# Single alternation over every key. Longest keys come first so that, at a given
# position, "'scuse" wins over "'s"; the regex engine's leftmost-match rule makes
# "can't" win over the "n't" it contains. The trailing \b keeps a key from
# matching the start of a longer word (e.g. the "'s" in a quoted "'storm").
_CONTRACTION_RE = re.compile(
    '(?:'
    + '|'.join(re.escape(key) for key in sorted(replace_list, key=len, reverse=True))
    + r')\b'
)


def more_preprocess(x):
    """Normalise one piece of text for tokenisation.

    Steps, in order:
      1. Convert to ``str``, lowercase, and turn typographic apostrophes into ``'``.
      2. Expand the contractions listed in ``replace_list``. This has to happen
         before punctuation is stripped, otherwise the apostrophes the table
         keys rely on are already gone.
      3. Replace every non-word character (``\\W``) with a space.
      4. Drop stand-alone single ASCII letters anywhere in the text, including
         the first and last token (this also removes a leading ``b`` left over
         from a bytes-literal prefix, and words such as "i" and "a").
      5. Collapse whitespace runs to one space and trim both ends.

    Parameters
    ----------
    x : object
        Value to clean; it is passed through ``str()`` first.

    Returns
    -------
    str
        Lowercase text made of word characters separated by single spaces.
    """
    text = str(x).lower().translate(_TYPOGRAPHIC_APOSTROPHES)

    # Expand contractions in one pass; the matched text is always a table key
    # because both the text and the keys are lowercase.
    text = _CONTRACTION_RE.sub(lambda match: replace_list[match.group(0)], text)

    # Everything that is not a letter, digit or underscore becomes a space.
    text = re.sub(r'\W', ' ', text)

    # Remove isolated single letters. \b on both sides (instead of consuming the
    # surrounding whitespace) lets adjacent single letters all match and also
    # covers the start and end of the string.
    text = re.sub(r'\b[a-zA-Z]\b', ' ', text)

    # Collapse whitespace and trim the ends.
    return re.sub(r'\s+', ' ', text).strip()

# Clean every tweet with more_preprocess and store the result back in the
# 'text' column, then show one randomly chosen row.
cleaned_text = df['text'].apply(more_preprocess)
df['text'] = cleaned_text
df.sample()

### Tokenizer

In [None]:
# Vocabulary size passed to the tokenizer as `num_words` (name kept as spelled
# because the model cell reads it).
max_fatures = 2000
tokenizer = Tokenizer(num_words=max_fatures, split=' ')

# Fit the tokenizer on the cleaned tweets, then turn each tweet into a padded
# sequence of integer word indices.
corpus = df['text'].values
tokenizer.fit_on_texts(corpus)
X = pad_sequences(tokenizer.texts_to_sequences(corpus))
print(X.shape)

### Model

In [None]:
# Layer sizes: embedding vector length and number of LSTM units.
embed_dim = 128
lstm_out = 196

# Embedding -> spatial dropout -> LSTM -> 2-way softmax.
model = Sequential()
# Input length is the padded sequence width produced by the tokenizer cell.
model.add(Embedding(max_fatures, embed_dim,input_length = X.shape[1]))
model.add(SpatialDropout1D(0.4))
model.add(LSTM(lstm_out, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(2,activation='softmax'))
model.compile(loss = 'categorical_crossentropy', optimizer='adam',metrics = ['accuracy'])
# summary() prints the table itself and returns None, so it is not wrapped in
# print() (which would add a stray "None" line).
model.summary()

### Validation

In [None]:
# One indicator column per distinct value of 'target'.
Y = pd.get_dummies(df['target']).to_numpy()

# Hold out 20% of the rows; the fixed random_state keeps the split repeatable.
split_parts = train_test_split(X, Y, test_size=0.20, random_state=42)
X_train, X_test, Y_train, Y_test = split_parts

for features, labels in ((X_train, Y_train), (X_test, Y_test)):
    print(features.shape, labels.shape)

### Training

In [None]:
# Train for 20 epochs in mini-batches of 128, reporting metrics on the held-out
# split after every epoch.
batch_size = 128
model.fit(
    X_train,
    Y_train,
    validation_data=(X_test, Y_test),
    epochs=20,
    batch_size=batch_size,
    verbose=1,
)

In [None]:
# Model outputs for the held-out split, reduced to the index of the
# highest-scoring class for each row.
predictions = model.predict(X_test)
preds = list(np.argmax(predictions, axis=1))


In [None]:
# True class index of each held-out row, recovered from its indicator-encoded label.
y_test = list(np.argmax(Y_test, axis=1))

In [None]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# Confusion matrix first, then the per-class report, both on the held-out split.
for make_report in (confusion_matrix, classification_report):
    print(make_report(y_test, preds))

# Overall accuracy as a percentage.
accuracy_pct = 100 * accuracy_score(y_test, preds)
print("Accuracy {0:.2f}%".format(accuracy_pct))

In [None]:
# Classify one hand-written tweet with the trained model.
sample_tweets = ['severe drought']
# Clean the text with the same routine used on the training tweets, then
# vectorise it with the already-fitted tokenizer.
twt = tokenizer.texts_to_sequences([more_preprocess(tweet) for tweet in sample_tweets])
# Pad to the sequence width the model's embedding layer was built with.
twt = pad_sequences(twt, maxlen=X.shape[1], dtype='int32', value=0)
print(twt)
sentiment = model.predict(twt,batch_size=1,verbose = 2)[0]
# Output index -> label. Assumes 'target' is encoded 0 = no disaster,
# 1 = disaster, as the printed labels imply -- confirm against the dataset.
class_names = ("no disaster", "disaster")
print(class_names[np.argmax(sentiment)])

### Submission

In [None]:
# Build the submission file: clean and vectorise the test tweets exactly like
# the training tweets, predict a class for each, and write id/target to CSV.
test = pd.read_csv("/kaggle/input/nlp-getting-started/test.csv",usecols=['text','id'])
test['text'] = test['text'].apply(more_preprocess)

# Vectorise with the tokenizer that was fitted on the training text.
X_submission = tokenizer.texts_to_sequences(test['text'].values)
# Pad to the width the model was built with. Reading it from X (instead of a
# hard-coded 27) keeps this cell valid when preprocessing or the vocabulary
# changes the padded training width.
maxlen = X.shape[1]

X_submission = pad_sequences(X_submission, maxlen=maxlen)
print (X_submission.shape)
predictions = model.predict(X_submission)
preds = [np.argmax(pred) for pred in predictions]
submission = pd.DataFrame({'id': test['id'], 'target': preds})
submission.to_csv('/kaggle/working/submission.csv', index=False)
# Kept as the cell's last expression so the sampled row is actually displayed.
submission.sample()