In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Recursively walk /kaggle/input and print the full path of every file found.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

## Imports

In [None]:
from tensorflow.keras.preprocessing.text import text_to_word_sequence
from tensorflow.keras.preprocessing.text import Tokenizer  
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras import models
from tensorflow.keras import layers
from tensorflow.keras import losses
from tensorflow.keras import metrics
from tensorflow.keras import optimizers
from tensorflow.keras.utils import plot_model

import numpy as np
import matplotlib.pyplot as plt

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

from collections import Counter
from pathlib import Path
import os
import numpy as np
import re
import string
import nltk
nltk.download('punkt')
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
nltk.download('stopwords')
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer
nltk.download('wordnet')
from nltk.corpus import wordnet
import unicodedata
import html
# English stop-word list from NLTK; normalize_text passes it to remove_stopwords.
stop_words = stopwords.words('english')

## Load Data

In [None]:
# Load the train and test CSVs; later cells use the "text" column of both and the "target" column of train_df.
train_df = pd.read_csv("../input/nlp-getting-started/train.csv")
test_df = pd.read_csv("../input/nlp-getting-started/test.csv")

In [None]:
# Show the text at position 10 among the rows whose target is 0.
train_df[train_df["target"] == 0]["text"].values[10]

In [None]:
# Show the text at position 10 among the rows whose target is 1.
train_df[train_df["target"] == 1]["text"].values[10]

## 1. Perform proper Text preprocessing steps

* Data cleaning :
   *    Remove special characters and punctuation

In [None]:
def subtext_repeation_in_df(df, col, subtext, num):
    """Display how often each character of `subtext` repeats in ``df[col]``.

    For every character ``c`` in `subtext` and every run length 1..`num`, the
    non-overlapping occurrences of ``c * length`` are counted in each row of
    ``df[col]`` and summed. Patterns with a non-zero total are shown as a table
    sorted by count (descending), followed by one example text per pattern
    (the first row containing it, duplicates removed).

    Counting is done row by row, so a run is never formed by the end of one
    text touching the start of the next, and every reported pattern is
    guaranteed to have an example row.

    :param df: DataFrame holding the text column.
    :param col: name of the text column in `df`.
    :param subtext: iterable of single characters to analyse.
    :param num: maximum run length to count.
    :return: None; the tables are printed / rendered with ``display``.
    """
    # Missing values are skipped; everything else is treated as text.
    texts = df[col].dropna().astype(str).tolist()

    labels, counts, patterns = [], [], []
    for char in subtext:
        for length in range(1, num + 1):
            pattern = char * length
            total = sum(text.count(pattern) for text in texts)
            if total > 0:
                # Spaces are invisible in a table, so they are shown as a tag;
                # the raw pattern is kept separately for the example lookup.
                labels.append(pattern.replace(' ', '<space>') if char == ' ' else pattern)
                counts.append(total)
                patterns.append(pattern)

    print('Number of all data is', len(df))
    result = pd.DataFrame({'subtext': labels, 'count': counts}, columns=['subtext', 'count'])
    display(result.sort_values(by='count', ascending=False))

    print('Text examples')
    # total > 0 above guarantees at least one row contains each pattern.
    examples = [next(text for text in texts if pattern in text) for pattern in patterns]
    problem_examples = pd.DataFrame({'problem_examples': examples}).drop_duplicates()
    display(problem_examples)

In [None]:
# Analysis of punctuation marks repetition in training data
# Count runs of 1 to 10 repeats of each ASCII punctuation character in train_df['text'].
print('Statistics for punctuation marks repetition in training data')
subtext_repeation_in_df(train_df, 'text', list(string.punctuation), 10)

In [None]:
def remove_punctuation(text):
    """Return the string `text` with every ASCII punctuation character deleted."""
    # A translation table that maps each punctuation code point to None deletes it.
    deletions = dict.fromkeys(map(ord, string.punctuation))
    return text.translate(deletions)

In [None]:
# Raw text of the first training row, for comparison with the cleaned version below.
train_df['text'].iloc[0]

In [None]:
# The same text with punctuation removed.
remove_punctuation(train_df['text'].iloc[0])

* Case normalization

In [None]:
def to_lowercase(text):
    """Return a lower-cased copy of `text`."""
    lowered = text.lower()
    return lowered

In [None]:
# First training text with punctuation removed, then lower-cased.
to_lowercase(remove_punctuation(train_df['text'].iloc[0]))

* Replace all Numbers

In [None]:
def replace_numbers(text):
    """Delete every run of digits from the string `text`.

    Despite the name, nothing is substituted for the digits: they are removed.
    """
    digit_run = re.compile(r'\d+')
    return digit_run.sub('', text)

In [None]:
# Raw text of training row 95, reused as the running example in the cells below.
train_df['text'].iloc[95]

In [None]:
# Row 95 after punctuation removal, lower-casing and digit removal.
replace_numbers(to_lowercase(remove_punctuation(train_df['text'].iloc[95])))

* Words tokenization : create list of words

In [None]:
def text2words(text):
    """Split `text` into a sequence of tokens with NLTK's ``word_tokenize``."""
    tokens = word_tokenize(text)
    return tokens

In [None]:
# Row 95 cleaned as above and then split into tokens.
text2words(replace_numbers(to_lowercase(remove_punctuation(train_df['text'].iloc[95]))))

* Remove Stop words like ( 'the', 'to', 'on', 'we',...etc)

In [None]:
def remove_stopwords(words, stop_words):
    """Return the tokens of `words` that are not in `stop_words`.

    :param words: iterable of tokens, e.g. the output of ``text2words``.
    :param stop_words: collection of words to drop, e.g. the NLTK list from
        ``stopwords.words('english')``. Any container supporting ``in`` works.
    :return: list of the kept tokens, in their original order.
    """
    lookup = stop_words
    if isinstance(stop_words, (list, tuple)):
        # Hash lookup instead of scanning the whole sequence for every token.
        try:
            lookup = frozenset(stop_words)
        except TypeError:
            # Unhashable entries: fall back to the sequence as given.
            lookup = stop_words
    return [word for word in words if word not in lookup]

In [None]:
# Tokenize the cleaned row 95, then drop the English stop words.
t= text2words(replace_numbers(to_lowercase(remove_punctuation(train_df['text'].iloc[95]))))
remove_stopwords( t, stop_words)

* Stemming

In [None]:
def stem_words(words):
    """Return the Porter stem of every token in `words`, as a list."""
    porter = PorterStemmer()
    stems = []
    for token in words:
        stems.append(porter.stem(token))
    return stems

In [None]:
# Stem the stop-word-filtered tokens of row 95.
t= remove_stopwords(text2words(replace_numbers(to_lowercase(remove_punctuation(train_df['text'].iloc[95])))),stop_words)
stem_words(t)

* Alternative to stemming: Lemmatization

In [None]:
def lemmatize_words(words):
    """Return the WordNet lemma of every token in `words`, as a list.

    The lemmatizer is called without a part-of-speech argument.
    """
    wordnet_lemmatizer = WordNetLemmatizer()
    lemmas = []
    for token in words:
        lemmas.append(wordnet_lemmatizer.lemmatize(token))
    return lemmas

In [None]:
# Lemmatize the stop-word-filtered tokens of row 95.
t= remove_stopwords(text2words(replace_numbers(to_lowercase(remove_punctuation(train_df['text'].iloc[95])))),stop_words)
lemmatize_words(t)

In [None]:
def lemmatize_verbs(words):
    """Lemmatize every token as a verb and return the lemmas joined by spaces.

    Unlike the other token helpers in this notebook, the result is a single
    string, not a list.
    """
    wordnet_lemmatizer = WordNetLemmatizer()
    verb_lemmas = [wordnet_lemmatizer.lemmatize(token, pos='v') for token in words]
    return ' '.join(verb_lemmas)


## Split data 

In [None]:
# Keep only the columns used below: the text, plus the target for the training data.
train_ds_text = train_df.drop(["id","keyword","location"], axis=1)
test_ds_text = test_df.drop(["id","keyword","location"], axis=1)

# Read the columns directly instead of looping over rows with iterrows().
# str() is applied per value so that non-string entries are stringified, as before.
training_messages = [str(message) for message in train_ds_text["text"]]
training_labels = train_ds_text["target"].tolist()
test_messages = [str(message) for message in test_ds_text["text"]]

In [None]:
def remove_special_chars(text):
    """Lower-case `text` and clean up leftover markup fragments.

    The fixed substitutions below are applied in order, then HTML entities are
    unescaped, and finally every run of two or more spaces is collapsed to one.
    """
    # Order matters: the lone-backslash rule must run after the '\\n' and '\\"' rules.
    substitutions = (
        ('#39;', "'"),
        ('amp;', '&'),
        ('#146;', "'"),
        ('nbsp;', ' '),
        ('#36;', '$'),
        ('\\n', "\n"),
        ('quot;', "'"),
        ('<br />', "\n"),
        ('\\"', '"'),
        ('<unk>', 'u_n'),
        (' @.@ ', '.'),
        (' @-@ ', '-'),
        ('\\', ' \\ '),
    )
    cleaned = text.lower()
    for old, new in substitutions:
        cleaned = cleaned.replace(old, new)
    return re.sub(r'  +', ' ', html.unescape(cleaned))

## 3. Text preparation

In [None]:
def normalize_text(text):
    """Run the full cleaning pipeline on one raw text and return a single string.

    Steps, in order: clean up special characters, strip punctuation,
    lower-case, delete digits, tokenize, drop stop words, lemmatize (default
    part of speech, then as verbs).

    :param text: raw text.
    :return: the normalized tokens joined by single spaces.
    """
    text = remove_special_chars(text)
    text = remove_punctuation(text)
    text = to_lowercase(text)
    text = replace_numbers(text)
    words = text2words(text)
    words = remove_stopwords(words, stop_words)
    # words = stem_words(words)  # alternative: either stem or lemmatize
    words = lemmatize_words(words)
    words = lemmatize_verbs(words)

    # lemmatize_verbs already returns a space-joined string. If the last step
    # ever returns a list instead (e.g. stem_words), join it with spaces rather
    # than gluing the tokens together.
    if isinstance(words, str):
        return words
    return ' '.join(words)

def normalize_corpus(corpus):
    """Apply ``normalize_text`` to every text in `corpus` and return the results as a list."""
    return list(map(normalize_text, corpus))

In [None]:
# Normalize every training and test message; each result is one cleaned string per message.
trn_texts = normalize_corpus(training_messages)
tst_texts = normalize_corpus(test_messages)

### * BoW (Binary features)

In [None]:
# num_words=1000 matches the 1000-wide inputs of the models below; 'UNK' is the out-of-vocabulary token.
tok = Tokenizer(num_words=1000, oov_token='UNK')
#tok = Tokenizer(oov_token='UNK')
# NOTE(review): the vocabulary is fitted on the training AND test texts — confirm this is intended.
tok.fit_on_texts(trn_texts + tst_texts)
# Extract binary BoW features
x_train = tok.texts_to_matrix(trn_texts, mode='binary')
x_test = tok.texts_to_matrix(tst_texts, mode='binary')

In [None]:
# Labels as a float32 array; sliced into validation/training parts in the model cells below.
y_train = np.asarray(training_labels).astype('float32')

In [None]:
# Sanity check: shapes of the training features, training labels and test features.
print(x_train.shape)
print(y_train.shape)
print(x_test.shape)

## 4. Build Model

In [None]:
# Hold out the first 1000 rows for validation and train on the rest.
x_val = x_train[:1000]
partial_x_train = x_train[1000:]

y_val = y_train[:1000]
partial_y_train = y_train[1000:]

# Small fully connected binary classifier over the 1000-wide BoW vectors.
model = models.Sequential()
model.add(layers.Dense(16, activation='relu', input_shape=(1000,)))
model.add(layers.Dense(16, activation='relu'))
model.add(layers.Dense(16, activation='relu'))
model.add(layers.Dense(1, activation='sigmoid'))

# `learning_rate` is the supported argument name; the legacy `lr` alias is
# deprecated and rejected by newer Keras releases.
model.compile(optimizer=optimizers.RMSprop(learning_rate=0.001),
              loss=losses.binary_crossentropy,
              metrics=[metrics.binary_accuracy])

history = model.fit(partial_x_train,
                    partial_y_train,
                    epochs=20,
                    batch_size=512,
                    validation_data=(x_val, y_val))

In [None]:
# Per-epoch metrics recorded by model.fit for the training and validation data.
acc = history.history['binary_accuracy']
val_acc = history.history['val_binary_accuracy']
loss = history.history['loss']
val_loss = history.history['val_loss']

# Epoch numbers for the x axis, starting at 1.
epochs = range(1, len(acc) + 1)

# "bo" is for "blue dot"
plt.plot(epochs, loss, 'bo', label='Training loss')
# b is for "solid blue line"
plt.plot(epochs, val_loss, 'b', label='Validation loss')
plt.title('Training and validation loss')
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.legend()

plt.show()

In [None]:


# Training vs. validation accuracy per epoch; uses epochs, acc and val_acc from the previous cell.
plt.plot(epochs, acc, 'bo', label='Training acc')
plt.plot(epochs, val_acc, 'b', label='Validation acc')
plt.title('Training and validation accuracy')
plt.xlabel('Epochs')
plt.ylabel('accuracy')
plt.legend()

plt.show()

## Preparing Submission File

In [None]:
# model = models.Sequential()
# model.add(layers.Dense(16, activation='relu', input_shape=(1000,)))
# model.add(layers.Dense(16, activation='relu'))
# model.add(layers.Dense(16, activation='relu'))
# model.add(layers.Dense(1, activation='sigmoid'))

# model.compile(optimizer=optimizers.RMSprop(lr=0.001),
#               loss=losses.binary_crossentropy,
#               metrics=[metrics.binary_accuracy])

# model.fit(partial_x_train,
#                     partial_y_train,
#                     epochs=4,
#                     batch_size=512,
#                     validation_data=(x_val, y_val))

In [None]:
# result = model.predict(x_test)

In [None]:
# result

In [None]:
# result[result>=0.5]=1
# result[result<0.5]=0

In [None]:
# result

In [None]:
# sample_submission=pd.read_csv('../input/nlp-getting-started/sample_submission.csv')
# sample_submission['target']=result
# sample_submission['target']=sample_submission['target'].astype(int)

In [None]:
#sample_submission.head()
# sample_submission.to_csv('submission.csv', index=False)

In [None]:
### using count
# Rebuild the features as per-text word counts; this overwrites x_train / x_test.
x_train = tok.texts_to_matrix(trn_texts, mode='count')
x_test = tok.texts_to_matrix(tst_texts, mode='count')

# Hold out the first 1000 rows for validation and train on the rest.
x_val = x_train[:1000]
partial_x_train = x_train[1000:]

y_val = y_train[:1000]
partial_y_train = y_train[1000:]

# Same dense architecture as the binary BoW model.
model = models.Sequential()
model.add(layers.Dense(16, activation='relu', input_shape=(1000,)))
model.add(layers.Dense(16, activation='relu'))
model.add(layers.Dense(16, activation='relu'))
model.add(layers.Dense(1, activation='sigmoid'))

# `learning_rate` is the supported argument name; the legacy `lr` alias is
# deprecated and rejected by newer Keras releases.
model.compile(optimizer=optimizers.RMSprop(learning_rate=0.001),
              loss=losses.binary_crossentropy,
              metrics=[metrics.binary_accuracy])

history = model.fit(partial_x_train,
                    partial_y_train,
                    epochs=20,
                    batch_size=512,
                    validation_data=(x_val, y_val))

In [None]:
# Per-epoch metrics recorded by model.fit for the count-feature model.
acc = history.history['binary_accuracy']
val_acc = history.history['val_binary_accuracy']
loss = history.history['loss']
val_loss = history.history['val_loss']

# Epoch numbers for the x axis, starting at 1.
epochs = range(1, len(acc) + 1)

# "bo" is for "blue dot"
plt.plot(epochs, loss, 'bo', label='Training loss')
# b is for "solid blue line"
plt.plot(epochs, val_loss, 'b', label='Validation loss')
plt.title('Training and validation loss')
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.legend()

plt.show()

In [None]:


# Training vs. validation accuracy per epoch; uses epochs, acc and val_acc from the previous cell.
plt.plot(epochs, acc, 'bo', label='Training acc')
plt.plot(epochs, val_acc, 'b', label='Validation acc')
plt.title('Training and validation accuracy')
plt.xlabel('Epochs')
plt.ylabel('accuracy')
plt.legend()
plt.grid()
plt.show()

## Preparing Submission File

In [None]:
# model = models.Sequential()
# model.add(layers.Dense(16, activation='relu', input_shape=(1000,)))
# model.add(layers.Dense(16, activation='relu'))
# model.add(layers.Dense(16, activation='relu'))
# model.add(layers.Dense(1, activation='sigmoid'))

# model.compile(optimizer=optimizers.RMSprop(lr=0.001),
#               loss=losses.binary_crossentropy,
#               metrics=[metrics.binary_accuracy])

# model.fit(partial_x_train,
#                     partial_y_train,
#                     epochs=5,
#                     batch_size=512,
#                     validation_data=(x_val, y_val))

In [None]:
# result = model.predict(x_test)
# result[result>=0.5]=1
# result[result<0.5]=0
# result

In [None]:
# sample_submission=pd.read_csv('../input/nlp-getting-started/sample_submission.csv')
# sample_submission['target']=result
# sample_submission['target']=sample_submission['target'].astype(int)
# #sample_submission.head()
# sample_submission.to_csv('submission.csv', index=False)

In [None]:
### using freq
# Rebuild the features in the tokenizer's 'freq' mode; this overwrites x_train / x_test.
x_train = tok.texts_to_matrix(trn_texts, mode='freq')
x_test = tok.texts_to_matrix(tst_texts, mode='freq')

# Hold out the first 1000 rows for validation and train on the rest.
x_val = x_train[:1000]
partial_x_train = x_train[1000:]

y_val = y_train[:1000]
partial_y_train = y_train[1000:]

# Same dense architecture as the binary BoW model.
model = models.Sequential()
model.add(layers.Dense(16, activation='relu', input_shape=(1000,)))
model.add(layers.Dense(16, activation='relu'))
model.add(layers.Dense(16, activation='relu'))
model.add(layers.Dense(1, activation='sigmoid'))

# `learning_rate` is the supported argument name; the legacy `lr` alias is
# deprecated and rejected by newer Keras releases.
model.compile(optimizer=optimizers.RMSprop(learning_rate=0.001),
              loss=losses.binary_crossentropy,
              metrics=[metrics.binary_accuracy])

history = model.fit(partial_x_train,
                    partial_y_train,
                    epochs=20,
                    batch_size=512,
                    validation_data=(x_val, y_val))

In [None]:
# Per-epoch metrics recorded by model.fit for the freq-feature model.
acc = history.history['binary_accuracy']
val_acc = history.history['val_binary_accuracy']
loss = history.history['loss']
val_loss = history.history['val_loss']

# Epoch numbers for the x axis, starting at 1.
epochs = range(1, len(acc) + 1)

# "bo" is for "blue dot"
plt.plot(epochs, loss, 'bo', label='Training loss')
# b is for "solid blue line"
plt.plot(epochs, val_loss, 'b', label='Validation loss')
plt.title('Training and validation loss')
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.legend()

plt.show()

In [None]:
# Training vs. validation accuracy per epoch; uses epochs, acc and val_acc from the previous cell.
plt.plot(epochs, acc, 'bo', label='Training acc')
plt.plot(epochs, val_acc, 'b', label='Validation acc')
plt.title('Training and validation accuracy')
plt.xlabel('Epochs')
plt.ylabel('accuracy')
plt.legend()
plt.grid()
plt.show()

## Preparing Submission File

In [None]:
# model = models.Sequential()
# model.add(layers.Dense(16, activation='relu', input_shape=(1000,)))
# model.add(layers.Dense(16, activation='relu'))
# model.add(layers.Dense(16, activation='relu'))
# model.add(layers.Dense(1, activation='sigmoid'))

# model.compile(optimizer=optimizers.RMSprop(lr=0.001),
#               loss=losses.binary_crossentropy,
#               metrics=[metrics.binary_accuracy])

# model.fit(partial_x_train,
#                     partial_y_train,
#                     epochs=10,
#                     batch_size=512,
#                     validation_data=(x_val, y_val))

In [None]:
# result = model.predict(x_test)
# result[result>=0.5]=1
# result[result<0.5]=0
# sample_submission=pd.read_csv('../input/nlp-getting-started/sample_submission.csv')
# sample_submission['target']=result
# sample_submission['target']=sample_submission['target'].astype(int)
# #sample_submission.head()
# sample_submission.to_csv('submission.csv', index=False)

## Use LSTM 

In [None]:
# maxlen is later used as a sequence length for pad_sequences, so measure the
# texts in whitespace-separated tokens rather than in characters.
maxlen = max([len(t.split()) for t in trn_texts])

In [None]:
# Display the longest text length found above.
maxlen

In [None]:
# Length of every training text in whitespace-separated tokens, the unit in
# which the padding length is expressed; plotted as a histogram below.
l=[len(t.split()) for t in trn_texts]

In [None]:
# Histogram of the text lengths collected in `l`.
plt.hist(l)
plt.grid()

In [None]:
# Fixed sequence length for padding/truncation and for the Embedding input_length below; overrides the value computed above.
maxlen=80

In [None]:
# Convert each normalized training text to a sequence of vocabulary indices.
training_messages2 = tok.texts_to_sequences(trn_texts)

# Pad or truncate every sequence at the end ('post') to exactly maxlen entries.
training_padded = pad_sequences(training_messages2,
                                maxlen=maxlen, 
                                truncating='post', 
                                padding='post'
                               )

In [None]:
# #tst_texts
# Same conversion for the test texts.
# NOTE: this rebinds test_messages, which previously held the raw test strings.
test_messages = tok.texts_to_sequences(tst_texts)

# Pad or truncate at the end ('post') to maxlen, matching the training data.
test_padded = pad_sequences(test_messages,
                                maxlen=maxlen, 
                                truncating='post', 
                                padding='post'
                               )

In [None]:
# Hold out the first 1000 padded sequences for validation and train on the rest.
training_padded = np.array(training_padded)
x_val = training_padded[:1000]
partial_x_train = training_padded[1000:]

y_val = y_train[:1000]
partial_y_train = y_train[1000:]

# Embedding over the 1000-entry vocabulary -> bidirectional LSTM -> sigmoid output.
model = models.Sequential()
model.add(layers.Embedding(1000, 20, input_length=maxlen))
model.add(layers.Bidirectional(layers.LSTM(50, dropout=0.2)))
# NOTE(review): the LSTM is built without return_sequences, so its output is presumably already 2-D and Flatten a no-op — confirm.
model.add(layers.Flatten())
model.add(layers.Dense(1, activation='sigmoid'))

model.compile(
    loss='binary_crossentropy',
    optimizer='Adamax',
    metrics=[metrics.binary_accuracy]
)

model.summary()

In [None]:
# Train the LSTM model for 35 epochs, tracking metrics on the held-out 1000 rows.
history = model.fit(partial_x_train,
                    partial_y_train,
                    epochs=35,
                    batch_size=128,
                    validation_data=(x_val, y_val))

In [None]:
# Per-epoch metrics recorded by model.fit for the LSTM model.
acc = history.history['binary_accuracy']
val_acc = history.history['val_binary_accuracy']
loss = history.history['loss']
val_loss = history.history['val_loss']

# Epoch numbers for the x axis, starting at 1.
epochs = range(1, len(acc) + 1)

# "bo" is for "blue dot"
plt.plot(epochs, loss, 'bo', label='Training loss')
# b is for "solid blue line"
plt.plot(epochs, val_loss, 'b', label='Validation loss')
plt.title('Training and validation loss')
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.legend()

plt.show()

In [None]:


# Training vs. validation accuracy per epoch; uses epochs, acc and val_acc from the previous cell.
plt.plot(epochs, acc, 'bo', label='Training acc')
plt.plot(epochs, val_acc, 'b', label='Validation acc')
plt.title('Training and validation accuracy')
plt.xlabel('Epochs')
plt.ylabel('accuracy')
plt.legend()
plt.grid()
plt.show()

## Preparing Submission File


In [None]:
# Rebuild the same LSTM architecture from scratch for the submission run.
model = models.Sequential()
model.add(layers.Embedding(1000, 20, input_length=maxlen))
model.add(layers.Bidirectional(layers.LSTM(50, dropout=0.2)))
model.add(layers.Flatten())
model.add(layers.Dense(1, activation='sigmoid'))

model.compile(
    loss='binary_crossentropy',
    optimizer='Adamax',
    metrics=[metrics.binary_accuracy]
)


# Train for 15 epochs here (the exploratory run above used 35); the held-out 1000 rows are still used only for validation.
model.fit(partial_x_train,
                    partial_y_train,
                    epochs=15,
                    batch_size=128,
                    validation_data=(x_val, y_val))

In [None]:
# Predict on the padded test sequences and threshold the outputs in place at 0.5.
result = model.predict(test_padded)
result[result>=0.5]=1
result[result<0.5]=0
# NOTE(review): assumes sample_submission.csv lists rows in the same order as test.csv — confirm.
sample_submission=pd.read_csv('../input/nlp-getting-started/sample_submission.csv')
sample_submission['target']=result
sample_submission['target']=sample_submission['target'].astype(int)
#sample_submission.head()
# Write the submission file to the current working directory.
sample_submission.to_csv('submission.csv', index=False)