In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import plotly.graph_objects as go
from plotly.offline import plot

from wordcloud import WordCloud, STOPWORDS
import keras_tuner as kt

from tqdm import tqdm
import gc
import re
import string
import operator
from collections import defaultdict

from keras.models import Sequential
from keras.preprocessing import sequence
from keras.preprocessing.text import Tokenizer

from keras.layers import Dense
from keras.layers import LSTM
from keras.callbacks import ModelCheckpoint, EarlyStopping
from keras.layers.embeddings import Embedding
from keras.initializers import Constant

from sklearn.model_selection import train_test_split

import matplotlib.pyplot as plt

In [None]:
import nltk
nltk.download('stopwords')

In [None]:

# Load the Yelp polarity splits. The held-out frame must come from test.csv:
# reading train.csv twice makes every "test" result a train-set result.
# NOTE(review): assumes test.csv sits next to train.csv in the dataset folder — confirm.
# NOTE(review): both files are read with default header handling, so the first
# row is consumed as the header before the columns are renamed; confirm whether
# header=None is needed for these files.
train = pd.read_csv('../input/yelp-review-dataset/yelp_review_polarity_csv/train.csv')
test = pd.read_csv('../input/yelp-review-dataset/yelp_review_polarity_csv/test.csv')

print(f'train shape => {train.shape}')
print(f'test shape => {test.shape}')

In [None]:
train.head(5)

In [None]:
test.head(5)

In [None]:
train.info()

In [None]:
train.columns = ["rating","text"]

In [None]:
test.columns = ["rating","text"]

In [None]:
# Check the target value

train["rating"].value_counts()

# Missing Values

In [None]:
train.isnull().sum()

In [None]:
test.isnull().sum()

After imputing missing values, check whether any missing values remain.

In [None]:
# Copy dataset

df_train = train.copy()
df_test = test.copy()

In [None]:
# Recode label 2 as 0 so the training target takes the values {0, 1};
# label 1 is left unchanged. Only df_train is recoded here, df_test is not.
# NOTE(review): presumably 2 is the positive class in this dataset, which would
# make 0 = positive and 1 = negative after this line — confirm.
df_train["rating"] = df_train["rating"].replace(2,0)

In [None]:
df_train["rating"].value_counts()

# EDA

In [None]:
# Hand-crafted text statistics. Each entry maps the new column name to the
# per-review function that computes it; every feature is added to the train
# frame and then to the test frame, in the order listed.
meta_feature_funcs = {
    # number of whitespace-separated tokens
    'word_count': lambda x: len(str(x).split()),
    # number of distinct tokens
    'unique_word_count': lambda x: len(set(str(x).split())),
    # tokens (lower-cased) that appear in the wordcloud STOPWORDS set
    'stop_word_count': lambda x: len([w for w in str(x).lower().split() if w in STOPWORDS]),
    # tokens that contain an http/https marker
    'url_count': lambda x: len([w for w in str(x).lower().split() if 'http' in w or 'https' in w]),
    # average token length in characters
    'mean_word_length': lambda x: np.mean([len(w) for w in str(x).split()]),
    # total number of characters
    'char_count': lambda x: len(str(x)),
    # characters that belong to string.punctuation
    'punctuation_count': lambda x: len([c for c in str(x) if c in string.punctuation]),
    # number of '#' characters
    'hashtag_count': lambda x: len([c for c in str(x) if c == '#']),
    # number of '@' characters
    'mention_count': lambda x: len([c for c in str(x) if c == '@']),
}

for feature_name, feature_func in meta_feature_funcs.items():
    df_train[feature_name] = df_train['text'].apply(feature_func)
    df_test[feature_name] = df_test['text'].apply(feature_func)


In [None]:
# Check the target value

# Pass the column through the `x` keyword; recent seaborn releases no longer
# accept the data vector as a positional argument.
sns.countplot(x=df_train.rating);
plt.ylabel('Samples');

In [None]:
df_train

In [None]:
# Token frequencies over the raw (uncleaned) training text.
# split() + explode() yields one row per token, which avoids materialising the
# dense (reviews x longest-review) frame that split(expand=True) would build.
all_words = df_train['text'].str.split().explode().value_counts()

# The two most frequent tokens are skipped and ranks 3-50 are shown. The same
# slice feeds x, y and the marker colours so all three arrays have equal length.
shown = slice(2, 50)
data = [go.Bar(
            x = all_words.index.values[shown],
            y = all_words.values[shown],
            marker= dict(colorscale='Jet',
                         color = all_words.values[shown]
                        ),
            text='Word counts'
    )]

layout = go.Layout(
    title='Top 50 (Uncleaned) Word frequencies in the training dataset'
)

fig = go.Figure(data=data, layout=layout)

fig.show()

In [None]:
# Word cloud built from all training reviews whose recoded rating is 0.
# The title names the rating value rather than the "Non-Disaster" label that
# was left over from a different dataset.
word_cloud0 = WordCloud(collocations=False, background_color='white').generate(' '.join(df_train['text'][df_train['rating'] == 0]))
plt.imshow(word_cloud0, interpolation='bilinear')
plt.title('Review Wordcloud (rating = 0)')
plt.axis('off')
plt.show()

In [None]:
# Word cloud built from all training reviews whose rating is 1.
# The title names the rating value; it previously repeated the "Non-Disaster"
# label used for the rating-0 cloud.
word_cloud1 = WordCloud(collocations=False, background_color='white').generate(' '.join(df_train['text'][df_train['rating'] == 1]))
plt.imshow(word_cloud1, interpolation='bilinear')
plt.title('Review Wordcloud (rating = 1)')
plt.axis('off')
plt.show()

In [None]:
df_train.describe(include=['O'])

In [None]:
df_test.describe(include=['O'])

# Examples

In [None]:
# First review text with rating 0 (the label column is 'rating'; there is no 'target' column).
df_train.query('rating == 0').text.values[0]

In [None]:
# First review text with rating 1 (the label column is 'rating'; there is no 'target' column).
df_train.query('rating == 1').text.values[0]

In [None]:
# The training frame is given only 'rating' and 'text' columns (plus the
# derived count features), so an unconditional lookup of 'location' raises
# KeyError. Plot the ten most common locations only when the column exists.
if 'location' in df_train.columns:
    top_locations = df_train['location'].value_counts()[:10]
    sns.barplot(y=top_locations.index, x=top_locations);

In [None]:
df_train['text'] = df_train['text'].str.lower()
df_test['text'] = df_test['text'].str.lower()

# Preprocessing

Remove URLs

In [None]:
example="New competition launched :https://www.kaggle.com/c/nlp-getting-started"

In [None]:
def remove_URL(text):
    """Return ``text`` with ``http://``/``https://`` links and ``www.`` addresses removed.

    A link is matched up to the next whitespace character.
    """
    return re.sub(r'https?://\S+|www\.\S+', '', text)

remove_URL(example)


# Apply on train & test dataset

# Strip URLs from every review in both frames.
df_train['text'] = df_train['text'].apply(remove_URL)
df_test['text'] = df_test['text'].apply(remove_URL)

Remove HTML tags

In [None]:
example = """<div>
<h1>Real or Fake</h1>
<p>Kaggle </p>
<a href="https://www.kaggle.com/c/nlp-getting-started">getting started</a>
</div>"""

In [None]:
def remove_html(text):
    """Return ``text`` with every ``<...>`` tag removed.

    The match is non-greedy and does not cross line breaks, so each tag is
    removed on its own and the text between tags is kept.
    """
    tag_pattern = r'<.*?>'
    return re.sub(tag_pattern, '', text)
    
print(remove_html(example))


# Apply on train & test dataset

# Strip HTML tags from every review in both frames.
df_train['text'] = df_train['text'].apply(remove_html)
df_test['text'] = df_test['text'].apply(remove_html)


In [None]:
# Reference : https://gist.github.com/slowkow/7a7f61f495e3dbb7e3d767f97bd7304b
def remove_emoji(text):
    """Return ``text`` with runs of characters from the listed Unicode ranges removed.

    Note that the last range runs from U+24C2 to U+1F251, so it also matches
    the non-emoji code points that lie between those two values.
    """
    emoji_ranges = (
        "\U0001F600-\U0001F64F"  # emoticons
        "\U0001F300-\U0001F5FF"  # symbols & pictographs
        "\U0001F680-\U0001F6FF"  # transport & map symbols
        "\U0001F1E0-\U0001F1FF"  # flags (iOS)
        "\U00002702-\U000027B0"
        "\U000024C2-\U0001F251"
    )
    return re.sub("[" + emoji_ranges + "]+", '', text, flags=re.UNICODE)

remove_emoji("Omg another Earthquake 😔😔")


# Apply on train & test dataset

# Strip emoji from every review in both frames.
df_train['text'] = df_train['text'].apply(remove_emoji)
df_test['text'] = df_test['text'].apply(remove_emoji)

Remove Numbers

In [None]:
# '\d+' is a regular expression, so regex matching is requested explicitly:
# with regex=False (the default in pandas 2.x) the pattern would be treated as
# the literal text "\d+" and no digits would be removed. A raw string keeps the
# backslash from being read as a string escape.
df_train['text'] = df_train['text'].str.replace(r'\d+', '', regex=True)
df_test['text'] = df_test['text'].str.replace(r'\d+', '', regex=True)

In [None]:
# https://stackoverflow.com/a/47091490/4084039
import re


def cleaner(phrase):
    """Expand common English contractions in ``phrase`` and return the result.

    The rules are applied one after another in the order listed; the two
    irregular forms come first so the generic ``n't`` rule does not split them.
    """
    expansions = (
        # specific
        (r"won't", "will not"),
        (r"can\'t", "can not"),
        # general
        (r"n\'t", " not"),
        (r"\'re", " are"),
        (r"\'s", " is"),
        (r"\'d", " would"),
        (r"\'ll", " will"),
        (r"\'t", " not"),
        (r"\'ve", " have"),
        (r"\'m", " am"),
    )
    for contraction, replacement in expansions:
        phrase = re.sub(contraction, replacement, phrase)
    return phrase

In [None]:
# Expand contractions in every review in both frames.
df_train['text'] = df_train['text'].apply(cleaner)
df_test['text'] = df_test['text'].apply(cleaner)

Remove Punctuation

In [None]:
example= "Our Deeds are the Reason of this #earthquake"

In [None]:
def remove_punct(text):
    """Return ``text`` with every character found in ``string.punctuation`` deleted."""
    # A translation table whose values are None deletes the mapped characters.
    drop_punctuation = str.maketrans(dict.fromkeys(string.punctuation))
    return text.translate(drop_punctuation)

print(remove_punct(example))

In [None]:

# Strip punctuation from every review in both frames.
df_train['text'] = df_train['text'].apply(remove_punct)
df_test['text'] = df_test['text'].apply(remove_punct)

# Tokenization

In [None]:
import spacy
nlp = spacy.load("en_core_web_sm")

In [None]:
df_train_token = df_train.copy()

In [None]:
df_train_token['parsed'] = df_train_token.text.apply(nlp)

In [None]:
df_train_token.head(5)

In [None]:
# Group by the label column: the frame has no 'keyword' column, so grouping on
# it raises KeyError. 'rating' is the only categorical column available.
print("Document Count")
print(df_train_token.groupby('rating')['text'].count())
print("Word Count")
# Total number of whitespace-separated words per rating value.
df_train.groupby('rating')['text'].apply(lambda texts: texts.apply(lambda t: len(t.split())).sum())

In [None]:
# Number of spaCy tokens in each parsed review
df_train_token['num_tokens'] = [len(doc) for doc in df_train_token.parsed]
# Histogram of tokens per review. histplot(kde=True) replaces distplot, which
# seaborn has deprecated.
g = sns.histplot(df_train_token.num_tokens, kde=True)

In [None]:
df_train_token.head(5)

In [None]:
# Define X & y

# Features are the cleaned review texts; the label column is 'rating'
# (the frame has no 'target' column, so that lookup raised KeyError).
X = df_train['text']
y = df_train['rating']

In [None]:
# Split the dataset

# Hold out the last 20% of rows as the validation split (X_test / y_test).
# NOTE(review): shuffle=False keeps the file order, so random_state=42 has
# nothing to seed here; if the CSV rows are ordered by label or by time, the
# hold-out split may not be representative — confirm this is intended.
X_train, X_test, y_train, y_test  = train_test_split(X, y,test_size = 0.2, random_state= 42, shuffle=False)

# Padding

In [None]:
num_words = 15000   # keep only the most frequent words when texts are converted to sequences
dim = 200
max_len = 32        # sequence length used when padding
tok = Tokenizer(num_words=num_words)
# Fit first: word_index is empty until the tokenizer has seen the texts, so
# computing vocab_size before this call always produced 1.
tok.fit_on_texts(X_train)
# Adding 1 because of reserved 0 index
vocab_size = len(tok.word_index) + 1


In [None]:
X_train_new = tok.texts_to_sequences(X_train)
X_test_new = tok.texts_to_sequences(X_test)


In [None]:
#length_long_sentence  = 1000
# Bring every integer sequence to exactly max_len entries and keep the result
# as NumPy arrays for the model.
X_train_pad = np.array(sequence.pad_sequences(X_train_new, maxlen=max_len))
X_test_pad = np.array(sequence.pad_sequences(X_test_new, maxlen=max_len))

# Sanity check: overall shape and one padded example.
print(X_train_pad.shape)
print(X_train_pad[1])

# Embedding

In [None]:
# !wget http://nlp.stanford.edu/data/glove.6B.zip

In [None]:
# Map each GloVe word to its 100-dimensional vector.
embeddings_dictionary = dict()
vocab_size = len(tok.word_index) + 1
embedding_dim = 100

# The `with` block closes the file even if parsing fails part-way, and the
# explicit encoding avoids depending on the platform default.
with open('../input/glove-global-vectors-for-word-representation/glove.6B.100d.txt',
          encoding='utf-8') as glove_file:
    for line in glove_file:
        # Each line is "<word> <v1> <v2> ...": the first field is the word,
        # the rest are the vector components.
        records = line.split()
        word = records[0]
        vector_dimensions = np.asarray(records[1:], dtype='float32')
        embeddings_dictionary[word] = vector_dimensions

In [None]:
# create a weight matrix for words in training set
# Row i holds the GloVe vector of the word with tokenizer index i; rows for
# words without a GloVe vector (and the reserved row 0) stay all-zero.
# The width comes from embedding_dim so it cannot drift from the loaded vectors.
embedding_matrix = np.zeros((vocab_size, embedding_dim))

for word, i in tok.word_index.items():
    embedding_vector = embeddings_dictionary.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

print('Embedding Matrix Shape:', embedding_matrix.shape)

In [None]:
# Adding 1 because of reserved 0 index
vocab_size = len(tok.word_index) + 1

# load the whole embedding into memory
# NOTE(review): the embedding layer in this notebook is built from the
# 100-dimensional matrix; confirm these 200-dimensional vectors are needed,
# since holding both sets in memory is costly.
embeddings_index = dict()
# The `with` block closes the file even if parsing fails part-way, and the
# explicit encoding avoids depending on the platform default.
with open('../input/glove-global-vectors-for-word-representation/glove.6B.200d.txt',
          encoding='utf-8') as f:
    for line in f:
        # First field is the word, the remaining fields are its vector.
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs

print('Loaded %s word vectors.' % len(embeddings_index))

# Plot

In [None]:
def plt_dynamic(x, vy, ty, ax, colors=['b']):
    """Draw validation and training loss curves on ``ax`` and redraw its figure.

    Parameters
    ----------
    x : sequence
        X-axis values (e.g. epoch numbers).
    vy : sequence
        Validation loss per x value, drawn in blue.
    ty : sequence
        Training loss per x value, drawn in red.
    ax : matplotlib Axes
        Axes to draw on; legend, grid and redraw are all applied to this axes
        and its own figure.
    colors : list, optional
        Unused; kept so existing calls that pass it keep working.
    """
    ax.plot(x, vy, 'b', label="Validation Loss")
    ax.plot(x, ty, 'r', label="Train Loss")
    # Work through `ax` rather than pyplot's "current axes" or a global `fig`:
    # the notebook-level `fig` may be a different figure object altogether.
    ax.legend()
    # Explicit True so repeated calls do not toggle the grid off again.
    ax.grid(True)
    ax.figure.canvas.draw()
    
# Training hyper-parameters.
# NOTE(review): the model.fit call in this notebook passes epochs=20 literally
# and no batch_size, so n_epochs is not what controls training there and
# batchsize is only used by model.evaluate — confirm which values are intended.
n_epochs = 10
batchsize = 512

# Empty results table with one row per model to be filled in later; the
# trailing ';' suppresses the cell's rich output.
final_output = pd.DataFrame(columns=["Model", "Architecture",
                                     "TRAIN_LOSS", "TEST_LOSS", "TRAIN_ACC", "TEST_ACC"]);

# Model

In [None]:
from keras.layers import (LSTM, 
                          Embedding, 
                          BatchNormalization,
                          Dense, 
                          TimeDistributed, 
                          Dropout, 
                          Bidirectional,
                          Flatten, 
                          GlobalMaxPool1D)

In [None]:
model = Sequential()

# Frozen pre-trained embeddings. The output width is read from the matrix
# itself so the layer always matches the vectors that were loaded.
embedding = Embedding(vocab_size, embedding_matrix.shape[1], weights=[embedding_matrix],
                      input_length=max_len, trainable=False)

model.add(embedding)
# NOTE(review): a non-zero recurrent_dropout typically prevents the fast
# cuDNN LSTM kernel from being used — confirm the training time is acceptable.
model.add(Bidirectional(LSTM(256, dropout=0.25, recurrent_dropout=0.2)))

# Fully connected head: halve the width at each step, with dropout after
# every hidden layer.
for units in (1024, 512, 256, 128, 64, 32, 16):
    model.add(Dense(units, activation='relu'))
    model.add(Dropout(0.2))

# Single sigmoid unit for the binary rating.
model.add(Dense(1, activation='sigmoid'))

model.compile(loss='binary_crossentropy',optimizer='adam',metrics=['accuracy'])
# summary() prints the table itself and returns None, so it is not wrapped in
# print(), which would add a stray "None" line.
model.summary()



In [None]:

# Stop training once the validation loss has not improved for `patience` epochs.
# NOTE(review): patience=12 while the fit call in this notebook runs for 20
# epochs, so the callback can only trigger late in training; the weights of the
# best epoch are also not restored — confirm this is intended.
stop_early = EarlyStopping(monitor='val_loss', mode='min',
                           verbose=1, patience=12)



In [None]:


# Train on the padded training sequences and validate on the held-out split
# after every epoch; `history` keeps the per-epoch metrics for the plots below.
# NOTE(review): no batch_size is passed, so the Keras default applies even
# though a `batchsize` constant is defined in this notebook — confirm.
history = model.fit(X_train_pad, y_train, epochs=20,
                                  validation_data=(X_test_pad, y_test),
                                  callbacks=[stop_early])

# Model Evaluation

In [None]:
import matplotlib.pyplot as plt
# Training vs. validation accuracy per epoch.
for metric_key in ('accuracy', 'val_accuracy'):
    plt.plot(history.history[metric_key])
plt.title('model accuracy')
plt.xlabel('epoch')
plt.ylabel('accuracy')
plt.legend(['train', 'test'], loc='upper left')
plt.show()

In [None]:
# Training vs. validation loss per epoch.
for metric_key in ('loss', 'val_loss'):
    plt.plot(history.history[metric_key])
plt.title('model loss')
plt.xlabel('epoch')
plt.ylabel('loss')
plt.legend(['train', 'test'], loc='upper left')
plt.show()

In [None]:
from sklearn import metrics

y_pred = model.predict(X_test_pad)
y_pred = y_pred.round().astype('int')

print(metrics.accuracy_score(y_test,y_pred))
print(metrics.confusion_matrix(y_test,y_pred))
print(metrics.classification_report(y_test,y_pred))

In [None]:
score,acc = model.evaluate(X_test_pad, y_test, verbose = 2, batch_size = batchsize)
print("Score: %.2f" % (score))
print("Validation Accuracy: %.2f" % (acc))

# Test Dataset

In [None]:
test_sentences = df_test.text.to_numpy()

In [None]:
test_sentences = tok.texts_to_sequences(test_sentences)

In [None]:
test_padded = sequence.pad_sequences(test_sentences, maxlen=max_len)

In [None]:
predictions_test = model.predict(test_padded)
predictions_test = [1 if p > 0.5 else 0 for p in predictions_test]

In [None]:
predictions_test[:5]

In [None]:
class_pred= np.array(predictions_test)
class_pred[:5]

In [None]:
# Build the prediction file. The test frame has no 'id' column (its columns are
# 'rating', 'text' and the derived features), so the row index is used as the
# identifier for each prediction.
df = pd.DataFrame({
    'id': df_test.index,
    'target': class_pred.astype(int),
})

df.to_csv('submission.csv', index=False)
df

# Evaluation

In [None]:
from sklearn.metrics import roc_curve, auc


# y_test holds the labels of the validation split, so the scores must come from
# the same rows (X_test_pad). class_pred belongs to the external test frame and
# has a different length. Predicted probabilities are used instead of hard 0/1
# labels so the curve has more than a single operating point.
val_scores = model.predict(X_test_pad).ravel()

nn_fpr_keras, nn_tpr_keras, nn_thresholds_keras = roc_curve(y_test, val_scores)
auc_keras = auc(nn_fpr_keras, nn_tpr_keras)
plt.plot(nn_fpr_keras, nn_tpr_keras, marker='.', label='Neural Network (auc = %0.3f)' % auc_keras)
plt.xlabel('False positive rate')
plt.ylabel('True positive rate')
plt.legend()
plt.show()