### Starter code to extract the data from the .tgz file, as a starting point for EDA and building NLP models

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import tarfile # this is to extract the data from that .tgz file
import string
import tensorflow as tf
from keras.models import Sequential
from keras.models import Model
from keras.layers import Dense
from keras.layers import LSTM
from keras.layers import Embedding
from keras.layers import Dropout
from keras.callbacks import EarlyStopping
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

In [None]:
# Extract every member of the .tgz archive into ./data.
# The context manager closes the archive even if extraction raises part-way.
with tarfile.open('/kaggle/input/amazon-reviews/amazon_review_polarity_csv.tgz') as amazon_reviews:
    amazon_reviews.extractall('data')

In [None]:
import os

# Print the full path of every file found anywhere under /kaggle/input.
for root, _, files in os.walk('/kaggle/input'):
    for path in (os.path.join(root, name) for name in files):
        print(path)

In [None]:
import os

# Print the full path of every file found anywhere under the current directory
# (this includes the files extracted into ./data).
for root, _, files in os.walk('./'):
    for path in (os.path.join(root, name) for name in files):
        print(path)

In [None]:
# Load the training split from the extracted archive.
# header=None tells pandas the file has no header row, so the columns are
# labelled with the integers 0, 1, 2, ...
train_df = pd.read_csv('./data/amazon_review_polarity_csv/train.csv', header=None)

In [None]:
# Load the test split from the extracted archive.
# header=None tells pandas the file has no header row, so the columns are
# labelled with the integers 0, 1, 2, ...
test_df = pd.read_csv('./data/amazon_review_polarity_csv/test.csv', header=None)

In [None]:
train_len = 50000
test_len = 5000

# Shrink each split to a class-balanced subset: half of the requested rows are
# drawn from label 1 and half from label 2 (column 0 holds the label), using a
# fixed seed so the draw is reproducible. The row index is renumbered afterwards.
train_df = pd.concat(
    [
        train_df[train_df[0] == label].sample(n=train_len // 2, random_state=42)
        for label in (1, 2)
    ]
).reset_index(drop=True)
test_df = pd.concat(
    [
        test_df[test_df[0] == label].sample(n=test_len // 2, random_state=42)
        for label in (1, 2)
    ]
).reset_index(drop=True)

In [None]:
# Give the integer-labelled columns readable names in a single pass.
train_df.rename(columns={0: 'class', 1: 'title', 2: 'review'}, inplace=True)

In [None]:
# Give the integer-labelled columns readable names in a single pass.
test_df.rename(columns={0: 'class', 1: 'title', 2: 'review'}, inplace=True)

In [None]:
# Combine title and review body into one 'text' column.
# Missing values are replaced with '' first: string concatenation with NaN
# yields NaN, which would otherwise discard the whole row's text whenever
# either part is absent.
# NOTE(review): the later cells visible in this notebook clean and tokenize
# 'review', not 'text' -- confirm which column is meant to feed the model.
train_df['text'] = train_df['title'].fillna('') + ' ' + train_df['review'].fillna('')
test_df['text'] = test_df['title'].fillna('') + ' ' + test_df['review'].fillna('')

In [None]:
import re
import nltk
from nltk.corpus import stopwords
# Fetch the NLTK 'stopwords' resource so that stopwords.words(...) can be
# called in the following cell.
nltk.download('stopwords')

In [None]:
# English stopwords, read from the NLTK corpus once. `stop_words` is kept as a
# separate set object so both names remain available and independent.
STOPWORDS = set(stopwords.words('english'))
stop_words = set(STOPWORDS)
# Raw strings keep the backslashes literal for the regex engine; in a normal
# string literal `\[`, `\]` and `\|` are invalid escape sequences.
# Characters in this class are replaced by a space during cleaning.
REPLACE_BY_SPACE_RE = re.compile(r'[/(){}\[\]\|@,;]')
# Anything that is not a digit, lowercase letter, space, '#', '+' or '_' is dropped.
BAD_SYMBOLS_RE = re.compile(r'[^0-9a-z #+_]')

In [None]:
def clean_text_data(text):
    """Normalize one piece of text for tokenization.

    The text is lowercased, characters matched by REPLACE_BY_SPACE_RE become
    spaces, characters matched by BAD_SYMBOLS_RE are removed, and words found
    in STOPWORDS are dropped. The remaining words are joined by single spaces.

    Args:
        text: The string to clean.

    Returns:
        The cleaned string.
    """
    lowered = text.lower()
    spaced = REPLACE_BY_SPACE_RE.sub(' ', lowered)
    stripped = BAD_SYMBOLS_RE.sub('', spaced)
    kept_words = [token for token in stripped.split() if token not in STOPWORDS]
    return ' '.join(kept_words)

In [None]:
# Replace the raw review text in both splits with its cleaned form.
train_df['review'], test_df['review'] = (
    train_df['review'].apply(clean_text_data),
    test_df['review'].apply(clean_text_data),
)

In [None]:
max_words = 50000  # vocabulary cap passed to the tokenizer as num_words
max_len = 250      # length every sequence is padded/truncated to later on

# The filter string is a raw literal so the backslash stays a literal
# character; in a normal string `\]` is an invalid escape sequence. The
# characters passed to Tokenizer are unchanged.
tokenizer = Tokenizer(
    num_words=max_words,
    filters=r'!"#$%&()*+,-./:;<=>?@[\]^_`{|}~',
    lower=True,
)
# Build the vocabulary from the training reviews only.
tokenizer.fit_on_texts(train_df['review'].values)
word_index = tokenizer.word_index

In [None]:
def tokenize_data(data, max_len):
    """Convert texts to padded integer sequences using the module-level tokenizer.

    Args:
        data: Object exposing the texts through a ``.values`` attribute
            (the callers in this notebook pass a pandas Series).
        max_len: Length passed to ``pad_sequences`` as ``maxlen``.

    Returns:
        The result of ``pad_sequences`` applied to the tokenized texts.
    """
    sequences = tokenizer.texts_to_sequences(data.values)
    return pad_sequences(sequences, maxlen=max_len)

In [None]:
# Model inputs: padded token-id sequences built from the cleaned reviews.
X_train, X_test = (
    tokenize_data(train_df['review'], max_len),
    tokenize_data(test_df['review'], max_len),
)

# Model targets: one indicator column per distinct value of 'class'.
Y_train, Y_test = (
    pd.get_dummies(train_df['class']).values,
    pd.get_dummies(test_df['class']).values,
)


In [None]:
# Map each word in the GloVe file to its vector.
# Each line is "<word> <v1> <v2> ...", split on whitespace.
embeddings_index = {}
# The context manager closes the file even if parsing fails, and the explicit
# encoding avoids depending on the platform default (the GloVe text files are
# distributed as UTF-8).
with open('../input/glove6b100dtxt/glove.6B.100d.txt', encoding='utf-8') as f:
    for line in f:
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs

In [None]:
EMBEDDING_DIM = 100

# Row i holds the pre-trained vector of the word whose tokenizer index is i.
# The matrix has len(word_index) + 1 rows and starts out as all zeros.
embedding_matrix = np.zeros(shape=(len(word_index) + 1, EMBEDDING_DIM))
for word, i in word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is None:
        # No pre-trained vector for this word: its row stays all-zeros.
        continue
    embedding_matrix[i] = embedding_vector

In [None]:
def model(max_features, shape):
    """Build and compile the LSTM classifier on top of the frozen embedding matrix.

    Architecture: Embedding (initialized from ``embedding_matrix``, not
    trainable) -> Dropout -> LSTM(100) -> Dense(64, relu) -> Dropout ->
    Dense(2, softmax), compiled with categorical cross-entropy and Adam.

    Args:
        max_features: Accepted for interface compatibility but currently
            unused; the embedding layer is sized from the module-level
            ``word_index`` so that it matches the row count of
            ``embedding_matrix``.
        shape: Length of each input sequence (passed as ``input_length``).

    Returns:
        The compiled Keras ``Sequential`` model.
    """
    # A local name distinct from the function name avoids shadowing `model`
    # inside its own body.
    net = Sequential()
    net.add(Embedding(len(word_index) + 1,
                      EMBEDDING_DIM,
                      weights=[embedding_matrix],
                      input_length=shape,
                      trainable=False))
    net.add(Dropout(0.1))
    net.add(LSTM(100, dropout=0.2, recurrent_dropout=0.2))
    net.add(Dense(64, activation='relu'))
    net.add(Dropout(0.2))
    # Optional extra hidden layer, currently disabled:
    # net.add(Dense(32, activation='relu'))
    net.add(Dense(2, activation='softmax'))
    net.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    # summary() prints the table itself and returns None, so wrapping it in
    # print() only added a stray "None" line to the output.
    net.summary()
    return net

In [None]:
# Build the network; the input length is the column count of the padded
# training matrix.
# NOTE(review): this rebinds the name `model` from the builder function to its
# return value, so the builder can no longer be reached under that name
# afterwards (e.g. when this cell is run a second time).
model = model(max_words, X_train.shape[1])

In [None]:
batch_size = 64

# Train for one epoch, holding out 10% of the training data for validation.
# NOTE(review): with epochs=1 the EarlyStopping callback (patience=3)
# presumably never gets the chance to stop training -- confirm the intended
# epoch count.
history = model.fit(
    X_train,
    Y_train,
    epochs=1,
    batch_size=batch_size,
    verbose=1,
    validation_split=0.1,
    callbacks=[EarlyStopping(monitor='val_loss', patience=3, min_delta=0.0001)],
)

In [None]:
# Evaluate the trained model on the test arrays and keep the result in `res`
# (presumably the loss followed by the accuracy metric set at compile time --
# confirm against the Keras version in use).
res = model.evaluate(X_test, Y_test, verbose = 2)

In [None]:
# Run the model on a single ad-hoc string, tokenized and padded the same way
# as the training data.
# NOTE(review): the string is not passed through clean_text_data first, unlike
# the reviews used for training -- confirm whether that is intended.
model.predict(tokenize_data(pd.Series(["Amazing"]), max_len))