mount and imports

In [0]:
from google.colab import drive
drive.mount('/content/drive')

In [0]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.pyplot import figure
import re
import pprint
import gensim
import logging
import pickle
# SKLEARN
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
# NLTK
import nltk
from nltk.stem.snowball import SnowballStemmer
from nltk.corpus import stopwords
nltk.download('stopwords')
# KERAS
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Activation,Dense,Dropout,Embedding,GRU,LSTM
from keras import utils
from keras.callbacks import ReduceLROnPlateau, EarlyStopping
from keras.models import load_model,model_from_json

Content
* load data
* preprocess
* tokenize
* padding
* lstm
* predict

# DATA: AMAZON FINE FOOD
active

In [0]:
import sqlite3

In [0]:
!pip install -q kaggle

In [0]:
# The Kaggle API client expects this file to be in ~/.kaggle,
# so move it there.
!mkdir -p ~/.kaggle
!cp '/content/drive/My Drive/Colab Notebooks/kaggle.json' ~/.kaggle/

# This permissions change avoids a warning on Kaggle tool startup.
!chmod 600 ~/.kaggle/kaggle.json

In [0]:
!kaggle datasets download -d snap/amazon-fine-food-reviews --unzip

In [0]:
# Load the whole Reviews table from the SQLite file into a DataFrame.
con = sqlite3.connect('/content/database.sqlite')
try:
    df = pd.read_sql_query("SELECT * FROM Reviews", con)
finally:
    # Release the database file handle even if the query fails.
    con.close()

In [0]:
# Drop identifier/timestamp columns that the rest of the notebook does not use.
# Re-assigning with errors='ignore' (instead of inplace=True) keeps the cell
# idempotent: running it a second time no longer raises KeyError.
df = df.drop(columns=['Id','ProductId','UserId','ProfileName','Time'], errors='ignore')
df.head()

In [0]:
# Label a review 'useful' when the helpful-vote ratio is above 0.7, else 'useless'.
# NOTE(review): if HelpfulnessDenominator can be 0, a 0/0 ratio is NaN and
# NaN > 0.7 is False, so such rows are labelled 'useless' — confirm this is intended.
df['Usefulness'] = (df['HelpfulnessNumerator'] / df['HelpfulnessDenominator']).apply(lambda x: 'useful' if x>0.7 else 'useless')

In [0]:
def _score_to_sentiment(score):
    """Map a score to a label: above 3 is positive, below 3 is negative, anything else neutral."""
    if score > 3:
        return 'positive'
    if score < 3:
        return 'negative'
    return 'neutral'

df['sentiment'] = df['Score'].apply(_score_to_sentiment)

In [0]:
df.head()

In [0]:
# Rename all columns to short lowercase names.
# The assignment is positional: the i-th name replaces the i-th existing column,
# so it relies on the frame having exactly these seven columns in this order
# (the columns left after the drop above, then 'Usefulness' and 'sentiment').
df.columns = ['upvote','totalvote','score','summary','content','usefulness','sentiment']

In [0]:
df

In [0]:
def describe(amzn):
    """Pretty-print per-sentiment statistics of a review frame.

    params: amzn - DataFrame with a 'sentiment' column and a string 'content' column.
    returns: None. Prints a list with one dict per distinct sentiment holding
             'feeling' (the label), 'count' (number of rows with that label) and
             'mean_len' (mean number of whitespace-separated words in 'content').
    """
    summary = []
    for label in amzn.sentiment.unique():
        # Select the review texts of this label once and reuse them for both stats.
        reviews = amzn.loc[amzn['sentiment'] == label, 'content']
        word_counts = [len(review.split()) for review in reviews]
        summary.append({
            'feeling': label,
            'count': len(reviews),
            'mean_len': np.mean(word_counts),
        })
    pprint.PrettyPrinter(indent=4).pprint(summary)
describe(df)

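Since the dataset is heavily biased towards positive reviews, let's randomly down-sample the positive class to 82037 reviews so that it matches the 82037 negative reviews; the negative and the 42640 neutral reviews are all kept.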

In [0]:
# Randomly keep 82037 positive reviews. A fixed random_state makes the balanced
# subset (and everything trained on it) reproducible across kernel restarts.
pos_rev = df[df['sentiment']=='positive'].sample(n=82037, axis=0, random_state=42)

In [0]:
# Keep every review that is not positive (the positive rows are handled separately).
amzn = df[df.sentiment != 'positive']

In [0]:
# Add the sampled positive reviews back to the non-positive rows.
# DataFrame.append was removed in pandas 2.0; pd.concat is the supported equivalent
# and, like append, keeps the original index labels of both frames.
amzn = pd.concat([amzn, pos_rev])

In [0]:
describe(amzn)

voila!

# Preprocessing
Skip this section for amzn if the preprocessed *csv* has already been saved; it can be loaded directly in the **Load AMZN** step below.

In [0]:
stopword = stopwords.words('english')

In [0]:
print(stopword)

In [0]:
# Compiled once instead of on every call. Written as a raw string so that \S and
# \W are regex escapes rather than (invalid) Python string escapes; the pattern
# text itself is unchanged.
_CLEAN_NOISE_RE = re.compile(r'@\S+|https?:\S+|http?:\S|\W+')


def clean(text):
    """Remove links, user mentions, special characters and stopwords, then stem.

    params: text - any value; it is converted with str() and lower-cased first.
    returns: string of the remaining stemmed tokens joined by single spaces
             (an empty string when nothing is left).
    """
    stemmer = SnowballStemmer(language='english')
    # Set membership is O(1) per token; `stopword` itself is a list.
    stop_set = set(stopword)
    text = _CLEAN_NOISE_RE.sub(' ', str(text).lower()).strip()
    return " ".join(stemmer.stem(token) for token in text.split() if token not in stop_set)

In [0]:
%%time
amzn.content = amzn.content.apply(clean)  # clean() receives one review text at a time

In [0]:
amzn.head()

saving amzn

In [0]:
# index=False: without it the row index is written as an extra unnamed column,
# which then reappears as 'Unnamed: 0' when the file is read back with read_csv.
amzn.to_csv('/content/drive/My Drive/Colab Notebooks/AI_LAB_PROJECT/Data/amzn.csv', index=False)

**Load AMZN** \
it is already processed

In [0]:
amzn = pd.read_csv('/content/drive/My Drive/Colab Notebooks/AI_LAB_PROJECT/Data/amzn.csv')
# read_csv parses empty fields as NaN, so a review whose cleaned text is empty
# would come back as a float NaN instead of a string. Restore those to '' so the
# 'content' column holds only text for the tokenizer below.
amzn['content'] = amzn['content'].fillna('')
amzn.head()

train test split

In [0]:
# Hold out 20% of the rows for testing. A fixed random_state keeps the split
# identical across runs so reported accuracies are comparable.
train,test = train_test_split(amzn,test_size=0.2,random_state=42)

# Keras

In [0]:
SENT_LEN = 100

Tokenization and Padding

In [0]:
# Build the word index from the 'content' column of the full amzn frame
# (i.e. the rows of both `train` and `test`, which were split from it).
tk = Tokenizer(num_words=50000)
tk.fit_on_texts(amzn['content'])

In [0]:
# Number of distinct words seen by the tokenizer, +1 for the reserved index 0.
vocab_size = len(tk.word_index) + 1
# The tokenizer only emits indices below tk.num_words when num_words is set, so
# the embedding table never needs more rows than that; sizing it to the full
# vocabulary would allocate rows that can never be looked up.
MAX_WORDS = min(vocab_size, tk.num_words) if tk.num_words else vocab_size
print(vocab_size, MAX_WORDS)

In [0]:
# Convert each training review to word indices, then pad at the end
# (padding='post') to SENT_LEN entries per row.
X_train = pad_sequences(
    tk.texts_to_sequences(train.content),
    maxlen=SENT_LEN,
    padding='post',
)

In [0]:
X_train.shape

In [0]:
X_test = tk.texts_to_sequences(test.content)
# Pad on the same side as X_train. The pad_sequences default is padding='pre',
# which would feed the model test rows laid out differently from the training rows.
X_test = pad_sequences(X_test,maxlen=SENT_LEN,padding='post')

One Hot Encoding: Sentiment

In [0]:
def _as_column(labels):
    """Return the labels as a 2-D array with a single column (shape (n, 1))."""
    return np.array(labels).reshape(-1, 1)

y = _as_column(amzn.sentiment)
y_train = _as_column(train.sentiment)
y_test = _as_column(test.sentiment)

In [0]:
# One-hot encode the sentiment labels.
# The encoder is fitted on the labels of the full frame (`y`) and then applied to
# the train and test label columns, so both share the same category order.
# NOTE(review): OneHotEncoder presumably returns a sparse matrix by default —
# confirm the downstream Keras calls accept that, or densify with .toarray().
onehotenc = OneHotEncoder(categories="auto",handle_unknown='ignore')
onehotenc.fit(y)
y_train = onehotenc.transform(y_train)
y_test = onehotenc.transform(y_test)

In [0]:
# Persist the fitted encoder (pickle protocol 2) next to the other artefacts.
ohe_path = '/content/drive/My Drive/Colab Notebooks/AI_LAB_PROJECT/Data/ohe.pickle'
with open(ohe_path, 'wb') as ohe_file:
    pickle.dump(onehotenc, ohe_file, protocol=2)

In [0]:
print(X_train.shape,y_train.shape)
print(X_test.shape,y_test.shape)

 defining model

In [0]:
# Embedding -> LSTM -> softmax classifier over the one-hot sentiment columns.
model = Sequential([
    # Flagged "# PROBLEM" by the original author.
    Embedding(MAX_WORDS, 100, input_length=X_train.shape[1]),
    LSTM(100, dropout=0.2, recurrent_dropout=0.2),
    # One output unit per one-hot column of the targets.
    Dense(y_test.shape[1], activation='softmax'),
])
model.compile(
    loss='categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)
model.summary()

In [0]:
# Training callbacks: lower the learning rate when val_loss plateaus, and stop
# early when val_acc stops improving by at least 1e-4.
# NOTE(review): both use patience=5 while the fit call below trains for epochs=5,
# so neither callback appears able to trigger within a run — confirm intent.
# NOTE(review): 'val_acc' must match the metric key the installed Keras logs
# (some versions log 'val_accuracy' instead) — verify.
callbacks = [ ReduceLROnPlateau(monitor='val_loss', patience=5, cooldown=0),
              EarlyStopping(monitor='val_acc', min_delta=1e-4, patience=5)]

fit

In [0]:
%%time
# Train for 5 epochs; the test split doubles as the validation set.
lstm_3 = model.fit(
    X_train,
    y_train,
    batch_size=128,
    epochs=5,
    validation_data=(X_test, y_test),
    verbose=1,
    callbacks=callbacks,
)

saving trained model

In [0]:
model.save('/content/drive/My Drive/Colab Notebooks/AI_LAB_PROJECT/Data/lstm_3_amzn.h5')

In [0]:
# The accuracy metric is logged as 'acc'/'val_acc' by older Keras releases and as
# 'accuracy'/'val_accuracy' by newer ones; use whichever key this run produced.
acc_key = 'acc' if 'acc' in lstm_3.history else 'accuracy'
plt.title('Accuracy')
plt.plot(lstm_3.history[acc_key], label='train')
plt.plot(lstm_3.history['val_' + acc_key], label='test')
plt.xlabel('epoch')
plt.ylabel('accuracy')
plt.legend()
plt.show()

In [0]:
def predict(text):
    """Predict the sentiment of a single text with the trained model.

    params: text - string to classify.
    returns: tuple (pr, f_label, s_label)
        pr      = model output for the one-row batch, ordered like
                  onehotenc.categories_[0] ([negative, neutral, positive])
        f_label = most probable sentiment label
        s_label = second most probable sentiment label
    """
    # NOTE(review): the training texts were passed through clean() before
    # tokenizing, while `text` is used as-is here — confirm whether the input
    # should be cleaned the same way.
    x_test = tk.texts_to_sequences([text])
    # Pad on the same side as the training data (X_train uses padding='post').
    x_test = pad_sequences(x_test, maxlen=SENT_LEN, padding='post')
    # Predict
    pr = model.predict(x_test)
    # Rank the classes of the single row from most to least probable and take
    # the top two. (s_label was previously never assigned, so the function
    # raised NameError on every call.)
    ranking = np.argsort(pr[0])[::-1]
    labels = onehotenc.categories_[0]
    f_label = labels[ranking[0]]
    s_label = labels[ranking[1]]
    return (pr,f_label,s_label)

predicting samples

In [0]:
p = "Every once in a while, especially after a longer cook time with liquids still present, some leaking will happen from bag into the slow cooker. But at least it is minimal and cleanup is still super easy!"
n = "These were horrible. bought them for my crockpot. every time i used them (on low heat) they would burn onto the crockpot and it would rip as i was pulling it out. complete mess! do not buy these"
print(predict(p))
print(predict(n))

exporting lstm `model` as HDF5 file, `tokenizer` as .pickle, `onehotenc` as .pickle file

In [0]:
import pickle
# Persist the fitted tokenizer (pickle protocol 2) so it can be reused for inference.
tokenizer_path = '/content/drive/My Drive/Colab Notebooks/AI_LAB_PROJECT/Data/tokenizer.pickle'
with open(tokenizer_path, 'wb') as tokenizer_file:
    pickle.dump(tk, tokenizer_file, protocol=2)

# Appendix

save weight + architecture

```
model.save('my_model.h5')  # creates a HDF5 file 'my_model.h5'
del model  # deletes the existing model

# returns a compiled model
# identical to the previous one
model = load_model('my_model.h5')
```

Save only architecture
```
# save as JSON
json_string = model.to_json()

# save as YAML
yaml_string = model.to_yaml()
```
Save JSON 
```
import json
my_details = {
    'name': 'John Doe',
    'age': 29
}
with open('personal.json', 'w') as json_file:
    json.dump(my_details, json_file)
```
Load
```
# model reconstruction from JSON:
from keras.models import model_from_json
model = model_from_json(json_string)

# model reconstruction from YAML:
from keras.models import model_from_yaml
model = model_from_yaml(yaml_string)
```