# Preprocessing Data



In [53]:
!pip install tensorflow-gpu==2.0.0-beta1
import pandas as pd
import numpy as np




In [0]:
train_data = pd.read_csv("train.tsv", sep="\t")
test_data = pd.read_csv("test.tsv", sep="\t")

In [55]:
print(train_data.head())

   PhraseId  ...  Sentiment
0         1  ...          1
1         2  ...          2
2         3  ...          2
3         4  ...          2
4         5  ...          2

[5 rows x 4 columns]


In [56]:
print(test_data.head())

   PhraseId  SentenceId                                             Phrase
0    156061        8545  An intermittently pleasing but mostly routine ...
1    156062        8545  An intermittently pleasing but mostly routine ...
2    156063        8545                                                 An
3    156064        8545  intermittently pleasing but mostly routine effort
4    156065        8545         intermittently pleasing but mostly routine


In [57]:
print(train_data.shape, test_data.shape)

(156060, 4) (66292, 3)


In [58]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
import random
import pickle
from tqdm import tqdm
import re
nltk.download("popular")
nltk.download("punkt")
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))
stemmer = SnowballStemmer('english')

[nltk_data] Downloading collection 'popular'
[nltk_data]    | 
[nltk_data]    | Downloading package cmudict to /root/nltk_data...
[nltk_data]    |   Package cmudict is already up-to-date!
[nltk_data]    | Downloading package gazetteers to /root/nltk_data...
[nltk_data]    |   Package gazetteers is already up-to-date!
[nltk_data]    | Downloading package genesis to /root/nltk_data...
[nltk_data]    |   Package genesis is already up-to-date!
[nltk_data]    | Downloading package gutenberg to /root/nltk_data...
[nltk_data]    |   Package gutenberg is already up-to-date!
[nltk_data]    | Downloading package inaugural to /root/nltk_data...
[nltk_data]    |   Package inaugural is already up-to-date!
[nltk_data]    | Downloading package movie_reviews to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Package movie_reviews is already up-to-date!
[nltk_data]    | Downloading package names to /root/nltk_data...
[nltk_data]    |   Package names is already up-to-date!
[nltk_data]    | Do

In [59]:
print(stop_words)

{'them', 'above', 'weren', "won't", 'i', 'him', 'few', 'our', 'this', 'before', 'further', 'any', "couldn't", 'because', 'other', 'yourselves', 'its', "isn't", 'again', 'and', 'do', 'then', 'nor', 'here', 'y', 'or', 'it', 'these', 'at', "should've", 'did', 'ourselves', 'down', 'should', "you've", 'were', 'had', "needn't", 'he', 'in', "haven't", "wasn't", 'between', "didn't", 'herself', 'myself', 've', "mustn't", 's', "mightn't", 'me', 'own', 'll', 'being', 'her', 'each', "doesn't", 'against', 'o', 'about', 'wasn', 'ain', 'up', 'will', 'that', 'you', 'd', 'under', 'after', 'below', 'so', 'don', 'a', 'than', 're', 'what', 'there', 'is', 'she', 'off', 'am', 'hadn', 'been', 'through', 'my', 'm', 'aren', 'the', 'more', 'himself', "that'll", 'only', 'now', 'which', 'who', 'didn', "it's", "you're", 'be', 'on', "aren't", 'isn', 'by', 'same', 'until', "you'll", 'for', 'just', 'won', 'over', 'are', 'from', 'but', 'out', 'all', 'during', 'can', 'very', 'with', 'they', 'having', 'as', "shan't", 'w

In [0]:
def process_sentences(df):
    
    sentences = []
    
    for sent in tqdm(df['Phrase']):
        
        #remove non alphanumeric characters
        
        replaced = re.sub(r'[^a-zA-z0-9\s]','', sent.lower())
        
        #tokenize words
        words = word_tokenize(replaced)

        #remove common words
        #filtered_sentence = [w for w in words if w not in stop_words]
        
        #lemmatize words
        #lexicon = filtered_sentence
        lexicon = [lemmatizer.lemmatize(i) for i in words]
        
        sentences.append(lexicon)
        
    return sentences



In [61]:
train_sentences = process_sentences(train_data)

test_sentences = process_sentences(test_data)


100%|██████████| 156060/156060 [00:20<00:00, 7535.52it/s]
100%|██████████| 66292/66292 [00:08<00:00, 7928.17it/s]


In [62]:
print(train_sentences[0])
print(test_sentences[0])

['a', 'series', 'of', 'escapade', 'demonstrating', 'the', 'adage', 'that', 'what', 'is', 'good', 'for', 'the', 'goose', 'is', 'also', 'good', 'for', 'the', 'gander', 'some', 'of', 'which', 'occasionally', 'amuses', 'but', 'none', 'of', 'which', 'amount', 'to', 'much', 'of', 'a', 'story']
['an', 'intermittently', 'pleasing', 'but', 'mostly', 'routine', 'effort']


In [63]:
import tensorflow as tf
from tensorflow.keras.utils import to_categorical
print(tf.__version__)

train_y = train_data['Sentiment'].values
print(train_y)

num_classes = max(train_y) + 1
print(num_classes)

2.0.0-beta1
[1 2 2 ... 3 2 2]
5


In [64]:
max_sentence_length = max([len(i) for i in train_sentences])
print(max_sentence_length)

48


In [0]:
from tensorflow.keras.preprocessing import sequence
from tensorflow.keras.layers import Dense,Dropout,Embedding,LSTM, Bidirectional, BatchNormalization
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.losses import categorical_crossentropy
from tensorflow.keras.optimizers import Adam, SGD
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.text import Tokenizer
import os

In [0]:
tokenizer = Tokenizer()
tokenizer.fit_on_texts(train_sentences)

In [67]:
train_set = tokenizer.texts_to_sequences(train_sentences)
test_set = tokenizer.texts_to_sequences(test_sentences)

train_set = sequence.pad_sequences(train_set, maxlen=max_sentence_length)
test_set = sequence.pad_sequences(test_set, maxlen=max_sentence_length)

num_words = len(tokenizer.word_index) + 1

print(len(train_set), len(test_set))

156060 66292


In [0]:
def split_train_test(train_set, classification, test_size = 0.2):
    train = []
    for example, sentiment in tqdm(zip(train_set, classification)):
        train.append([(example), (sentiment)])
    split = int(test_size*len(train))
    train_set = np.array(train[split:])
    val_set = np.array(train[:split])
    X_train = list(train_set[:,0])
    y_train = list(train_set[:,1])
    X_val = list(val_set[:,0])
    y_val = list(val_set[:,1])
    return np.array(X_train), np.array(X_val), np.array(y_train), np.array(y_val)    
        

In [69]:
X_train, X_val, y_train, y_val = split_train_test(train_set, train_y, test_size=0.05)

156060it [00:00, 373593.93it/s]


In [70]:
print(X_train.shape, y_train.shape, X_val.shape, y_val.shape)

(148257, 48) (148257,) (7803, 48) (7803,)


In [0]:
with open('sentiment_setedited.pickle','wb') as f:
    pickle.dump([X_train,X_val,y_train,y_val],f)

# Train Model

In [72]:
#!pip install -q tf-nightly-gpu-2.0-preview

%load_ext tensorboard
import time

early_stopping = EarlyStopping(min_delta = 0.001, mode = 'max', monitor='val_accuracy', patience = 2)

The tensorboard extension is already loaded. To reload it, use:
  %reload_ext tensorboard


In [73]:
lstm_layer = 1
dense_layer = 2


log_dir = f"logs/fit/{dense_layer}-dense-{lstm_layer}-lstm-{int(time.time())}"
tensorboard = tf.keras.callbacks.TensorBoard(log_dir=log_dir, histogram_freq=1)


model = Sequential()
model.add(Embedding(num_words, 100, input_length=max_sentence_length))
model.add(LSTM(128, dropout = 0.2, recurrent_dropout=0.2, return_sequences=False))
model.add(Dense(32, activation="relu"))
model.add(Dropout(0.2))
model.add(Dense(num_classes, activation="softmax"))

model.compile(loss='sparse_categorical_crossentropy',
            optimizer=Adam(lr=0.0001),
            metrics=['accuracy'])

model.summary()

Model: "sequential_3"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_3 (Embedding)      (None, 48, 100)           1506200   
_________________________________________________________________
lstm_4 (LSTM)                (None, 128)               117248    
_________________________________________________________________
dense_6 (Dense)              (None, 32)                4128      
_________________________________________________________________
dropout_1 (Dropout)          (None, 32)                0         
_________________________________________________________________
dense_7 (Dense)              (None, 5)                 165       
Total params: 1,627,741
Trainable params: 1,627,741
Non-trainable params: 0
_________________________________________________________________


In [74]:
history = model.fit(X_train, y_train, epochs=15, batch_size=256,
                    validation_data=(X_val, y_val), callbacks=[ tensorboard])


Train on 148257 samples, validate on 7803 samples
Epoch 1/15
   256/148257 [..............................] - ETA: 4:16 - loss: 1.6167 - accuracy: 0.0742

W0811 01:10:14.251332 139750283151232 callbacks.py:241] Method (on_train_batch_end) is slow compared to the batch update (0.301438). Check your callbacks.


Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


In [75]:
%tensorboard --logdir logs/fit

In [76]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [0]:
import json
import io
model.save('/content/drive/My Drive/ML/sentimentmodel4.h5')
tokenizer_json = tokenizer.to_json()

with io.open(f'tokenizer{time.time()}.json', 'w', encoding='utf-8') as f:
    f.write(json.dumps(tokenizer_json, ensure_ascii=False))

# Make Predictions

In [0]:
predictions = model.predict_classes(test_set)

In [79]:
#remove non alphanumeric characters
s = "I hate this movie. there was no plot and no character development. It sucked SHIT"
replaced = re.sub(r'[^a-zA-z0-9\s]','', s.lower())

#tokenize words
words = word_tokenize(replaced)

#remove common words
#filtered_sentence = [w for w in words if not w in stop_words]

#lemmatize words
#lexicon = filtered_sentence
lexicon = [lemmatizer.lemmatize(i) for i in words]
t = tokenizer.texts_to_sequences([lexicon])
p =  sequence.pad_sequences(t, maxlen=max_sentence_length)
print(p)

pr = model.predict_classes(p)
print(pr)

[[   0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0   61  955   16   14   83   87   59  111
     4   59   37 1210    6 4086]]
[0]


In [80]:
print(s)

I hate this movie. there was no plot and no character development. It sucked SHIT


In [0]:
submission = pd.read_csv('sampleSubmission.csv',sep=',')
submission.Sentiment = predictions
submission.to_csv('mySubmission1.csv', index=False)

In [0]:
with open('tokenizer1.pickle', 'wb') as handle:
    pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)