In [23]:
import pandas as pd
import os
import numpy as np
from sklearn.model_selection import train_test_split

import gensim
from gensim.models.word2vec import Word2Vec
import gensim.downloader as api

from keras.utils.np_utils import to_categorical
import matplotlib.pyplot as plt 
import keras
from time import time

In [24]:
df = pd.read_csv("data/preprocessed_train.csv")
df

Unnamed: 0,textID,text,selected_text,sentiment
0,cb774db0d1,"['id', 'have', 'responded', 'if', 'i', 'were',...",id have responded if i were going,neutral
1,549e992a42,"['soon', 'sad', 'i', 'will', 'miss', 'you', 'h...",sooo sad,negative
2,088c60f138,"['my', 'boss', 'is', 'bullying', 'me']",bullying me,negative
3,9642c003ef,"['what', 'interview', 'leave', 'me', 'alone']",leave me alone,negative
4,358bd9e861,"['sons', 'of', 'why', 'couldnt', 'they', 'put'...",sons of,negative
...,...,...,...,...
27476,4eac33d1c0,"['wish', 'we', 'could', 'come', 'see', 'u', 'o...",d lost,negative
27477,4f4c4fc327,"['ive', 'wondered', 'about', 'rake', 'to', 'th...",dont force,negative
27478,f67aae2310,"['yay', 'good', 'for', 'both', 'of', 'you', 'e...",yay good for both of you,positive
27479,ed167662a5,"['but', 'it', 'was', 'worth', 'it']",but it was worth it,positive


In [25]:
# for case 2 run this code (case 2 = selected text)
case = "case2-cnn"

#read data
df = pd.read_csv("data/preprocessed_train.csv")
df.text = df.selected_text.map(lambda x:str(x))
df.sentiment = df.sentiment.astype("category")
df.sentiment = df.sentiment.cat.codes

In [26]:
# train, val, test split
x_train, xtest, y_train, ytest = train_test_split(df.text.values, df.sentiment.values,stratify=df.sentiment.values, test_size=0.3,random_state=1)
y_train = to_categorical(y_train)
x_val = xtest[0:4122]
y_val = to_categorical(ytest[0:4122])
x_test = xtest[4122:]
y_test = ytest[4122:]


#tokenizing and padding
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

tokenizer = Tokenizer(num_words=5000)
tokenizer.fit_on_texts(df.text.values)

X_train = tokenizer.texts_to_sequences(x_train)
X_val = tokenizer.texts_to_sequences(x_val)
X_test = tokenizer.texts_to_sequences(x_test)

vocab_size = len(tokenizer.word_index) + 1

maxlen = 100

X_train = pad_sequences(X_train, padding='post', maxlen=maxlen)
X_val = pad_sequences(X_val, padding='post', maxlen=maxlen)
X_test = pad_sequences(X_test, padding='post', maxlen=maxlen)

word_index = tokenizer.word_index

In [27]:
from keras.models import Sequential
from keras import regularizers
from keras.layers.core import Dense, Dropout, Flatten
from keras.layers.convolutional import Conv1D
from keras import layers
from sklearn.metrics import accuracy_score, f1_score

In [28]:
#activation = ["selu", "elu", "sigmoid", "tanh"]
#optimizer = ["adam", "SGD", "RMSprop", "Adadelta"]
#epochs = [5,10,15,20]
#batchsize = [8,16,32,64,128]
activation = ["elu"]
optimizer = ['RMSprop']
epochs = [5]
batchsize = [8]

In [29]:
#using keras embedding

def cnn_keras(activation,optimizer,epochs,batchsize):
  embedding_dim = 100

  model = Sequential()
  model.add(layers.Embedding(input_dim=vocab_size, 
                          output_dim=embedding_dim,
                          input_length=maxlen))

  model.add(Conv1D(32, kernel_size=2, activation=activation))
  model.add(layers.MaxPooling1D(2))
  model.add(Conv1D(32, kernel_size=2, activation=activation))
  model.add(Dropout(0.25))
  model.add(Conv1D(16, kernel_size=2, activation=activation))
  model.add(layers.MaxPooling1D(2))
  model.add(Conv1D(16, kernel_size=2, activation=activation))
  model.add(Dropout(0.25))

  model.add(Flatten())

  model.add(Dense(64, activation='tanh'))
  model.add(Dropout(0.5))

  model.add(Dense(3, activation='softmax'))
  model.compile(optimizer=optimizer,
                loss='categorical_crossentropy',
                metrics=['accuracy'])
  history = model.fit(X_train, y_train,
                    epochs=epochs,
                    verbose=0,
                    validation_data=(X_val, y_val),
                    batch_size=batchsize)
  
  return history, model

In [30]:
# Experimenting using Keras Embeddings

sel_activation_final = 'elu'
sel_optimizer_final = 'RMSprop'
sel_epoch_final = 5
sel_batch_final = 8

# Training the Model
t0 = time()
history, model = cnn_keras(sel_activation_final,sel_optimizer_final,sel_epoch_final,sel_batch_final)
pred = np.argmax(model.predict(X_test), axis=-1)

print("test accuracy score = ",accuracy_score(y_pred=pred, y_true=y_test))
print("test f1 score = ",f1_score(y_pred=pred, y_true=y_test, average="weighted"))

t1 = time()
print("time taken is ", t1-t0)

test accuracy score =  0.8391947610962891
test f1 score =  0.8394404075174307
time taken is  99.58482360839844


In [31]:
inp = ["this is a happy tweet", "sad tweet"]
X_test1 = tokenizer.texts_to_sequences(inp)
X_test1 = pad_sequences(X_test1, padding='post', maxlen = maxlen)
print(np.argmax(model.predict(X_test1), axis = -1))

[2 0]
