In [38]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [39]:
import numpy as np 
import pandas as pd 
import bz2
import gc
import re
import os
import csv
import re
import warnings
warnings.filterwarnings("ignore")
import nltk
nltk.download('punkt')
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [40]:
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from gensim.models import Word2Vec
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from gensim.models import KeyedVectors
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.model_selection import cross_val_score
from sklearn import preprocessing
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences 
from keras.models import Sequential
from keras.layers import Dense, Embedding, LSTM, SimpleRNN
from keras.layers import Dense, Dropout, Activation
from keras.layers import Conv1D, MaxPooling1D, Flatten
from keras.utils.np_utils import to_categorical
from keras.models import Model
from keras.engine import Input

### Reading the data

In [41]:
data = pd.read_csv('/content/drive/My Drive/sa_data.csv')

In [42]:
data.head()

Unnamed: 0,text,sentiment
0,RT @NancyLeeGrahn: How did everyone feel about...,Neutral
1,RT @ScottWalker: Didn't catch the full #GOPdeb...,Positive
2,RT @TJMShow: No mention of Tamir Rice and the ...,Neutral
3,RT @RobGeorge: That Carly Fiorina is trending ...,Positive
4,RT @DanScavino: #GOPDebate w/ @realDonaldTrump...,Positive


### Cleaning the data

In [43]:
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')
#Preprocessing - removing unwanted characters, tokenization, stop-word removal
def clean_data(tweet):
    x = re.sub("[^a-zA-Z0-9]", " ",tweet) 
    x = x.lower().split()                             
    stops = set(stopwords.words("english"))                  
    words = [w for w in x if not w in stops] 
    return( " ".join(words))

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [44]:
data['text'] = data['text'].apply(lambda x: clean_data(x))
print(data.head())

                                                text sentiment
0  rt nancyleegrahn everyone feel climate change ...   Neutral
1  rt scottwalker catch full gopdebate last night...  Positive
2  rt tjmshow mention tamir rice gopdebate held c...   Neutral
3  rt robgeorge carly fiorina trending hours deba...  Positive
4  rt danscavino gopdebate w realdonaldtrump deli...  Positive


In [45]:
for i in range(len(data)):
    data['text'][i] = data['text'][i].replace('rt ','')

print(data.head())

                                                text sentiment
0  nancyleegrahn everyone feel climate change que...   Neutral
1  scottwalker catch full gopdebate last night sc...  Positive
2  tjmshow mention tamir rice gopdebate held clev...   Neutral
3  robgeorge carly fiorina trending hours debate ...  Positive
4  danscavino gopdebate w realdonaldtrump deliver...  Positive


In [46]:
#Converting the dataframe into list
reviews = data['text'].tolist()
sentiment = data['sentiment'].tolist()

### Preparing the Corpus

In [47]:
corpus = []
for i in range(len(reviews)):
    corpus.append(word_tokenize(reviews[i]))

### Tokenization

In [48]:
#Tokenization
max_features = 1200 
tokenizer = Tokenizer(nb_words=max_features, split=' ')
tokenizer.fit_on_texts(data['text'].values)

In [49]:
X = tokenizer.texts_to_sequences(data['text'].values)
X = pad_sequences(X, padding = 'post') #Zero padding at the end of the sequence

### Label Encoding

In [50]:
le = preprocessing.LabelEncoder()
Y = le.fit_transform(sentiment)

In [51]:
Y = to_categorical(Y)
print(Y)

[[0. 1. 0.]
 [0. 0. 1.]
 [0. 1. 0.]
 ...
 [0. 0. 1.]
 [1. 0. 0.]
 [0. 0. 1.]]


### Train-test split

In [52]:
#Splitting the data into train data and test data
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2)

### Architecture 1 : CNN + Bidirectional RNN

In [102]:
from keras.layers import Bidirectional, SimpleRNN

embed_dim = 500
hidden_layer = 100

model = Sequential()
model.add(Embedding(max_features, embed_dim,input_length = X.shape[1]))
model.add(Conv1D(100,7))
model.add(Bidirectional(SimpleRNN(hidden_layer,activation='tanh')))
#model.add(Conv1D(100,5))
#model.add(SimpleRNN(hidden_layer))
model.add(Flatten())
model.add(Dense(128))
model.add(Dense(3,activation='softmax'))
model.compile(loss = 'categorical_crossentropy', optimizer='adam',metrics = ['accuracy'])
print(model.summary())

Model: "sequential_26"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_26 (Embedding)     (None, 24, 500)           600000    
_________________________________________________________________
conv1d_28 (Conv1D)           (None, 18, 100)           350100    
_________________________________________________________________
bidirectional_25 (Bidirectio (None, 200)               40200     
_________________________________________________________________
flatten_18 (Flatten)         (None, 200)               0         
_________________________________________________________________
dense_35 (Dense)             (None, 128)               25728     
_________________________________________________________________
dense_36 (Dense)             (None, 3)                 387       
Total params: 1,016,415
Trainable params: 1,016,415
Non-trainable params: 0
___________________________________________

In [103]:
batch_size = 32
model.fit(X_train, y_train, epochs = 10, batch_size=batch_size, validation_split = 0.1)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x7ff73bd6bfd0>

In [104]:
score = model.evaluate(X_test, y_test, verbose = 2, batch_size = batch_size)
print("Accuracy: %.2f" % (score[1]*100))

87/87 - 1s - loss: 1.3299 - accuracy: 0.6494
Accuracy: 64.94


### Architecture 2 : CNN

In [105]:
model2 = Sequential()
model2.add(Embedding(max_features, embed_dim,input_length = X.shape[1]))
model2.add(Conv1D(100,7))
model2.add(Flatten())
model2.add(Dense(3,activation='softmax'))
model2.compile(loss = 'categorical_crossentropy', optimizer='adam',metrics = ['accuracy'])
print(model2.summary())

Model: "sequential_27"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_27 (Embedding)     (None, 24, 500)           600000    
_________________________________________________________________
conv1d_29 (Conv1D)           (None, 18, 100)           350100    
_________________________________________________________________
flatten_19 (Flatten)         (None, 1800)              0         
_________________________________________________________________
dense_37 (Dense)             (None, 3)                 5403      
Total params: 955,503
Trainable params: 955,503
Non-trainable params: 0
_________________________________________________________________
None


In [106]:
batch_size = 32
model2.fit(X_train, y_train, epochs = 10, batch_size=batch_size, validation_split = 0.1)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x7ff73b9aeef0>

In [107]:
score = model2.evaluate(X_test, y_test, verbose = 2, batch_size = batch_size)
print("Accuracy: %.2f" % (score[1]*100))

87/87 - 1s - loss: 1.2051 - accuracy: 0.6328
Accuracy: 63.28


### Architecture 3 : Bidirectional LSTM

In [108]:
embed_dim = 500
hidden_layer = 100

model3 = Sequential()
model3.add(Embedding(max_features, embed_dim,input_length = X.shape[1]))
model3.add(Bidirectional(LSTM(hidden_layer,activation='tanh')))
model3.add(Flatten())
model3.add(Dense(64))
model3.add(Dense(3,activation='softmax'))
model3.compile(loss = 'categorical_crossentropy', optimizer='adam',metrics = ['accuracy'])
print(model3.summary())

Model: "sequential_28"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_28 (Embedding)     (None, 24, 500)           600000    
_________________________________________________________________
bidirectional_26 (Bidirectio (None, 200)               480800    
_________________________________________________________________
flatten_20 (Flatten)         (None, 200)               0         
_________________________________________________________________
dense_38 (Dense)             (None, 64)                12864     
_________________________________________________________________
dense_39 (Dense)             (None, 3)                 195       
Total params: 1,093,859
Trainable params: 1,093,859
Non-trainable params: 0
_________________________________________________________________
None


In [109]:
batch_size = 32
model3.fit(X_train, y_train, epochs = 10, batch_size=batch_size, validation_split = 0.1)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x7ff7395bc7f0>

In [110]:
score = model3.evaluate(X_test, y_test, verbose = 2, batch_size = batch_size)
print("Accuracy: %.2f" % (score[1]*100))

87/87 - 2s - loss: 1.5377 - accuracy: 0.6389
Accuracy: 63.89


# Amazon Reviews Sentiment Analysis dataset obtained from Kaggle Challenges. 
https://www.kaggle.com/bittlingmayer/amazonreviews

The dataset consists of 2 classes which correspond to 2 sentiments expressed in the reviews of the customers, based on star ratings.

### Handling .bz2 file format

In [None]:
train_file = bz2.BZ2File('/content/drive/My Drive/train.ft.txt.bz2')
test_file = bz2.BZ2File('/content/drive/My Drive/test.ft.txt.bz2')

In [None]:
type(train_file)

bz2.BZ2File

In [None]:
train_file_lines = train_file.readlines()
test_file_lines = test_file.readlines()

### Train and Test data preparation

In [None]:
train_data = [x.decode('utf-8') for x in train_file_lines]
test_data = [x.decode('utf-8') for x in test_file_lines]

In [None]:
print(len(train_data))

3600000


In [None]:
train_data[1:5]

["__label__2 The best soundtrack ever to anything.: I'm reading a lot of reviews saying that this is the best 'game soundtrack' and I figured that I'd write a review to disagree a bit. This in my opinino is Yasunori Mitsuda's ultimate masterpiece. The music is timeless and I'm been listening to it for years now and its beauty simply refuses to fade.The price tag on this is pretty staggering I must say, but if you are going to buy any cd for this much money, this is the only one that I feel would be worth every penny.\n",
 '__label__2 Amazing!: This soundtrack is my favorite music of all time, hands down. The intense sadness of "Prisoners of Fate" (which means all the more if you\'ve played the game) and the hope in "A Distant Promise" and "Girl who Stole the Star" have been an important inspiration to me personally throughout my teen years. The higher energy tracks like "Chrono Cross ~ Time\'s Scar~", "Time of the Dreamwatch", and "Chronomantique" (indefinably remeniscent of Chrono Tri

In [None]:
traindata1 = [x.replace("__label__2","1") for x in train_data]
traindata0 = [x.replace("__label__1","0") for x in traindata1]

In [None]:
len(traindata0)

3600000

In [None]:
from sklearn.utils import shuffle
traindata0 = shuffle(traindata0)

In [None]:
traindata = traindata0[0:36000]

In [None]:
len(traindata)

36000

In [None]:
traindata[1:5]

["1 The best soundtrack ever to anything.: I'm reading a lot of reviews saying that this is the best 'game soundtrack' and I figured that I'd write a review to disagree a bit. This in my opinino is Yasunori Mitsuda's ultimate masterpiece. The music is timeless and I'm been listening to it for years now and its beauty simply refuses to fade.The price tag on this is pretty staggering I must say, but if you are going to buy any cd for this much money, this is the only one that I feel would be worth every penny.\n",
 '1 Amazing!: This soundtrack is my favorite music of all time, hands down. The intense sadness of "Prisoners of Fate" (which means all the more if you\'ve played the game) and the hope in "A Distant Promise" and "Girl who Stole the Star" have been an important inspiration to me personally throughout my teen years. The higher energy tracks like "Chrono Cross ~ Time\'s Scar~", "Time of the Dreamwatch", and "Chronomantique" (indefinably remeniscent of Chrono Trigger) are all abso

In [None]:
train_labels = [x.split(" ")[0] for x in traindata]

In [None]:
len(train_labels)

36000

In [None]:
train = [re.split('^\d',x)[1] for x in traindata]

In [None]:
len(train)

36000

In [None]:
train[1:5]

[" It Works..: Didn't spontaneously combust or eat the dvds. Definitely the best deal I have found for a large capacity wallet\n",
 " coating came off: Anolon not impressed with the coating came off in no time didn't cook evenly I through this away long ago\n",
 " Same old same old: This is same old stuff everybody else talks about.You are what you think and you can do anything you set your mind to. The information in the book i can get that by wathching oprah for free and not get the smart ass comments about poor people that was in the book. Hes just another person that happened to get rich because he figured out how to scam people out of money at his semianrs.He makes it sound like people can start big business even if they are poor,so..i want to start flipping houses i live pay check to pay check am i suppose to pull the money out my ass? also, he's very arrogant when he talks about poor people.Get a persoanlity.I'm glad i bought this book used.\n",
 ' great planner with weak blades

In [None]:
# Data Prep
train_df = pd.DataFrame(train)
train_df.to_csv("train.csv", index=False, sep=' ', header=False, quoting=csv.QUOTE_NONE, quotechar="", escapechar=" ")

In [None]:
train_df.head()

Unnamed: 0,0
0,Stuning even for the non-gamer: This sound tr...
1,The best soundtrack ever to anything.: I'm re...
2,Amazing!: This soundtrack is my favorite musi...
3,Excellent Soundtrack: I truly like this sound...
4,"Remember, Pull Your Jaw Off The Floor After H..."


In [None]:
train_labels_df = pd.DataFrame(train_labels)
train_labels_df.to_csv("train_labels.csv", index=False, sep=' ', header=False, quoting=csv.QUOTE_NONE, quotechar="", escapechar=" ")

In [None]:
train_labels_df.head()

Unnamed: 0,0
0,1
1,1
2,1
3,1
4,1


In [None]:
#Preprocessing
def pre_process(review):
    words = review.lower() 
    return words                                            

In [None]:
traindf = train_df.apply(lambda x: pre_process(x))
traindf.head()

Unnamed: 0,0
0,stuning even for the non-gamer: this sound tr...
1,the best soundtrack ever to anything.: i'm re...
2,amazing!: this soundtrack is my favorite musi...
3,excellent soundtrack: i truly like this sound...
4,"remember, pull your jaw off the floor after h..."


In [None]:
review = []
for x in train:
  review.append(pre_process(x)) 

In [None]:
len(review)

36000

In [None]:
review[1:5]

[" it works..: didn't spontaneously combust or eat the dvds. definitely the best deal i have found for a large capacity wallet\n",
 " coating came off: anolon not impressed with the coating came off in no time didn't cook evenly i through this away long ago\n",
 " same old same old: this is same old stuff everybody else talks about.you are what you think and you can do anything you set your mind to. the information in the book i can get that by wathching oprah for free and not get the smart ass comments about poor people that was in the book. hes just another person that happened to get rich because he figured out how to scam people out of money at his semianrs.he makes it sound like people can start big business even if they are poor,so..i want to start flipping houses i live pay check to pay check am i suppose to pull the money out my ass? also, he's very arrogant when he talks about poor people.get a persoanlity.i'm glad i bought this book used.\n",
 ' great planner with weak blades

In [None]:
sentiment = train_labels

In [None]:
sentiment[1:5]

['1', '0', '0', '1']

### Generating Corpus

In [None]:
import nltk
nltk.download('punkt')
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [None]:
corpus = []
for i in range(len(review)):
    corpus.append(word_tokenize(review[i]))

In [None]:
len(corpus)

36000

### Fasttext for training word vectors and for Classification ( Accuracy 87.09% )

In [None]:
# Data Prep
data = pd.DataFrame(train_data[0:36000])
data.to_csv("train.txt", index=False, sep=' ', header=False, quoting=csv.QUOTE_NONE, quotechar="", escapechar=" ")

In [None]:
import fasttext

fasttext_model = fasttext.train_supervised('train.txt',label_prefix='__label__', thread=4, epoch = 10)
print(fasttext_model.labels, 'are the labels or targets the model is predicting')

['__label__2', '__label__1'] are the labels or targets the model is predicting


In [None]:
test = test_data[0:36000]

# To run the predict function, we need to remove the __label__1 and __label__2 from the testset.  
new = [w.replace('__label__2 ', '') for w in test]
new = [w.replace('__label__1 ', '') for w in new]
new = [w.replace('\n', '') for w in new]

In [None]:

# Use the predict function 
pred = fasttext_model.predict(new)

In [None]:
# Lets recode the actual targets to 1's and 0's from both the test set and the actual predictions  
labels = [0 if x.split(' ')[0] == '__label__1' else 1 for x in test]
pred_labels = [0 if x == ['__label__1'] else 1 for x in pred[0]]

In [None]:
from sklearn.metrics import roc_auc_score

# run the accuracy measure. 
print(roc_auc_score(labels, pred_labels))

0.8709266736771692


### Word2Vec for training word vectors and Naive Bayes for Classification ( Accuracy 69.31%)

In [None]:
#Creating word embedding for the words. Embedding dimension = 100
model = Word2Vec(corpus, size=100, window=5, min_count=1)

In [None]:
#Voabulary list
model.wv.vocab

{'do': <gensim.models.keyedvectors.Vocab at 0x7f59be9f2cc0>,
 'not': <gensim.models.keyedvectors.Vocab at 0x7f59c6b91748>,
 'buy': <gensim.models.keyedvectors.Vocab at 0x7f59be99b9e8>,
 'this': <gensim.models.keyedvectors.Vocab at 0x7f59be99ba20>,
 'game': <gensim.models.keyedvectors.Vocab at 0x7f59be99ba58>,
 ':': <gensim.models.keyedvectors.Vocab at 0x7f59be99ba90>,
 'one': <gensim.models.keyedvectors.Vocab at 0x7f59be99bac8>,
 'of': <gensim.models.keyedvectors.Vocab at 0x7f59be99bb00>,
 'the': <gensim.models.keyedvectors.Vocab at 0x7f59be99bb38>,
 'worst': <gensim.models.keyedvectors.Vocab at 0x7f59be99bb70>,
 'games': <gensim.models.keyedvectors.Vocab at 0x7f59be99bba8>,
 'i': <gensim.models.keyedvectors.Vocab at 0x7f59be99bbe0>,
 'have': <gensim.models.keyedvectors.Vocab at 0x7f59be99bc18>,
 'ever': <gensim.models.keyedvectors.Vocab at 0x7f59be99bc50>,
 'played': <gensim.models.keyedvectors.Vocab at 0x7f59be99bc88>,
 '.': <gensim.models.keyedvectors.Vocab at 0x7f59be99bcc0>,
 'pla

In [None]:
model.wv["hopeful"]

array([-7.26336688e-02, -1.89959735e-01,  1.13285154e-01,  2.27760747e-01,
       -4.12630290e-01, -4.38349613e-04, -2.46337019e-02, -4.15330768e-01,
       -2.88858384e-01,  2.03475982e-01,  8.64369199e-02,  2.05347374e-01,
       -3.62314209e-02,  1.56122059e-01,  3.32149118e-01, -2.32776642e-01,
       -9.83110294e-02, -2.29135975e-01,  7.96781853e-02,  3.24289054e-01,
        1.15881786e-01,  2.55118459e-01,  1.37984350e-01, -1.53782964e-01,
       -2.35036481e-02, -2.05833331e-01,  5.03915608e-01,  2.33158559e-01,
        2.07676552e-02,  2.33417541e-01,  1.36494726e-01, -8.86520669e-02,
        1.09996283e-02,  2.97828466e-01, -1.14948608e-01,  5.03463931e-02,
       -9.77805704e-02, -2.18057543e-01,  1.60088852e-01,  8.90823901e-02,
       -7.01189339e-02, -6.62002489e-02,  2.40424514e-01,  1.83951645e-03,
       -1.28228188e-01,  1.20012492e-01,  2.47914605e-02, -3.03318143e-01,
        1.10787630e-01, -1.41838240e-02, -1.15367875e-01,  8.45850110e-02,
        3.87955364e-03,  

In [None]:
model.wv.most_similar("hopeful")

[('ommission', 0.813042938709259),
 ('reluctant', 0.8104925751686096),
 ('juiced', 0.8035626411437988),
 ('celeriac', 0.8022887706756592),
 ('allergic', 0.7970759868621826),
 ('troublesome', 0.7958637475967407),
 ('comp', 0.7956607341766357),
 ('shelia', 0.7935953736305237),
 ('scallops', 0.7893199324607849),
 ('seared', 0.7862469553947449)]

In [None]:
model.wv.doesnt_match("loving")

'i'

In [None]:
#Creating the input data
X = np.zeros((len(corpus),100)) #Initializing the X matrix with zeros
for i in range(len(corpus)):
    emb = [model.wv[w] for w in corpus[i]] #Create a list of word embeddings of the words in each sentence
    X[i] = np.mean(emb, axis=0) #Take the mean of the word embeddings of the words in a sentence because length of the sentences varies and the dimension of the features will increase with the increase in the number of words in the sentence

In [None]:
le = preprocessing.LabelEncoder()
Y = le.fit_transform(sentiment)

In [None]:
print(X.shape,Y.shape)

(36000, 100) (36000,)


In [None]:
#Splitting the data into train data and test data
X_train, X_test, Y_train, Y_test = train_test_split(X,Y, test_size = 0.20, random_state = 42)
print(X_train.shape,Y_train.shape)
print(X_test.shape,Y_test.shape)

(28800, 100) (28800,)
(7200, 100) (7200,)


In [None]:
#Printing the size of the train data, train label, test data and test label
print("Shape train data = ",np.shape(X_train))
print("Shape of train label = ",np.shape(Y_train))
print("Shape of test data = ",np.shape(X_test))
print("Shape of test label = ",np.shape(Y_test))

Shape train data =  (28800, 100)
Shape of train label =  (28800,)
Shape of test data =  (7200, 100)
Shape of test label =  (7200,)


In [None]:
clf = GaussianNB() #Initializing the classifier
clf.fit(X_train, Y_train) #Train the classifier using X_train and y_train

GaussianNB(priors=None, var_smoothing=1e-09)

In [None]:
#Predict the class label for the test data using the trained model
y_pred = clf.predict(X_test)

In [None]:
#Computation of the accuracy score
accuracy = accuracy_score(Y_test, y_pred)
print('Accuracy = ','%.2f'%(accuracy*100))

Accuracy =  69.31


### CNN + LSTM

In [None]:
max_features = 8192
max_len = 128
embedding_vecor_length = 64

In [None]:
tk = keras.preprocessing.text.Tokenizer(num_words=max_len, char_level = False, filters=None, lower=False, split=None)
tk.fit_on_texts(X_train.tolist())

In [None]:
Xtrain = tk.texts_to_sequences(X_train.tolist())

In [None]:
# Importing keras modules
import tensorflow as tf
from tensorflow.python.keras import Sequential
from tensorflow.python.keras.layers import Dense, Conv1D, MaxPooling1D, Flatten, BatchNormalization, GlobalAveragePooling1D
from keras.layers.recurrent import LSTM
from keras.layers.embeddings import Embedding


input_shape = (max_len,1)
model = Sequential()
model.add(Embedding(max_features, embedding_vecor_length, input_length=max_len))
model.add(Conv1D(filters=32,kernel_size= 7,activation= 'relu', padding='same', input_shape = input_shape))
model.add(BatchNormalization())
#model.add(Conv1D(filters=32,kernel_size= 3, activation= 'relu', padding='same'))
#model.add(BatchNormalization())
#model.add(Conv1D(filters=32,kernel_size= 3, activation= 'relu', padding='same'))
#model.add(BatchNormalization())
#model.add(Conv1D(2,1))
#model.add(GlobalAveragePooling1D())
model.add(LSTM(50))
#model.add(Conv1D(filters=32,kernel_size= 5))
model.add(Flatten())
model.add(Dense(1, activation='sigmoid'))

In [None]:
model.summary()

Model: "sequential_6"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_6 (Embedding)      (None, 128, 64)           524288    
_________________________________________________________________
conv1d_24 (Conv1D)           (None, 128, 32)           14368     
_________________________________________________________________
batch_normalization_18 (Batc (None, 128, 32)           128       
_________________________________________________________________
lstm (LSTM)                  (None, 50)                16600     
_________________________________________________________________
flatten_6 (Flatten)          (None, 50)                0         
_________________________________________________________________
dense_6 (Dense)              (None, 1)                 51        
Total params: 555,435
Trainable params: 555,371
Non-trainable params: 64
_______________________________________________

In [None]:
# Compiling, Training, Testing
import keras
from keras.preprocessing import sequence

Xtrain = sequence.pad_sequences(Xtrain, maxlen=max_len)
opt = tf.keras.optimizers.Adam(lr=0.001, amsgrad=False)
model.compile(optimizer = opt, loss = 'binary_crossentropy', metrics = ['accuracy']) #categorical for multiclass
history = model.fit(Xtrain,Y_train, epochs = 5, batch_size = 32)


Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [None]:
tk = keras.preprocessing.text.Tokenizer(num_words=max_len, char_level = False, filters=None, lower=False, split=None)
tk.fit_on_texts(X_test.tolist())
Xtest = tk.texts_to_sequences(X_test.tolist())
Xtest = sequence.pad_sequences(Xtest, maxlen=max_len)

loss, accuracy = model.evaluate(Xtest,Y_test)
print("\nLoss: %.2f, Accuracy: %.2f%%" % (loss, accuracy*100))


Loss: 0.70, Accuracy: 50.28%
