In [3]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Embedding
from tensorflow.keras.layers import Conv1D, Dropout, SpatialDropout1D
from tensorflow.keras.layers import MaxPooling1D, GlobalMaxPooling1D
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
import pandas as pd
import numpy as np
import nltk
import re

In [4]:
df = pd.read_csv('twitter30k.csv')

In [5]:
X = df['twitts'].tolist()
y = df['sentiment']

In [6]:
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

In [7]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [8]:
stop_words = stopwords.words('english')
lemmatizer = WordNetLemmatizer()

In [9]:
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [10]:
x_train = []
for sent in X:
    sent = re.sub("[^a-zA-Z]", " ", sent)
    sent = sent.lower().split()
    sent = [lemmatizer.lemmatize(word) for word in sent if word not in set(stop_words)]
    sent = " ".join(sent)
    x_train.append(sent)

In [11]:
tokenizer = Tokenizer()
tokenizer.fit_on_texts(x_train)

In [12]:
vocab_size = len(tokenizer.word_index) + 1

In [13]:
encoded_texts = tokenizer.texts_to_sequences(x_train)

In [14]:
testing_list = []
for l in x_train:
    testing_list.append(len(l.split(' ')))
max_sentence_length = max(testing_list)

In [15]:
padded_X = pad_sequences(encoded_texts, maxlen=max_sentence_length, padding='post', truncating='post')

In [16]:
X_train, X_test, y_train, y_test = train_test_split(padded_X, y, test_size=0.15, random_state=42)

In [17]:
embedding_features = 300

In [18]:
model = Sequential()
model.add(Embedding(vocab_size, embedding_features, input_length=max_sentence_length))
model.add(SpatialDropout1D(0.4))
model.add(LSTM(64, recurrent_activation='relu', recurrent_dropout=0.2, return_sequences=True))
model.add(LSTM(32, recurrent_activation='relu', recurrent_dropout=0.2))
model.add(Dense(1, activation='sigmoid'))



In [19]:
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

In [21]:
hist = model.fit(X_train, y_train, batch_size=32, epochs=10, verbose=1, validation_data=(X_test, y_test))

Epoch 1/10
[1m797/797[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m27s[0m 25ms/step - accuracy: 0.4950 - loss: 0.6932 - val_accuracy: 0.5571 - val_loss: 0.6867
Epoch 2/10
[1m797/797[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m26s[0m 16ms/step - accuracy: 0.5917 - loss: 0.6693 - val_accuracy: 0.6300 - val_loss: 0.6477
Epoch 3/10
[1m797/797[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 16ms/step - accuracy: 0.6520 - loss: 0.6273 - val_accuracy: 0.6353 - val_loss: 0.6318
Epoch 4/10
[1m797/797[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 16ms/step - accuracy: 0.6647 - loss: 0.6258 - val_accuracy: 0.6367 - val_loss: 0.6397
Epoch 5/10
[1m797/797[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 16ms/step - accuracy: 0.6600 - loss: 0.6216 - val_accuracy: 0.6342 - val_loss: 0.6353
Epoch 6/10
[1m797/797[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 16ms/step - accuracy: 0.6728 - loss: 0.6028 - val_accuracy: 0.6438 - val_loss: 0.6323
Epoch 7/10
[1m7

In [25]:
model.save("sentiment_lstm_model.h5")



In [26]:
#Testing
from tensorflow.keras.models import load_model

lstm_model = load_model('sentiment_lstm_model.h5')



In [27]:
sentiments = ['negative', 'positive']

In [28]:
def text_preprocess(text):
    encoded = tokenizer.texts_to_sequences(text)
    padded = pad_sequences(encoded, maxlen=max_sentence_length, padding='post', truncating='post')
    return padded

In [31]:
tweet1 = ['i want to kill myself'] #neg sentiment 0
tweet2 = ['thank you very much'] #pos sentiment 1
txt1 = text_preprocess(tweet1)
txt2 = text_preprocess(tweet2)
output1 = sentiments[(lstm_model.predict(txt1) > 0.5).astype("int32")[0][0]]
output2 = sentiments[(lstm_model.predict(txt2) > 0.5).astype("int32")[0][0]]

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1s/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 36ms/step


In [32]:
print(output1, output2)

negative positive
