# Importing all necessary libraries and our data

In [25]:
from google.colab import drive
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [26]:
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
import pandas as pd
import re
import string
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
import numpy as np

In [27]:
df = pd.read_csv('/content/gdrive/MyDrive/TheSocialDilemma/TheSocialDilemma.csv')

# Spliting our data into 2 equal parts

In [28]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(df['text'], df['Sentiment'], test_size=0.50)

## Removing neutral comments

In [29]:
X_train = X_train[(df.Sentiment == 'Positive') | (df.Sentiment == 'Negative')]
X_test = X_test[(df.Sentiment == 'Positive') | (df.Sentiment == 'Negative')]
y_train = y_train[(df.Sentiment == 'Positive') | (df.Sentiment == 'Negative')]
y_test = y_test[(df.Sentiment == 'Positive') | (df.Sentiment == 'Negative')]


## Cleaninig text

In [30]:
stop_words = stopwords.words("english")

In [31]:
def clean_text(text):
    text = text.lower()
    text = re.sub("@\S+", " ", text)
    text = re.sub("https*\S+", " ", text)
    text = re.sub("#\S+", " ", text)
    text = re.sub("\d", " ", text)
    text = re.sub('[%s]' % re.escape(string.punctuation), ' ', text)
    text = re.sub('\n', ' ', text)
    text = re.sub('\s{2,}',' ', text)
    text = ' '.join([word for word in text.split(' ') if word not in stop_words])
    return text

In [32]:
X_train = X_train.apply(lambda x:clean_text(x))
X_test = X_test.apply(lambda x:clean_text(x))

## Converting "Positive" and "Negative" into 1.0 and 0.0 for further calculations

In [33]:
def MakingLabel(text):
  if text == "Positive":
    return 1.0
  if text == "Negative":
    return 0.0

In [34]:
y_train = y_train.apply(lambda x:MakingLabel(x))
y_test = y_test.apply(lambda x:MakingLabel(x))

## Making a dictionary of the whole text to create sequences

In [35]:
text = ''
for i in X_train:
    text += i + ' '
for i in X_test:
    text += i + ' '

In [36]:
def tokenization(text):
    tokens = word_tokenize(text)
    return tokens

In [37]:
def lemmatizer(text):
    lemm_text = [WordNetLemmatizer().lemmatize(word) for word in text]
    return lemm_text

In [38]:
dict_full = list(set(lemmatizer(tokenization(text))))

## Making sequences

In [39]:
def sequences(text):
    sequence = [dict_full.index(i) for i in lemmatizer(tokenization(text))]
    return sequence

In [40]:
X_train = X_train.apply(lambda x:sequences(x))
X_test = X_test.apply(lambda x:sequences(x))

## Vectorizing sequences

In [41]:
def vectorize_sequences(sequences, dimension=11000):
    results = np.zeros((len(sequences), dimension))
    for i, sequence in enumerate(sequences):
        for j in sequence:
            results[i, j] = 1.
    return results

In [42]:
X_train = vectorize_sequences(X_train)
X_test = vectorize_sequences(X_test)

In [43]:
y_train = np.asarray(y_train).astype("float32")
y_test = np.asarray(y_test).astype("float32")

# Making our model

In [44]:
maxlen = 200

X_train = keras.preprocessing.sequence.pad_sequences(X_train, maxlen=maxlen)
X_test = keras.preprocessing.sequence.pad_sequences(X_test, maxlen=maxlen)

In [45]:
emb_dim = 128
max_features = 11000
callback = tf.keras.callbacks.EarlyStopping(monitor='loss', patience=2)

model = keras.Sequential()
model.add(layers.Embedding(max_features, emb_dim))
model.add(layers.LSTM(128))
model.add(layers.Dense(1, activation='sigmoid'))

In [46]:
model.compile(optimizer="adam",
              loss="binary_crossentropy",
              metrics=["accuracy"])

history = model.fit(X_train,
                    y_train,
                    epochs=5,
                    batch_size=128,
                    validation_split=0.2,
                    callbacks=[callback],
                    shuffle = True,
                    verbose = 1)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [47]:
print(model.summary())

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, None, 128)         1408000   
                                                                 
 lstm (LSTM)                 (None, 128)               131584    
                                                                 
 dense (Dense)               (None, 1)                 129       
                                                                 
Total params: 1,539,713
Trainable params: 1,539,713
Non-trainable params: 0
_________________________________________________________________
None


In [48]:
score = model.evaluate(X_test, y_test, verbose=0) 
print('Test score:', score[0]) 
print('Test accuracy:', score[1])

Test score: 0.5937301516532898
Test accuracy: 0.7240274548530579
