<a href="https://colab.research.google.com/github/s-jainr/deep-learning-sp23/blob/main/Deep_Learning_Final_Project_Sentiment_Analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [8]:
!pip install pytreebank
import pytreebank
import numpy as np
from keras.models import Sequential
from keras.layers import Dense
import pandas as pd

# Load the dataset from disk
dataset = pytreebank.load_sst("/path/to/sentiment/")

# Extract the train, validation, and test data
train_data = dataset['train'][:1000]
validation_data = dataset['dev']
test_data = dataset['test']

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [9]:
# Build the vocabulary
vocab = set()
for sentence in train_data:
    sentence = sentence.to_labeled_lines()[0][1]
    words = sentence.lower().split()
    for word in words:
        vocab.add(word)

# Create the word-to-index and index-to-word mappings
word_to_index = {}
index_to_word = {}
for i, word in enumerate(vocab):
    word_to_index[word.lower()] = i
    index_to_word[i] = word.lower()

# Convert the sentences to sequences of word indices
train_sequences = []
for sentence in train_data:
    sentence = sentence.to_labeled_lines()[0][1]
    words = sentence.lower().split()
    sequence = [word_to_index[word.lower()] for word in words]
    train_sequences.append(sequence)

# Pad the sequences to a fixed length
max_length = max(len(sequence) for sequence in train_sequences)
train_sequences = np.array([sequence + [0]*(max_length-len(sequence)) for sequence in train_sequences])

# Convert the labels to one-hot vectors
train_labels = np.zeros((len(train_data), 5))
for i, sentence in enumerate(train_data):
    train_labels[i][sentence.label] = 1

In [10]:
train_x = []
for seq in train_sequences:
  matrix = []
  for encoding in seq:
    seq_hot = np.zeros(len(vocab))
    seq_hot[encoding] = 1
    matrix.append(seq_hot)
  train_x.append(matrix)

train_x = np.array(train_x)

train_x.shape

(1000, 52, 4806)

In [11]:
from keras.models import Sequential
from keras.layers import Dense, Conv1D, MaxPooling1D, Flatten
from tensorflow.keras.callbacks import EarlyStopping

model = Sequential()
model.add(Conv1D(32, 3, activation='relu', input_shape=(52, 4806)))
model.add(MaxPooling1D(2))
model.add(Conv1D(64, 3, activation='relu'))
model.add(MaxPooling1D(2))
model.add(Conv1D(128, 3, activation='relu'))
model.add(MaxPooling1D(2))
model.add(Flatten())
model.add(Dense(512, activation='relu'))
model.add(Dense(5, activation='softmax'))

model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

model.summary()

Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 conv1d_6 (Conv1D)           (None, 50, 32)            461408    
                                                                 
 max_pooling1d_6 (MaxPooling  (None, 25, 32)           0         
 1D)                                                             
                                                                 
 conv1d_7 (Conv1D)           (None, 23, 64)            6208      
                                                                 
 max_pooling1d_7 (MaxPooling  (None, 11, 64)           0         
 1D)                                                             
                                                                 
 conv1d_8 (Conv1D)           (None, 9, 128)            24704     
                                                                 
 max_pooling1d_8 (MaxPooling  (None, 4, 128)          

In [13]:
es = EarlyStopping(monitor='accuracy', patience=5, restore_best_weights=True)

network_history = model.fit(train_x, train_labels, epochs=1000, batch_size=32, callbacks = [es])

Epoch 1/1000
Epoch 2/1000
Epoch 3/1000
Epoch 4/1000
Epoch 5/1000
Epoch 6/1000


In [17]:
# Use the model to predict the sentiment of a new sentence

correct = 0
incorrect = 0

for sentence in test_data:
  sentence = sentence.to_labeled_lines()[0]
  new_sentence = sentence[1]
  words = new_sentence.lower().split()
  sequence = [word_to_index.get(word, 0) for word in words]
  sequence = sequence + [0]*(max_length-len(sequence))

  matrix = np.zeros((len(sequence), len(vocab)))
  for i in range(len(sequence)):
    matrix[i][sequence[i]] = 1

  sequence = matrix

  predictions = model.predict(np.array([sequence]))
  sentiment = np.argmax(predictions)

  correctly_identified = sentiment == sentence[0]
  if correctly_identified:
    correct += 1
  else:
    incorrect += 1
  
  accuracy = correct / (correct + incorrect)

  # Print the predicted sentiment
  sentiment_labels = ["negative", "somewhat negative", "neutral", "somewhat positive", "positive"]
  # if not correctly_identified:
  #   print(f"The sentiment of\n\t{new_sentence}"
  #         f"\n\tmodel predicts: {sentiment_labels[sentiment]}, labeled as {sentiment_labels[sentence[0]]}"
  #         f"\n\t{'correctly' if correctly_identified else 'incorrectly'} indentified")
  print(f"overall accuracy: {accuracy:.2f}")

  if (correct + incorrect) > 100 and accuracy < 0.5:
    break

overall accuracy: 0.00
overall accuracy: 0.00
overall accuracy: 0.00
overall accuracy: 0.00
overall accuracy: 0.00
overall accuracy: 0.00
overall accuracy: 0.00
overall accuracy: 0.00
overall accuracy: 0.11
overall accuracy: 0.20
overall accuracy: 0.18
overall accuracy: 0.17
overall accuracy: 0.23
overall accuracy: 0.21
overall accuracy: 0.20
overall accuracy: 0.25
overall accuracy: 0.29
overall accuracy: 0.28
overall accuracy: 0.32
overall accuracy: 0.35
overall accuracy: 0.38
overall accuracy: 0.41
overall accuracy: 0.43
overall accuracy: 0.42
overall accuracy: 0.40
overall accuracy: 0.38
overall accuracy: 0.37
overall accuracy: 0.36
overall accuracy: 0.34
overall accuracy: 0.33
overall accuracy: 0.35
overall accuracy: 0.34
overall accuracy: 0.33
overall accuracy: 0.32
overall accuracy: 0.31
overall accuracy: 0.31
overall accuracy: 0.30
overall accuracy: 0.29
overall accuracy: 0.28
overall accuracy: 0.28
overall accuracy: 0.27
overall accuracy: 0.26
overall accuracy: 0.26
overall acc