In [6]:
#Imports
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.feature_extraction.text import TfidfVectorizer

### Loading the Data

Data is loaded from each respective .csv file using panda's read_csv() function. Each dataset has all data except for text and sentiment label dropped

In [7]:
#Load datasets
# movie dataset
mov_data = pd.read_csv('data/movie.csv', delimiter=',', quotechar='"', encoding='utf-8', on_bad_lines='skip')
mov_data = mov_data.to_numpy()
mov_data.shape
mov_X = mov_data[1:, 0] # all rows except the first one, text column
mov_y = mov_data[1:, 1] # all rows except the first one, label column

# chat gpt dataset
cgpt_data = pd.read_csv('data/chatgpt_sentiment_analysis.csv', delimiter=',', quotechar='"', encoding='utf-8', on_bad_lines='skip')
cgpt_data = cgpt_data.to_numpy()
cgpt_data.shape
cgpt_X = cgpt_data[1:, 1] # all rows except the first one, text column
cgpt_y = cgpt_data[1:, 2] # all rows except the first one, label column

# social media dataset
sm_data = pd.read_csv('data/soc_med_sentiment_analysis.csv', delimiter=',', quotechar='"', encoding='utf-8', on_bad_lines='skip')
sm_data = sm_data.to_numpy()
sm_data.shape
sm_X = sm_data[1:, 4] # all rows except the first one, text column
sm_y = sm_data[1:, 5] # all rows except the first one, label column

### Combining and Vectorization
The X dataframes, representing strings of text, were concatenated together so all strings across the three datasets were in one location.
Due to the fact that neural models cannot read strings of text on their own, this data then  had to be vectorized. This process, being out of the scope of this assignment to accomplish, was done using sklearn's TfidfVectorizer() object.

In [8]:
#Combine X and Y dataframes, then vectorize
X = np.concatenate([mov_X, cgpt_X, sm_X])
y = np.concatenate([mov_y, cgpt_y, sm_y])

vectorizer = TfidfVectorizer(max_features=1000)
X = vectorizer.fit_transform(X).toarray()

### Label conversion
Y values from the datasets, representing sentiments, were converted to numerical representations to simplify calculations and create uniformity.

In [9]:
#Convert labels in y
y = np.where(y == "good", 1, np.where(y == "neutral", 0, np.where(y == "bad", -1, y)))
y = np.where(y == "positive", 1, np.where(y == "neutral", 0, np.where(y == "negative", -1, y)))

### Preprocessing
Data is shuffled, halved, and then one half of the data is split into the training (80%) and testing (20%) arrays.

In [10]:
#Preprocessing
def preprocess(X, y):
    # Normalize the data
    scaler = StandardScaler()
    X = scaler.fit_transform(X)

    # Shuffle the data
    indices = []

    for i in range(len(X)):
        indices.append(i)

    np.random.shuffle(indices)
    X = X[indices]
    y = y[indices]

    #Half the amount of data
    split = int(0.5 * X.shape[0])
    X = X[:split]
    y = y[:split]

    # Split data: training 80%, testing 20%
    split = int(0.8 * X.shape[0])
    X_train, X_test = X[:split], X[split:]
    y_train, y_test = y[:split], y[split:]

    # return X, X_train, X_test, y, y_train, y_test
    return X_train, X_test, y_train, y_test

X_train, X_test, y_train, y_test = preprocess(X, y)

In [11]:
#Model
class BackPropagation:
    def __init__(self, input_size, hidden_size, output_size):
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.output_size = output_size
        
        # Initialize weights and biases
        self.b1 = np.zeros((1, self.hidden_size))
        self.b2 = np.zeros((1, self.output_size))
        self.W1 = np.random.randn(self.input_size, self.hidden_size) * np.sqrt(2 / self.input_size)
        self.W2 = np.random.randn(self.hidden_size, self.output_size) * np.sqrt(2 / self.hidden_size)


    def sigmoid(self, x):
        return 1 / (1 + np.exp(-x))

    def sigmoid_derivative(self, x):
        return x * (1 - x)

    def relu(self, x):
        return x * (x >= 0)

    def relu_derivative(self, x):
        return 1. * (x > 0)

    def forward(self, X):
        # Hidden layer
        hidden_activation = self.relu(np.dot(X, self.W1) + self.b1)
        # Output layer
        output = np.dot(hidden_activation, self.W2) + self.b2
        return output

    def backward(self, X, y, output, rate):
        # Calculate error (output)
        output_error = (y - output)

        # Calculate hidden activations
        hidden = self.relu(np.dot(X, self.W1) + self.b1)
        print(f"Hidden max: {np.max(hidden)}, min: {np.min(hidden)}")

        # Calculate error (hidden)
        hidden_error = np.dot(output_error, self.W2.T)
        hidden_delta = hidden_error * self.relu_derivative(hidden)
        print(f"Hidden error max: {np.max(hidden_error)}, min: {np.min(hidden_error)}")
        print(f"Hidden delta max: {np.max(hidden_delta)}, min: {np.min(hidden_delta)}")

        # Update weights
        self.W2 = self.W2 + rate*(np.dot(hidden.T, output_error))
        self.b2 = self.b2 + rate*(np.sum(output_error, axis=0, keepdims=True))

        self.W1 = self.W1 + rate*(np.dot(X.T, hidden_delta))
        self.b1 = self.b1 + rate*(np.sum(hidden_delta, axis=0, keepdims=True))
        
    def train(self, X, y, learning_rate, epochs):
        y = y.reshape(-1,1)

        for _ in range(epochs):
            print(f"W1 max: {np.max(self.W1)}, min: {np.min(self.W1)}")
            print(f"W2 max: {np.max(self.W2)}, min: {np.min(self.W2)}")
            print(f"B1 max: {np.max(self.b1)}, min: {np.min(self.b1)}")
            print(f"B2 max: {np.max(self.b2)}, min: {np.min(self.b2)}")
            output = self.forward(X)
            self.backward(X,y, output, learning_rate)

    def predict(self, X):
        return self.forward(X)

### Training
Parameters are set and a backpropagation model is initialized to train on x and y training datasets

In [12]:
#backprop training
input_size = X_train.shape[1]
hidden_size = 20
output_size = 1

bp = BackPropagation(input_size, hidden_size, output_size)
bp.train(X_train, y_train, learning_rate=0.001, epochs=4)

W1 max: 0.20109642545871534, min: -0.19136706661325897
W2 max: 0.6422843975124137, min: -0.5976149131603825
B1 max: 0.0, min: 0.0
B2 max: 0.0, min: 0.0
Hidden max: 10.508063788641238, min: -0.0
Hidden error max: 6.81963663586679, min: -6.345345724905494
Hidden delta max: 4.550848599986745, min: -6.345345724905494
W1 max: 13.714121562929698, min: -10.562502356067185
W2 max: 61.05811150497766, min: -25.740352584404736
B1 max: 1.4731273942757246, min: -17.7656700251416
B2 max: 20.437176272959924, min: 20.437176272959924
Hidden max: 914.6188442423286, min: -0.0
Hidden error max: 1971189.6849890135, min: -4675814.722772738
Hidden delta max: 405681.7539953504, min: -4675814.722772738
W1 max: 45930970.65919723, min: -79086371.72587079
W2 max: 129085059.31170666, min: -291868197.6792114
B1 max: 3359550.0942404876, min: -103745340.41492735
B2 max: -1425929.7283347908, min: -1425929.7283347908
Hidden max: 3513411127.4536915, min: -0.0
Hidden error max: 4.5912482530808976e+26, min: -1.03810569547

### Predictions
Final predictions are made on both the training and testing datasets to make comparisons

In [13]:
#Predictions
train_predictions = bp.predict(X_train)
test_predictions = bp.predict(X_test)

### Results
Predictions are flattened to align with the size of y-value arrays and then compared using mean squared error. These results are printed.

In [14]:
train_predictions = train_predictions.flatten()
test_predictions = test_predictions.flatten()

In [15]:
train_mse = np.mean((train_predictions - y_train) ** 2)
test_mse = np.mean((test_predictions - y_test) ** 2)

print(f"Train MSE: {train_mse}")
print(f"Test MSE: {test_mse}")

OverflowError: (34, 'Result too large')