# Sentiment Analysis with PyTorch on a Smaller Subset of the Data

To run this notebook, set `folder_path` below to the folder that contains the project's data files.

In [None]:
# Root folder that holds the input CSVs and receives the saved model and output files.
# The trailing slash is required: other cells build file paths with `folder_path + "<file name>"`.
folder_path = '/content/drive/MyDrive/Colab Notebooks/381 Final Project/ML Final Project/'

In [None]:
import nltk as nl
import pandas as pd
from gensim.utils import simple_preprocess
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer
import re
import kagglehub
import numpy as np
import torch
import torch.nn as nn

# Download the Sentiment140 dataset through kagglehub; the call's return value is kept in `path`.
# NOTE(review): `path` is not read again in this notebook - the CSV is loaded from `folder_path`.
path = kagglehub.dataset_download("kazanova/sentiment140")
# NLTK resources used by the stop-word list and the WordNet lemmatizer during preprocessing.
nl.download('stopwords')
nl.download('wordnet')

# Mount Google Drive so the `/content/drive/...` paths resolve (Colab-only import).
from google.colab import drive
drive.mount('/content/drive')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\jenni\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\jenni\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

Defining a logistic regression class with PyTorch's neural-network module (`torch.nn`)

In [None]:
class LogisticRegression(nn.Module):
    """Binary logistic-regression classifier: one linear unit followed by a sigmoid.

    Args:
        input_dim: Number of input features per sample.
    """

    def __init__(self, input_dim):
        super().__init__()
        # The attribute name `linear` determines the state_dict keys
        # ("linear.weight", "linear.bias").
        self.linear = nn.Linear(input_dim, 1)

    def forward(self, x):
        """Return sigmoid(linear(x)): one value in (0, 1) per input row."""
        logits = self.linear(x)
        return logits.sigmoid()

Importing the Sentiment140 dataset (used for training and validation)

In [None]:
# The CSV is read without a header row (header=None), so the column names are supplied here.
columns = ['target', 'ids', 'date', 'flag', 'user', 'text']
sent_df_path = f"{folder_path}training.1600000.processed.noemoticon.csv"
sent_df = pd.read_csv(
    sent_df_path,
    names=columns,
    header=None,
    encoding='latin-1',
)

Cleaning the dataset to include only positive and negative labels, then sampling a smaller subset of rows

In [None]:
# Keep only the negative (0) and positive (4) targets and add a binary `label`
# column (0 -> 0, 4 -> 1). Building the frame in one chain, instead of assigning
# a column onto a boolean-filtered slice, avoids pandas' SettingWithCopyWarning.
sent_df = (
    sent_df.loc[sent_df['target'].isin([0, 4])]
    .assign(label=lambda df: df['target'].map({0: 0, 4: 1}))
)

# Down-sample to a fixed-seed random subset of 250,000 rows.
sent_df = sent_df.sample(250000, random_state=72)

# NOTE(review): these two lists are reassigned by the later train/validation split.
X_train = sent_df['text'].tolist()
y_train = sent_df['label'].tolist()

Processing the lyrics of the Spotify dataset into tokens. These are used later when we predict the sentiment.

In [None]:
# Load the Spotify lyrics table; despite its name, `save_path` is the location of the input CSV.
save_path = f"{folder_path}spotifydata_translated_combined.csv"
spotify = pd.read_csv(save_path)

stop_words = stopwords.words('english')
# Set copy of the stop words: each membership test is O(1) instead of a scan of
# the list for every token. `stop_words` itself is left as the original list.
_stop_word_set = frozenset(stop_words)
lemmatizer = WordNetLemmatizer()


def preprocess(text):
    """Turn raw text into a list of lemmatized, stop-word-free tokens.

    Args:
        text: String to tokenize (e.g. song lyrics or a tweet).

    Returns:
        list[str]: The lemmatized tokens produced by
        ``simple_preprocess(text, deacc=True)``, excluding English stop words.
    """
    # Remove bracketed spans such as section headers ("[Chorus]").
    text = re.sub(r"\[[^\]]+\]", "", text)
    return [
        lemmatizer.lemmatize(word)
        for word in simple_preprocess(text, deacc=True)
        if word not in _stop_word_set
    ]

# Tokenize every lyrics entry. Values are cast to str first, presumably so that
# non-string entries (e.g. NaN) do not break the regex inside `preprocess`.
spotify['tokens'] = spotify['lyrics'].astype(str).apply(preprocess)

  spotify = pd.read_csv("spotifydata_translated_combined.csv")


In [None]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

sent_df['tokens'] = sent_df['text'].apply(preprocess)
# TfidfVectorizer expects one string per document, so re-join each token list.
X_texts = sent_df['tokens'].apply(' '.join)
y = sent_df['label'].values

# Split BEFORE fitting the vectorizer so that the vocabulary and IDF weights are
# learned from the training rows only; fitting on all rows leaks information
# from the validation set into the features.
texts_train, texts_val, y_train, y_val = train_test_split(
    X_texts, y, test_size=0.2, random_state=42
)

# float32 output matches the dtype of the tensors built from these arrays and
# halves the size of the dense matrices compared with the float64 default.
vectorizer = TfidfVectorizer(max_features=10000, min_df=10, dtype=np.float32)
X_train = vectorizer.fit_transform(texts_train).toarray()
X_val = vectorizer.transform(texts_val).toarray()

# The model is built with `input_dim=X.shape[1]`, so keep `X` available;
# the feature count is identical for the training and validation matrices.
X = X_train


Converting the training/validation split into PyTorch tensors for model training

In [None]:
# Use the GPU when PyTorch reports one as available; otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Features become float32 matrices; labels are reshaped to (n, 1) float32 columns.
X_train_tensor = torch.tensor(X_train, dtype=torch.float32, device=device)
y_train_tensor = torch.tensor(y_train.reshape(-1, 1), dtype=torch.float32, device=device)

X_val_tensor = torch.tensor(X_val, dtype=torch.float32, device=device)
y_val_tensor = torch.tensor(y_val.reshape(-1, 1), dtype=torch.float32, device=device)

In [None]:
model = LogisticRegression(input_dim=X.shape[1]).to(device)
# LogisticRegression.forward() already applies a sigmoid, so the loss must take
# probabilities. BCEWithLogitsLoss applies its own sigmoid internally, which
# would squash the outputs a second time and distort the loss and gradients.
criterion = nn.BCELoss()
# NOTE(review): lr=1 is kept from the original run; it may need retuning now
# that the loss matches the model's output.
optimizer = torch.optim.Adam(model.parameters(), lr=1)
# Multiply the learning rate by 0.8 every 20 epochs.
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=20, gamma=0.8)

num_epochs = 150
model.train()
for epoch in range(num_epochs):
    # Full-batch step: the whole training tensor is used in every epoch.
    optimizer.zero_grad()
    outputs = model(X_train_tensor)
    loss = criterion(outputs, y_train_tensor)
    loss.backward()
    optimizer.step()

    print(f"Epoch {epoch+1}/{num_epochs}, Loss: {loss.item():.4f}")
    scheduler.step()

Epoch 1/150, Loss: 0.7233
Epoch 2/150, Loss: 0.6695
Epoch 3/150, Loss: 0.6574
Epoch 4/150, Loss: 0.6428
Epoch 5/150, Loss: 0.6302
Epoch 6/150, Loss: 0.6234
Epoch 7/150, Loss: 0.6215
Epoch 8/150, Loss: 0.6211
Epoch 9/150, Loss: 0.6198
Epoch 10/150, Loss: 0.6171
Epoch 11/150, Loss: 0.6136
Epoch 12/150, Loss: 0.6106
Epoch 13/150, Loss: 0.6086
Epoch 14/150, Loss: 0.6077
Epoch 15/150, Loss: 0.6072
Epoch 16/150, Loss: 0.6067
Epoch 17/150, Loss: 0.6059
Epoch 18/150, Loss: 0.6048
Epoch 19/150, Loss: 0.6037
Epoch 20/150, Loss: 0.6027
Epoch 21/150, Loss: 0.6021
Epoch 22/150, Loss: 0.6017
Epoch 23/150, Loss: 0.6014
Epoch 24/150, Loss: 0.6011
Epoch 25/150, Loss: 0.6006
Epoch 26/150, Loss: 0.6000
Epoch 27/150, Loss: 0.5995
Epoch 28/150, Loss: 0.5991
Epoch 29/150, Loss: 0.5988
Epoch 30/150, Loss: 0.5985
Epoch 31/150, Loss: 0.5983
Epoch 32/150, Loss: 0.5981
Epoch 33/150, Loss: 0.5978
Epoch 34/150, Loss: 0.5975
Epoch 35/150, Loss: 0.5972
Epoch 36/150, Loss: 0.5970
Epoch 37/150, Loss: 0.5968
Epoch 38/1

In [None]:
# Score the validation split with gradient tracking disabled.
model.eval()
with torch.no_grad():
    preds = model(X_val_tensor)
    # Threshold the model outputs at 0.5 to obtain hard 0/1 labels.
    predicted_labels = preds.gt(0.5).float()
    matches = predicted_labels.eq(y_val_tensor).float()
    accuracy = matches.mean().item()
    print(f"Accuracy: {accuracy:.4f}")


Validation Accuracy: 0.7354


Saving the model and the fitted TF-IDF vectorizer

In [None]:
import joblib
import torch

# Persist the two artifacts needed for inference: the model weights and the fitted vectorizer.
model_path = f"{folder_path}sentiment_model.pt"
vectorization_path = f"{folder_path}tfdif_vectorizer.pkl"

torch.save(obj=model.state_dict(), f=model_path)
joblib.dump(vectorizer, vectorization_path)

['tfdif_vectorizer.pkl']

Applying the model to the Spotify data: each song gets a continuous sentiment score (a value, not a label), and the result is then saved.

In [None]:
model_path = folder_path + "sentiment_model.pt"
vectorization_path = folder_path + "tfdif_vectorizer.pkl"

# map_location lets weights that were saved from a GPU load on a CPU-only machine.
state_dict = torch.load(model_path, map_location=device)
# Size the layer from the checkpoint instead of hard-coding 10000: the fitted
# vocabulary can be smaller than `max_features`, and a mismatched `input_dim`
# makes load_state_dict fail. nn.Linear stores its weight as (out, in).
model = LogisticRegression(input_dim=state_dict['linear.weight'].shape[1])
model.load_state_dict(state_dict)
model.to(device)
model.eval()

# joblib.load unpickles the file, which can execute arbitrary code:
# only load vectorizer files that you created yourself.
vectorizer = joblib.load(vectorization_path)
if len(vectorizer.vocabulary_) != model.linear.in_features:
    raise ValueError(
        f"Vectorizer has {len(vectorizer.vocabulary_)} features but the model "
        f"expects {model.linear.in_features}; the saved files do not match."
    )

In [None]:
def predict_sentiment(lyrics):
    """Score one lyrics entry with the trained sentiment model.

    Args:
        lyrics: Raw lyrics text. Non-string values (e.g. NaN for missing
            lyrics) are converted with ``str()``, mirroring the
            ``astype(str)`` applied when the lyrics were tokenized.

    Returns:
        float: The model's output for the text, a value in (0, 1); the model
        was trained with label 1 for positive tweets and 0 for negative ones.
    """
    tokens = preprocess(str(lyrics))
    text_vector = vectorizer.transform([' '.join(tokens)])
    # Create the input on the same device as the model's parameters; a CPU
    # tensor fed to a CUDA model raises a device-mismatch error.
    model_device = next(model.parameters()).device
    text_tensor = torch.tensor(
        text_vector.toarray(), dtype=torch.float32, device=model_device
    )
    with torch.no_grad():
        # The model's forward() already ends in a sigmoid; applying another
        # one here would compress every score into roughly (0.5, 0.73).
        score = model(text_tensor).item()

    return score

In [None]:
# Score every song's lyrics (one `predict_sentiment` call per row) and store the result in `senti_score`.
spotify['senti_score'] = spotify['lyrics'].apply(predict_sentiment)

In [None]:
# Write the scored Spotify table to the project folder as CSV.
save_sentiment_path = f"{folder_path}2_spotify_sentiment.csv"
spotify.to_csv(path_or_buf=save_sentiment_path)