# Natural Language Processing Analysis
Sentiment analysis and LDA topic identification. **Please make sure to insert the path to this folder here.**


In [None]:
# Root folder for every CSV this notebook reads or writes. Later cells build
# file names with `folder_path + '<file name>'`, so the trailing slash is required.
folder_path = '/content/drive/MyDrive/Colab Notebooks/381 Final Project/ML Final Project/'

## Import Necessary Libraries

In [None]:
!pip install numpy==1.25.2



In [None]:
!pip install --upgrade gensim nltk

import pandas as pd
import numpy as np
import re
import ast
import joblib

from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer

import gensim as gs
from gensim import corpora, models, similarities
from gensim.models import LdaModel
from gensim.utils import simple_preprocess
import matplotlib.pyplot as plt

import nltk as nl
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
# Fetch the NLTK corpora used by WordNetLemmatizer and stopwords.words().
nl.download('wordnet')
nl.download('stopwords')

# Download the Sentiment140 dataset through kagglehub.
# NOTE(review): `path` is not referenced again in the visible cells; the
# Sentiment140 CSV is later read from `folder_path` instead - confirm which
# copy is intended.
import kagglehub
path = kagglehub.dataset_download("kazanova/sentiment140")

# Mount Google Drive so that files under /content/drive are reachable.
from google.colab import drive
drive.mount('/content/drive')



[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


## Sentiment Analysis with Pytorch Logistic Regression (smaller dataset)

This code can be found in the Sentiment_Analysis_Pytorch.ipynb notebook.

## Sentiment Analysis with Sklearn Logistic Regression (entire dataset)

Train a sentiment analysis model using the [Sentiment140](https://www.kaggle.com/datasets/ferno2/training1600000processednoemoticoncsv?resource=download) dataset, which provides sentiment labels for a large amount of text.

In [None]:
# The file is read with header=None, so the column names are supplied here.
columns = ['target', 'ids', 'date', 'flag', 'user', 'text']
sentiment140_path = f'{folder_path}training.1600000.processed.noemoticon.csv'
sent_df = pd.read_csv(
    sentiment140_path,
    names=columns,
    header=None,
    encoding='latin-1',
)

In [None]:
# Keep only rows whose target is 0 (negative) or 4 (positive). The explicit
# .copy() makes the filtered frame independent of the original, so adding the
# 'label' column below does not trigger pandas' SettingWithCopyWarning.
sent_df = sent_df[sent_df['target'].isin([0, 4])].copy()
# Binary label for the classifier: 0 -> 0, 4 -> 1.
sent_df['label'] = sent_df['target'].map({0: 0, 4: 1})

# Work on a fixed-seed random sample of 10,000 rows.
sent_df = sent_df.sample(10000, random_state=72)

In [None]:
# Raw texts and their 0/1 labels as plain Python lists.
X_train, y_train = sent_df['text'].tolist(), sent_df['label'].tolist()

In [None]:
# Load the translated Spotify lyrics data. low_memory=False makes pandas infer
# each column's dtype from the whole file instead of chunk by chunk, which
# avoids the mixed-type DtypeWarning and gives consistent column dtypes.
translated_data_path = folder_path + 'spotifydata_translated_combined.csv'
sentiment_df = pd.read_csv(translated_data_path, low_memory=False)

  sentiment_df = pd.read_csv(translated_data_path)


In [None]:
# English stop words from NLTK. This must remain a list: a later cell adds
# more words to it in place with stop_words.extend(...).
stop_words = stopwords.words('english')
# Lemmatizer shared by preprocess().
lemmatizer = WordNetLemmatizer()

def preprocess(text):
    """Turn raw text into a list of lemmatized tokens with stop words removed.

    Bracketed spans (section headers such as ``[Chorus]``) are stripped first.
    The text is then tokenized with gensim's ``simple_preprocess`` using
    ``deacc=True``, tokens found in the module-level ``stop_words`` are
    dropped, and the remaining tokens are lemmatized with the module-level
    ``lemmatizer``.

    Args:
        text: The raw text. Anything that is not a ``str`` (for example
            ``None`` or the float ``NaN`` pandas uses for a missing value)
            is treated as empty.

    Returns:
        list: The cleaned tokens in their original order; ``[]`` for
        non-string input.
    """
    # Missing values would otherwise make re.sub raise a TypeError.
    if not isinstance(text, str):
        return []

    # Remove section headers, i.e. anything wrapped in square brackets.
    text = re.sub(r"\[[^\]]+\]", "", text)

    # Snapshot the stop words as a set once per call so each membership test
    # is O(1) instead of a scan of the whole list.
    blocked = set(stop_words)
    return [
        lemmatizer.lemmatize(word)
        for word in simple_preprocess(text, deacc=True)
        if word not in blocked
    ]

# Tokenize every lyric. Missing lyrics are replaced with "" first (the same
# treatment the TF-IDF cell applies) so they produce no tokens instead of
# being stringified to "nan" and tokenized as a word.
sentiment_df['tokens'] = sentiment_df['lyrics'].fillna("").astype(str).apply(preprocess)

In [None]:
# Run the sampled texts through the same tokenization pipeline and show the result.
sent_df['tokens'] = sent_df['text'].map(preprocess)
print(sent_df['tokens'])

486826                                 [talyuhh, miss, tooo]
250439     [jessemccartney, heyy, jesse, please, show, uk...
7702       [happy, opening, day, wish, could, getting, re...
34115                        [wobblybob, naa, family, issue]
854681     [quiet, lately, around, reason, http, www, nin...
                                 ...                        
436828                    [ayasawada, aw, loved, show, miss]
1008480    [tired, fuckidy, fuck, new, job, monday, final...
1398877    [cameronmarion, maybe, mean, change, new, razo...
517342      [netaddicts, dmi, say, rain, shower, cg, afraid]
323306     [deesos, xy, giirl, thought, thats, wut, reali...
Name: tokens, Length: 10000, dtype: object


Use TF-IDF vectorization to project our Spotify library into the same feature space as the training data, then apply logistic regression to the transformed data.

In [None]:
# Build the TF-IDF vocabulary from the training texts, tokenizing with preprocess().
vectorizer = TfidfVectorizer(
    # Pass the function itself: a wrapping lambda adds nothing and prevents
    # the fitted vectorizer from being pickled.
    tokenizer=preprocess,
    # token_pattern is ignored when a tokenizer is supplied; setting it to
    # None silences scikit-learn's "token_pattern will not be used" warning.
    token_pattern=None,
    lowercase=False,
    min_df=5, max_df=0.8
)

X_train = vectorizer.fit_transform(sent_df['text'])
y_train = sent_df['label']

# Transform the Spotify lyrics into the same TF-IDF space. Missing lyrics are
# filled with "" and everything is cast to str so transform() won't break.
lyrics = sentiment_df['lyrics'].fillna("").astype(str).tolist()
X_sentiment_df = vectorizer.transform(lyrics)



In [None]:
# Fit the logistic-regression sentiment classifier on the TF-IDF features.
clf = LogisticRegression(max_iter=200)
clf.fit(X=X_train, y=y_train)

In [None]:
# Column 1 of predict_proba holds the probability of the label-1 class.
positive_proba = clf.predict_proba(X_sentiment_df)[:, 1]
sentiment_df['Sentiment_Range'] = positive_proba
sentiment_df.head()

Unnamed: 0,Country,Uri,Popularity,Title,Artist,Album/Single,Artist_followers,Explicit,Album,Release_date,...,Released_after_2017,LDA_Topic,Popu_max,Cluster,lyrics,language,Country_name,Genre_name,tokens,Sentiment_Range
0,1,https://open.spotify.com/track/1xy4apMFecGGkgB...,1598.45,chasing fire,Lauv,0,3587039.0,0,Chasing Fire,2018-03-29,...,1.0,6.0,96,global,31 ContributorsTranslationsPortuguêsChasing Fi...,en,Global,pop,"[fire, lyric, wanna, hate, cause, still, love,...",0.377025
1,2,https://open.spotify.com/track/1xy4apMFecGGkgB...,1369.6,chasing fire,Lauv,0,3587039.0,0,Chasing Fire,2018-03-29,...,1.0,6.0,103,english speaking and nordic,31 ContributorsTranslationsPortuguêsChasing Fi...,en,USA,pop,"[fire, lyric, wanna, hate, cause, still, love,...",0.377025
2,18,https://open.spotify.com/track/1xy4apMFecGGkgB...,500.0,chasing fire,Lauv,0,3587039.0,0,Chasing Fire,2018-03-29,...,1.0,6.0,114,english speaking and nordic,31 ContributorsTranslationsPortuguêsChasing Fi...,en,Austria,pop,"[fire, lyric, wanna, hate, cause, still, love,...",0.377025
3,19,https://open.spotify.com/track/1xy4apMFecGGkgB...,4149.0,chasing fire,Lauv,0,3587039.0,0,Chasing Fire,2018-03-29,...,1.0,6.0,54,english speaking and nordic,31 ContributorsTranslationsPortuguêsChasing Fi...,en,Australia,pop,"[fire, lyric, wanna, hate, cause, still, love,...",0.377025
4,4,https://open.spotify.com/track/1xy4apMFecGGkgB...,942.4,chasing fire,Lauv,0,3587039.0,0,Chasing Fire,2018-03-29,...,1.0,6.0,89,english speaking and nordic,31 ContributorsTranslationsPortuguêsChasing Fi...,en,Belgium,pop,"[fire, lyric, wanna, hate, cause, still, love,...",0.377025


In [None]:
# Persist the lyrics frame with its new sentiment column (row index omitted).
sentiment_df_path = f'{folder_path}spotify_with_sentiment.csv'
sentiment_df.to_csv(path_or_buf=sentiment_df_path, index=False)

## LDA Topic Analysis

In [None]:
# Load the translated Spotify lyrics data for topic modelling.
# low_memory=False makes pandas infer each column's dtype from the whole file,
# which avoids the mixed-type DtypeWarning and gives consistent column dtypes.
translated_data_path = folder_path + 'spotifydata_translated_combined.csv'
LDA_topic_df = pd.read_csv(translated_data_path, low_memory=False)

print(f'Dim: {LDA_topic_df.shape}')

  LDA_topic_df = pd.read_csv(translated_data_path)


Dim: (122912, 35)


In [None]:
# Show the file ids (one per language) available in NLTK's stopwords corpus.
print(stopwords.fileids())

['albanian', 'arabic', 'azerbaijani', 'basque', 'belarusian', 'bengali', 'catalan', 'chinese', 'danish', 'dutch', 'english', 'finnish', 'french', 'german', 'greek', 'hebrew', 'hinglish', 'hungarian', 'indonesian', 'italian', 'kazakh', 'nepali', 'norwegian', 'portuguese', 'romanian', 'russian', 'slovene', 'spanish', 'swedish', 'tajik', 'tamil', 'turkish']


Add additional stop words to remove filler words that are not significant to the meaning of the lyrics. Also clean up the text with the preprocess function.

In [None]:
# Extra filler words (interjections, vocalisations and very common verbs) to
# drop from the lyrics in addition to NLTK's English stop words.
extra_stop_words = [
    'yes', 'yah', 'ey', 'mi', 'oh', 'da', 'woah', 'mm',
    'mmm', 'dum', 'hmm', 'ooh', 'la', 'ah', 'na', 'eh', 'uh',
    'ha', 'ayy', 'em', 'woo',
    'got', 'wanna', 'gonna', 'gotta', 'gon', 'cause', 'way', 'right', 'say',
    'keep', 'see', 'want', 'feel', 'let', 'come', 'still', 'back', 'tell',
    'already', 'ya', 'hey', 'lo', 'aye', 'ba', 'beh', 'yeh', 'ddu', 'ay',
    'ta', 'one', 'u', 'make', 'could', 'would', 'always',
]
# Append only the words that are not present yet, so re-running this cell does
# not keep growing `stop_words` with duplicates.
missing_stop_words = [
    word for word in dict.fromkeys(extra_stop_words) if word not in stop_words
]
stop_words.extend(missing_stop_words)
lemmatizer = WordNetLemmatizer()

def preprocess(text):
    """Turn raw text into a list of lemmatized tokens with stop words removed.

    Bracketed spans (section headers such as ``[Chorus]``) are stripped first.
    The text is then tokenized with gensim's ``simple_preprocess`` using
    ``deacc=True``, tokens found in the module-level ``stop_words`` are
    dropped, and the remaining tokens are lemmatized with the module-level
    ``lemmatizer``.

    Args:
        text: The raw text. Anything that is not a ``str`` (for example
            ``None`` or the float ``NaN`` pandas uses for a missing value)
            is treated as empty.

    Returns:
        list: The cleaned tokens in their original order; ``[]`` for
        non-string input.
    """
    # Missing lyrics arrive as NaN and would make re.sub raise a TypeError.
    if not isinstance(text, str):
        return []

    # Remove section headers, i.e. anything wrapped in square brackets.
    text = re.sub(r"\[[^\]]+\]", "", text)

    # Snapshot the stop words as a set once per call so each membership test
    # is O(1) instead of a scan of the whole list.
    blocked = set(stop_words)
    return [
        lemmatizer.lemmatize(word)
        for word in simple_preprocess(text, deacc=True)
        if word not in blocked
    ]

In [None]:
# Tokenize every lyric. Missing lyrics are replaced with "" and everything is
# cast to str first, because preprocess() applies a regex and would raise a
# TypeError on NaN.
LDA_topic_df['tokens'] = LDA_topic_df['lyrics'].fillna("").astype(str).apply(preprocess)

Save the LDA token csv to prevent retokenization

In [None]:
token_path = folder_path + 'spotify_lda_tokens.csv'
# index=False keeps the row index out of the file; otherwise it comes back as
# an extra unnamed column when the CSV is reloaded.
LDA_topic_df.to_csv(token_path, index=False)

# After the CSV round trip the token lists are strings such as "['a', 'b']";
# the next cell parses them back with ast.literal_eval.
LDA_topic_df = pd.read_csv(token_path, low_memory=False)
# Sanity check: number of rows whose token cell is missing.
LDA_topic_df['tokens'].isna().sum()

Create a corpus that consists of the tokenized words and how often they occur in each song.

In [None]:
# LDA_topic_df = LDA_topic_df.sample(5000)

def _parse_tokens(value):
    """Return a token list, parsing the CSV string form when necessary.

    Values that are already lists are returned unchanged, so this cell can be
    re-run safely; anything else (e.g. a missing cell) becomes an empty list.
    """
    if isinstance(value, str):
        return ast.literal_eval(value)
    return value if isinstance(value, list) else []


LDA_topic_df['tokens'] = LDA_topic_df['tokens'].apply(_parse_tokens)
dictionary = corpora.Dictionary(LDA_topic_df['tokens'])
# Drop tokens that appear in fewer than 10 documents or in more than 25% of them.
dictionary.filter_extremes(no_below=10, no_above=0.25)

# Bag-of-words representation: one list of (token_id, count) pairs per song.
corpus = [dictionary.doc2bow(tokens) for tokens in LDA_topic_df['tokens']]

In [None]:
# Inspect the doc2bow output for the document at position 100.
print(corpus[100])

Pass the bag-of-words corpus and the dictionary to the LDA model to identify 8 clusters, which will be our topics.

In [None]:
num_topics = 8
# Keyword arguments make the call self-describing. random_state pins the
# model's random initialisation so topic ids are reproducible across runs
# (they are stored as a feature downstream).
lda_model = LdaModel(
    corpus=corpus,
    num_topics=num_topics,
    id2word=dictionary,
    passes=10,
    random_state=72,
)

In [None]:
# Print the ten highest-weighted words of each topic returned by show_topics.
for topic_id, weighted_terms in lda_model.show_topics(formatted=False, num_words=10):
    top_words = [term for term, _weight in weighted_terms]
    print(f"Topic {topic_id}: {', '.join(top_words)}")

In [None]:
# Document-topic matrix: one row per document, one column per topic id.
# Each row starts at zero and is filled by topic id, so columns stay aligned
# even if get_document_topics leaves out a topic with near-zero probability.
n_topics = lda_model.num_topics
topic_matrix = np.zeros((len(corpus), n_topics))

for row, bow in enumerate(corpus):
    doc_topics = lda_model.get_document_topics(bow, minimum_probability=0.0)
    for topic_id, prob in doc_topics:
        topic_matrix[row, topic_id] = prob

# Most probable topic per document: array of ints in [0, num_topics).
dominant_topics = topic_matrix.argmax(axis=1)

# Append back to the data frame as the column 'dominant_topic'. The index is
# reset so rows line up positionally with the corpus order.
LDA_topic_df = LDA_topic_df.reset_index(drop=True)
LDA_topic_df['dominant_topic'] = dominant_topics

print(LDA_topic_df[['lyrics','dominant_topic']].head())

## Combine Dataset


Merge the two datasets and download the resulting csv. It will then be cleaned and turned into numeric values in the Neural Network notebook.

In [None]:
# Reload both intermediate results from disk before merging them.
# NOTE(review): no visible cell writes 'spotify_lda_topics.csv' (the LDA
# section saves 'spotify_lda_tokens.csv'); confirm this file exists and
# contains the 'dominant_topic' column before running.
sentiment_df_path = folder_path + 'spotify_with_sentiment.csv'
LDA_topic_df_path = folder_path + 'spotify_lda_topics.csv'
# low_memory=False avoids the mixed-type DtypeWarning on these wide files.
sentiment_df = pd.read_csv(sentiment_df_path, low_memory=False)
LDA_topic_df = pd.read_csv(LDA_topic_df_path, low_memory=False)

# concat(axis=1) pairs rows by index label, so both files must describe the
# same rows in the same order. A length mismatch would otherwise silently
# produce NaN-padded, misaligned rows.
if len(sentiment_df) != len(LDA_topic_df):
    raise ValueError(
        f"Row count mismatch: {len(LDA_topic_df)} topic rows vs "
        f"{len(sentiment_df)} sentiment rows; the files are out of sync."
    )

lang_analysis_df = pd.concat([LDA_topic_df, sentiment_df["Sentiment_Range"]], axis=1)

  sentiment_df = pd.read_csv(sentiment_df_path)
  LDA_topic_df = pd.read_csv(LDA_topic_df_path)


In [None]:
# Add the 'senti_score' column stored in '2_spotify_sentiment.csv'.
smaller_sentiment_path = folder_path  + '2_spotify_sentiment.csv'
# low_memory=False avoids the mixed-type DtypeWarning this read emits.
smaller_sentiment_df = pd.read_csv(smaller_sentiment_path, low_memory=False)

# NOTE(review): concat(axis=1) aligns on the row index; if this file has a
# different number of rows, the unmatched rows get NaN - confirm that is intended.
lang_analysis_df = pd.concat([lang_analysis_df, smaller_sentiment_df["senti_score"]], axis=1)

  smaller_sentiment_df = pd.read_csv(smaller_sentiment_path)


In [None]:
# Remove columns that are no longer needed: the raw lyrics, the
# 'Unnamed: 0' column and the pre-existing 'LDA_Topic' column.
unneeded_columns = ['lyrics', 'Unnamed: 0', 'LDA_Topic']
lang_analysis_df = lang_analysis_df.drop(columns=unneeded_columns)

lang_analysis_df.head()

In [None]:
# Write the combined language-analysis frame to the project folder.
saved_lang_analysis_path = f'{folder_path}combined_lang_analysis_removed.csv'
lang_analysis_df.to_csv(path_or_buf=saved_lang_analysis_path)

## Clean Dataset to Numeric Values

In [None]:
# Reload the combined dataset saved as 'combined_lang_analysis_removed.csv'.
# The path is built from `folder_path` (instead of a hard-coded absolute path)
# so the cell keeps working when the project folder is changed at the top.
db = pd.read_csv(folder_path + 'combined_lang_analysis_removed.csv')
db.columns

In [None]:
# Columns kept for the neural-network dataset.
# Author's positional notes (presumably column positions in `db` - verify):
# 7: explicit, 12: danceability, 24: time signature, 27: released after 2017
cols = ['Country', 'Popularity', 'Album/Single', 'Artist_followers', 'Explicit',
       'danceability', 'energy', 'key', 'loudness', 'mode', 'speechiness',
       'acoustics', 'instrumentalness', 'liveliness', 'valence', 'tempo',
       'duration_ms', 'time_signature', 'Genre', 'Days_since_release',
       'Released_after_2017', 'senti_score', 'Sentiment_Range', 'dominant_topic']
# .copy() gives an independent frame, so the column assignments in the next
# cell modify NN_df itself and do not raise SettingWithCopyWarning.
NN_df = db.loc[:, cols].copy()
NN_df.columns

In [None]:

# Coerce the feature columns to floats and map out their Spearman correlations.
# NOTE(review): `countries` is collected but not used below; the correlation
# is computed over all rows, not per country.
countries = NN_df['Country'].unique()
numeric_cols = ['Popularity', 'Album/Single', 'Artist_followers',
       'Explicit', 'danceability', 'energy', 'key', 'loudness', 'mode',
       'speechiness', 'acoustics', 'instrumentalness', 'liveliness', 'valence',
       'tempo', 'duration_ms', 'time_signature', 'Genre', 'Released_after_2017',
        'senti_score', 'Sentiment_Range', 'dominant_topic']


def _to_clean_float(column):
    """Strip whitespace and parentheses from a column and convert it to float.

    Values that cannot be parsed as numbers (including missing ones) become 0.
    """
    cleaned = (
        column.astype(str)
        .str.strip()
        # regex=False: '(' and ')' are literal characters, not regex syntax.
        # Being explicit avoids depending on the pandas-version-specific default.
        .str.replace('(', '', regex=False)
        .str.replace(')', '', regex=False)
    )
    return pd.to_numeric(cleaned, errors='coerce').fillna(0).astype(float)


for col_name in numeric_cols:
    NN_df[col_name] = _to_clean_float(NN_df[col_name])
df_corr_matrix = NN_df[numeric_cols].corr(method='spearman')
df_corr_matrix.style.background_gradient(cmap='Blues')

In [None]:
# Discard rows that still contain missing values, then write the final dataset.
NN_df = NN_df.dropna()

NN_df_path = f'{folder_path}(4) FINAL_CLEAN_SPOTIFY_DATA.csv'
NN_df.to_csv(path_or_buf=NN_df_path)