# AE1 - Working with Text

## Cleaning Data

In [None]:
# Loading and cleaning data

import pandas as pd
import numpy as np
import nltk
import sys
import re
import requests
import random
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter
from wordcloud import WordCloud

from nltk.corpus import stopwords # Importing stopwords
from nltk.tokenize import word_tokenize # Importing the word tokenizer
from nltk.sentiment.vader import SentimentIntensityAnalyzer # VADER Sentiment Analyzer
import nltk.data
# Download the NLTK resources used by the cells below
nltk.download('punkt') # For tokenization
# NLTK >= 3.8.2 loads 'punkt_tab' instead of 'punkt' inside word_tokenize.
# NOTE(review): on older releases this request is expected to only report an
# unknown package and return False - confirm against the installed version.
nltk.download('punkt_tab')
nltk.download('stopwords') # For stopwords
nltk.download('vader_lexicon') # Lexicon for the VADER sentiment analyzer

# Import dataset
musicDataDF = pd.read_csv('spotify_millsongdata.csv')

# Remove unnecessary columns (link); `columns=` already selects the axis
musicDataDF = musicDataDF.drop(columns=['link'])

# Work on a copy so the original-case lyrics stay available in musicDataDF
musicDataDFCleaned = musicDataDF.copy()

# Removing special characters, numbers, and unnecessary whitespace for each column and making lowercase
for column in list(musicDataDFCleaned.columns.values):
    musicDataDFCleaned[column] = (
        musicDataDFCleaned[column]
        # Only preserve letters and whitespace
        .str.replace(r'[^a-zA-Z\s]', '', regex=True)
        # Removing leading or trailing whitespace; the .str accessor skips
        # missing values instead of raising like a plain x.strip() would
        .str.strip()
        # Normalize all text to lowercase
        .str.lower()
    )

# Removing meaningless words
# Reach the corpus through the nltk package: the name `stopwords` is rebound to
# a plain list on the next line, so `stopwords.words(...)` would fail if this
# cell were run a second time.
stopwords = nltk.corpus.stopwords.words('english') #I.e. 'I', 'You', 'Your', etc.
# Lyric-sheet annotations and filler syllables. 'chorus' was previously only
# listed under the misspelling 'chrous', which is kept in case it occurs in the data.
stopwords = stopwords + ['refrain', 'chorus', 'chrous', 'verse', 'oh', 'ooh', 'ah', 'im', 'la', 'yeah', 'na', 'dont']

def remove_words(text, words_to_remove):
    """Return ``text`` with every word found in ``words_to_remove`` dropped.

    Args:
        text: String that is split on whitespace into words.
        words_to_remove: Iterable of words to filter out (exact, case-sensitive match).

    Returns:
        The remaining words joined by single spaces, in their original order.
    """
    # Build the lookup set once so each membership test is O(1) rather than
    # a scan over the whole stopword list.
    blocked = set(words_to_remove)
    return ' '.join(word for word in text.split() if word not in blocked)

# Strip the stopwords out of every song's lyrics (the 'text' column)
musicDataDFCleaned['text'] = musicDataDFCleaned['text'].apply(remove_words, args=(stopwords,))


## Analyzing and Visualizing Text

In [None]:
artistInput = input("Please enter one or more artists (seperated by commas) to view their word usage and general lyrical sentiment. ").lower()

# Turn other common separators into commas in case the user used the wrong delimiter
bad_chars = [';', ':', ".", '/','|', ', ']
for char in bad_chars:
    artistInput = artistInput.replace(char, ',')

# Trim the whitespace around each name and drop empty entries, so input such as
# "abba , queen" or a trailing comma does not produce unmatched names
artists = [name.strip() for name in artistInput.split(',') if name.strip()]

if not artists:
    print('Sorry, no artist was entered.')
    sys.exit()

# Set lookup instead of scanning the artist column once per name
knownArtists = set(musicDataDFCleaned['artist'])
for artist in artists:
    if artist not in knownArtists:
        print(f'Sorry there is no data for {artist}.')
        sys.exit()

# Initialize the model
sid = SentimentIntensityAnalyzer()

# VADER compound score for each row of lyrics, in row order.
# Iterating over the column values avoids pairing index labels with the
# positional .iloc accessor, which only lines up for a default 0..n-1 index.
compound = [sid.polarity_scores(lyrics)['compound'] for lyrics in musicDataDFCleaned['text']]
    
# Map a compound score onto one of three sentiment labels
def categorize_sentiment(compoundScore):
    """Label a score 'positive' above 0.5, 'negative' below -0.5, otherwise 'neutral'."""
    if compoundScore > 0.5:
        return 'positive'
    if compoundScore < -0.5:
        return 'negative'
    # Everything between the two thresholds (inclusive) is neutral
    return 'neutral'
    
# Attach the raw compound score and its label to the main dataframe
musicDataDFCleaned['compound'] = compound
# Label every song from its compound score
musicDataDFCleaned['sentiment'] = musicDataDFCleaned['compound'].map(categorize_sentiment)

def get_word_frequency(lyrics):
    """Tokenize ``lyrics`` with word_tokenize and return a Counter of the tokens."""
    return Counter(word_tokenize(lyrics))

# One space-joined lyric string per artist
grouped = musicDataDFCleaned.groupby('artist')['text'].apply(' '.join)

# Word counts for every artist, keyed by artist name
word_frequencies = {
    musician: get_word_frequency(lyrics)
    for musician, lyrics in grouped.items()
}

# For each requested artist: print the top words and vocabulary size, then show
# a word cloud, a bar chart of the top words, and a bar chart of song sentiments.
for artist in artists:
    # Ten most frequent words as (word, count) pairs
    wordFreqList = word_frequencies[artist].most_common(10)
    # Number of distinct tokens in the artist's Counter
    vocabRichness = len(word_frequencies[artist])
    print(f'\nThe top 10 most used words by {artist.title()} are: \n{wordFreqList}')
    print(f"{artist.title()}'s vocabulary richness (unique words used) was {vocabRichness}")
    
    # Build the word cloud from the artist's combined (stopword-free) lyrics
    wordcloud = WordCloud(width = 200, height = 200,
                background_color ='white',
                min_font_size = 5).generate(grouped.loc[artist])

    # Figure 1: render the word cloud image without axes
    plt.figure(figsize = (4, 4), facecolor = None)
    plt.imshow(wordcloud)
    plt.axis("off")
    plt.tight_layout(pad = 0)
    plt.show()
    
    # Figure 2: bar chart of the same top-10 words (titled a histogram below)
    words = [tup[0] for tup in wordFreqList]
    frequencies = [tup[1] for tup in wordFreqList]
    plt.figure(figsize=(6, 3))
    plt.bar(words, frequencies)
    plt.xlabel('Words')
    plt.ylabel('Frequency')
    plt.title('Word Frequency Histogram')
    plt.xticks(rotation=90)  
    plt.tight_layout()  
    plt.show()
    
    # Keep only this artist's songs
    artist_df = musicDataDFCleaned[musicDataDFCleaned['artist'] == artist]

    # Number of songs per sentiment label for this artist
    sentimentScores = artist_df['sentiment'].value_counts()
    print(f"\nThe sentiments for {artist.title()}'s songs are: \n{sentimentScores}")

    # Figure 3: a new figure is opened first, then the Series is plotted as bars
    plt.figure(figsize=(6, 3))
    sentimentScores.plot(kind='bar', color='skyblue')
    
    plt.title('Number of Songs by Sentiment')
    plt.xlabel('Sentiment')
    plt.ylabel('Song Count')
    plt.xticks(rotation=0)
    plt.grid(axis='y', linestyle='--', alpha=0.7)
    
    plt.tight_layout()
    plt.show()

### Comparing Artists

In [None]:
# Comparison of the artists
print('Comparing artists:')
artistWordFreqDict = {}
artistSentimentScoreDict = {}

# Collect each artist's top-10 words and sentiment counts
for artist in artists:
    artistWordFreqDict[artist] = word_frequencies[artist].most_common(10)

    artist_df = musicDataDFCleaned[musicDataDFCleaned['artist'] == artist]
    artistSentimentScoreDict[artist] = artist_df['sentiment'].value_counts()

# Words (without their counts) that appear in every artist's top-10 list
topWordSets = [{word for word, _ in wordFreq} for wordFreq in artistWordFreqDict.values()]
overlapWordList = set.intersection(*topWordSets)

if overlapWordList:
    print(f'The overlapping commonly used words between your artists are: {overlapWordList}')
else:
    print("There were no overlapping commonly used words between your artists")
    
# Visualizing overlap between the first two artists' top words.
# The heatmap needs two artists, so skip it instead of raising IndexError.
if len(artists) >= 2:
    # word -> count lookups built from the (word, count) top-10 lists
    rowCounts = dict(artistWordFreqDict[artists[0]])
    columnCounts = dict(artistWordFreqDict[artists[1]])

    # Sorted list of all unique words: one fixed order shared by the matrix and both tick-label sets
    all_words = sorted(set(rowCounts) | set(columnCounts))

    # Create a matrix
    matrix = np.zeros((len(all_words), len(all_words)))

    # Cell (i, j) = first artist's count of row word i + second artist's count of column word j
    for i, word1 in enumerate(all_words):
        for j, word2 in enumerate(all_words):
            matrix[i][j] = rowCounts.get(word1, 0) + columnCounts.get(word2, 0)

    # Create a heatmap
    sns.set(font_scale=0.5)
    plt.figure(figsize=(4, 3))
    sns.heatmap(matrix, annot=True, fmt='g', cmap='YlGnBu', xticklabels=all_words, yticklabels=all_words)
    # Columns (x axis) carry the second artist's counts and rows (y axis) the
    # first artist's, so the labels follow that orientation.
    plt.xlabel(f'Lyrics for {artists[1].title()}')
    plt.ylabel(f'Lyrics for {artists[0].title()}')
    plt.title('Heatmap of Word Frequency Counts')
    plt.show()
else:
    print("Enter at least 2 artists if you want to see the word frequency heatmap.")
    
# Analyzing the correlation of sentiment values
if len(artists) == 2: # Only works if user inputted 2 artists
    # value_counts() omits labels with no songs; reindex so both artists are
    # compared over all three labels, counting a missing label as 0 songs.
    sentimentLabels = ['positive', 'neutral', 'negative']
    firstSentiments = artistSentimentScoreDict[artists[0]].reindex(sentimentLabels, fill_value=0)
    secondSentiments = artistSentimentScoreDict[artists[1]].reindex(sentimentLabels, fill_value=0)
    correlationValue = firstSentiments.corr(secondSentiments)

    if pd.isna(correlationValue):
        # NaN fails both comparisons below and would otherwise be reported as "opposite"
        print("The correlation between your artists' sentiment scores could not be computed.")
    elif correlationValue > 0.4:
        print(f'Your artists have a very similar sentiment score. In fact, their correlation is {round(correlationValue, 2)}')
    elif correlationValue > -0.4:
        print("There is no strong correlation between your artists' sentiment scores")
    else:
        print(f'Your artists have an opposite sentiment score. In fact, their correlation is {round(correlationValue, 2)}')
else:
    print("Only enter 2 artists if you want to see their sentiment score correlation.")

## Lyric Generation - N-gram and Markov Chaining

In [None]:
# Markov Chaining Lyric Generation

# Cleaning data in different fashion: the lyrics keep their original case here,
# and only the artist names are lowercased.

# String (object dtype) columns
string_columns = musicDataDF.select_dtypes(include='object').columns

# Removing special characters, numbers, and unnecessary whitespace for each column
for column in string_columns:
    musicDataDF[column] = (
        musicDataDF[column]
        # Only preserve letters and whitespace
        .str.replace(r'[^a-zA-Z\s]', '', regex=True)
        # Removing leading or trailing whitespace; the .str accessor skips
        # missing values instead of raising like a plain x.strip() would
        .str.strip()
    )

musicDataDF['artist'] = musicDataDF['artist'].str.lower()
    
artistInput = input("Please enter one or more artists (seperated by commas) to generate a new song for. ").lower()

# Turn other common separators into commas in case the user used the wrong delimiter
bad_chars = [';', ':', ".", '/','|', ', ']
for char in bad_chars:
    artistInput = artistInput.replace(char, ',')

# Trim the whitespace around each name and drop empty entries, so input such as
# "abba , queen" or a trailing comma does not produce unmatched names
artists = [name.strip() for name in artistInput.split(',') if name.strip()]

if not artists:
    print('Sorry, no artist was entered.')
    sys.exit()

# Validate against musicDataDF, the frame the lyrics are taken from below
knownArtists = set(musicDataDF['artist'])
for artist in artists:
    if artist not in knownArtists:
        print(f'Sorry there is no data for {artist}.')
        sys.exit()
        
# The stopword-filtered frame is skipped here: generation needs every lyric
groupedAllLyrics = musicDataDF.groupby('artist')['text'].apply(' '.join)

# All chosen artists' lyrics as one flat list of words
totalLyrics = ' '.join(groupedAllLyrics.loc[artist] for artist in artists).split()

markovOrN_Gram = input('Would you like a Markov Chaining based song or N-Gram based song? ').lower()
numOfSongs = int(input('Please enter how many songs you want produced. '))
songLength = int(input('Please enter how many words you want your songs to be. '))


if 'markov' in markovOrN_Gram: # Markov chaining method

    def nextWord(currentSong, lyricBank):
        """Choose the next word by matching the tail of the song against the corpus.

        For every context length k (the last k items of ``currentSong``), each
        place where that context occurs in ``lyricBank`` contributes the word
        that follows it. Longer contexts add further entries for the same
        followers, so words that continue a longer match are more likely.

        Args:
            currentSong: List of words generated so far.
            lyricBank: List of corpus words, in order.

        Returns:
            The chosen word as ``str``. When a context matches at the very end
            of ``lyricBank`` the filler ``'oh'`` is offered instead of a
            follower. If nothing matches at all, a random corpus word is
            returned (or the filler when ``lyricBank`` is empty).
        """
        filler = 'oh'
        nextOptions = []
        bankSize = len(lyricBank)

        for contextLen in range(1, len(currentSong) + 1):
            context = currentSong[-contextLen:]
            matched = False
            for start in range(bankSize - contextLen + 1):
                if lyricBank[start:start + contextLen] != context:
                    continue
                matched = True
                # The follower is the word right AFTER the whole matched context
                followIdx = start + contextLen
                if followIdx < bankSize:
                    nextOptions.append(lyricBank[followIdx])
                else:
                    # Context sits at the very end of the corpus: nothing follows it
                    nextOptions.append(filler)
            if not matched:
                # A longer context contains this one, so it cannot match either
                break

        if not nextOptions:
            # Restart from a random corpus word rather than failing on an empty choice
            return str(np.random.choice(lyricBank)) if bankSize else filler
        return str(np.random.choice(nextOptions))

    # Candidate opening words: any word that is not entirely lowercase. Picking
    # from this list is equivalent to re-drawing until such a word comes up,
    # without looping forever when the corpus has none.
    openingWords = [word for word in totalLyrics if not word.islower()]

    for i in range(numOfSongs):
        print(f'Song {i+1}:')
        # Fall back to the whole corpus if it contains no capitalized word
        first_word = str(np.random.choice(openingWords if openingWords else totalLyrics))

        chain = [first_word]

        n_words = songLength

        # `_` keeps the outer song counter `i` from being overwritten
        for _ in range(n_words):
            newWord = nextWord(chain, totalLyrics)
            # A capitalized word (other than 'I') starts a new line
            if newWord != newWord.lower() and newWord != 'I':
                chain.append("\n")
            chain.append(newWord)

        print(' '.join(chain))
        print('\n')

else:
    numOfGrams = int(input('Please enter how many grams you want used. '))

    # Seed words: title-cased tokens, leaving out a few words that are capitalized wherever they appear
    alwaysCapitalized = ("I", "Id", "Ive", "God")
    seeds = [
        word
        for word in totalLyrics
        if word.istitle() and word not in alwaysCapitalized
    ]

    def nGrams(words, n):
        """Map each word to the list of n-word tuples that follow its occurrences.

        Only the first ``len(words) - n`` positions are used as keys.
        """
        ngramDict = {}
        for position in range(len(words) - n):
            following = tuple(words[position + 1:position + n + 1])
            ngramDict.setdefault(words[position], []).append(following)
        return ngramDict
    
        
    def generateSong(seeds, songLength, ngrams):
        """Generate a song of ``songLength`` words by chaining n-gram continuations.

        Starts from a random seed word, then repeatedly appends one of the
        tuples recorded after the current word and continues from the last
        word appended. A new line is started whenever the first word of the
        appended group is capitalized.

        Args:
            seeds: Non-empty sequence of possible opening words.
            songLength: Number of words to produce; values <= 0 give "".
            ngrams: Mapping of word -> list of word tuples that follow it.

        Returns:
            The song as a single string.

        Raises:
            IndexError: If ``seeds`` is empty (raised by ``random.choice``).
        """
        if songLength <= 0:
            return ""

        curr_word = str(random.choice(seeds))
        # The seed is the first word of the song and counts towards its length
        parts = [curr_word]
        wordCount = 1

        while wordCount < songLength:
            followers = ngrams.get(curr_word)
            if followers:
                # Trim the group so the song never exceeds songLength words
                nextTuple = random.choice(followers)[:songLength - wordCount]
            else:
                nextTuple = ()
            if not nextTuple:
                # Dead end (word has no recorded continuation): restart from a fresh seed
                nextTuple = (str(random.choice(seeds)),)

            separator = '\n' if nextTuple[0] != nextTuple[0].lower() else ' '
            parts.append(separator + ' '.join(nextTuple))

            wordCount += len(nextTuple)
            # Continue the chain from the last word just added
            curr_word = nextTuple[-1]

        return ''.join(parts)
    
    # Build the n-gram table once: it depends only on the corpus and n, not on the song
    ngramTable = nGrams(totalLyrics, numOfGrams)

    for i in range(numOfSongs):
        print(f'Song {i+1}:')
        print(generateSong(seeds, songLength, ngramTable))
        print('\n')

## Above and Beyond

### Song recommender

In [None]:
# Song recommender

# based on key words, sentiments of inputted song(s)
print('Please format your responce like: Bang by Abba, Hammer to Fall by Queen, etc.')
songsAndArtists = input('Please input one or more songs and their artists you like to recieve similar recommendations. ').lower()

# Turn other common separators into commas in case the user used the wrong delimiter
bad_chars = [';', ':', ".", '/','|', ', ']
for char in bad_chars:
    songsAndArtists = songsAndArtists.replace(char, ',')

# Song -> artist pairs entered by the user
songsAndArtists = songsAndArtists.split(',')
songsDict = {}

for pair in songsAndArtists:
    pair = pair.strip()
    if not pair:
        # Skip empty entries such as the one left by a trailing delimiter
        continue

    # Split on the LAST ' by ' so titles that contain "by" still parse
    song, separator, artist = pair.rpartition(' by ')
    if not separator:
        print(f'Sorry, "{pair}" is not in the format: song by artist.')
        sys.exit()

    # Apply the same normalization that was applied to the dataset columns
    song = re.sub(r'[^a-zA-Z\s]', '', song).strip()
    artist = re.sub(r'[^a-zA-Z\s]', '', artist).strip()

    # check validity: the song must exist for THAT artist, not just somewhere in the data
    isKnownPair = ((musicDataDFCleaned['song'] == song) & (musicDataDFCleaned['artist'] == artist)).any()
    if not isKnownPair:
        print(f'Sorry there is no data for that song.')
        sys.exit()
    songsDict[song] = artist

if not songsDict:
    print('Sorry, no song was entered.')
    sys.exit()

# Top 15 most common words of a word list, with their counts
def get_top_15_words(words):
    """Return up to 15 ``(word, count)`` pairs, most frequent first."""
    return Counter(words).most_common(15)

# Creating new column with the top 15 words used in each song
musicDataDFCleaned['top words'] = musicDataDFCleaned['text'].apply(lambda x: get_top_15_words(x.split()))

sentimentScores = []
commonWords = []

# Collect the sentiment label and top words of every song the user entered
for song, artist in songsDict.items():
    rowIndex = (musicDataDFCleaned['song'] == song) & (musicDataDFCleaned['artist'] == artist)
    desiredRow = musicDataDFCleaned[rowIndex]
    if desiredRow.empty:
        # No row for this song/artist pair: skip it rather than fail on .values[0]
        continue
    firstMatch = desiredRow.iloc[0]
    # Store the label itself; appending `.values` stored a numpy array per song
    sentimentScores.append(firstMatch['sentiment'])
    commonWords.extend(firstMatch['top words'])

# get most common sentiment
def getSentiment(sentimentsList):
    """Return the dominant label among 'positive', 'negative' and 'neutral'.

    Any entry that is neither 'positive' nor 'negative' is counted as neutral.
    'positive' (or 'negative') is returned when it strictly outnumbers the
    opposite label and is at least as frequent as neutral; every other case,
    including an empty list, gives 'neutral'.
    """
    pos = 0
    neg = 0
    neu = 0
    for sent in sentimentsList:
        if sent == 'positive':
            pos += 1
        elif sent == 'negative':
            neg += 1
        else:
            neu += 1
    # The neutral tally takes part in the decision: a mostly neutral list is
    # not reported as positive or negative because of a single such song.
    if pos > neg and pos >= neu:
        return 'positive'
    if neg > pos and neg >= neu:
        return 'negative'
    return 'neutral'

# check overlapping words between two top-word lists
def numOverlappingWords(setWords, wordsToCompare):
    """Count the distinct words that appear in both collections.

    Entries may be plain words or ``(word, count)`` tuples. Only the word is
    compared, so a word shared by both lists counts even when its frequency
    differs between them.
    """
    def justWords(entries):
        # Drop the count from (word, count) tuples; keep plain words as they are
        return {entry[0] if isinstance(entry, tuple) else entry for entry in entries}

    return len(justWords(setWords) & justWords(wordsToCompare))

sentiment = getSentiment(sentimentScores)
maxWordOverlap = 0
currentSong = ''
currentArtist = ''

# Find the song with the same sentiment that shares the most top words with the input songs
for index, row in musicDataDFCleaned.iterrows():
    # Skip songs with a different sentiment and the titles the user entered
    if row['sentiment'] != sentiment or row['song'] in songsDict:
        continue
    # Compute the overlap once per row
    overlap = numOverlappingWords(commonWords, row['top words'])
    if overlap > maxWordOverlap:
        maxWordOverlap = overlap
        currentSong = row['song']
        currentArtist = row['artist']

if currentSong:
    print(f'I recommend listening to {currentSong.title()} by {currentArtist.title()}')
else:
    # No candidate shared a single top word, so there is nothing to recommend
    print('Sorry, I could not find a similar song to recommend.')


### Adding Correlations

In [None]:
# New Data
albumDataDF = pd.read_csv('albumsSold.csv')

# Cleaning Data
albumDataDF = albumDataDF.drop(columns=['Artist ID']) # Remove unnecessary column
albumDataDF = albumDataDF.rename(columns={'Artist': 'artist'}) # Same column name as the lyrics frame
albumDataDF['artist'] = albumDataDF['artist'].str.lower() # Making artist name lowercase
# Changing Datatype
columnsConvert = ['Certified Units', 'Gold', 'Platinum', 'Multi-Platinum', 'Diamond']
albumDataDF = albumDataDF.astype({columnName: int for columnName in columnsConvert})

# Merging Dataframes: keep only artists present in both
musicAndAlbumsDF = musicDataDFCleaned.merge(albumDataDF, on='artist', how='inner')

# Number of songs and total certified units for each sentiment label
sentiment_counts = musicAndAlbumsDF['sentiment'].value_counts()
sentimentAlbumsGrouped = musicAndAlbumsDF.groupby('sentiment')['Certified Units'].sum().reset_index()

# Song counts looked up by label, so they follow the row order of sentimentAlbumsGrouped
alignedSongCounts = sentimentAlbumsGrouped['sentiment'].map(sentiment_counts)

# Normalize albums sold by the total count of songs for each sentiment category
sentimentAlbumsGrouped['normalizedAlbumsSold'] = sentimentAlbumsGrouped['Certified Units'] / alignedSongCounts

# Correlate values that belong to the same label. Reversing value_counts()
# ordered the counts by size, which need not match the grouped label order.
correlation_coefficient = np.corrcoef(alignedSongCounts.values, sentimentAlbumsGrouped['normalizedAlbumsSold'].values)[0, 1]

# Bar chart of normalized albums sold per sentiment, with the correlation in the title
chartTitle = f'Correlation between Sentiment Score Value Counts and Albums Sold \nCorrelation Coefficient: {correlation_coefficient:.2f}'

plt.figure(figsize=(10, 6))
sns.barplot(data=sentimentAlbumsGrouped, x='sentiment', y='normalizedAlbumsSold', palette='coolwarm')
plt.xlabel('Sentiment Score', fontsize = 9)
plt.ylabel('Normalized Albums Sold', fontsize = 9)
plt.title(chartTitle, fontsize = 11)
plt.grid(True)
plt.show()

## Reflection

I really enjoyed completing this project. It was interesting to compare the sentiment scores and word usage from different artists, particularly across genres. It was also fun to see lyric generation and song recommendations. I listened to the recommendations against the compared song and actually found them to be very similar. It was gratifying to see successful results from a simple idea (comparing sentiment scores and vocabulary richness/usage).

The main challenge I encountered during the project was dealing with complicated data structures. Throughout the project, I used layers of abstraction to create efficient code at the expense of readability and simplicity. For example, I created a dictionary with a list full of tuples as values. Therefore, I had to index 4 times to get the value with the tuple. This strategy has fast lookup time but was slightly confusing to code and produced many Index and Type Errors.

To solve this, I clearly outlined my code with comments and printed the type of each data structure throughout my code for debugging purposes. This helped me understand the different object types I was using and gave me ideas on how to solve other problems based on their unique qualities. For example, when finding the number of overlapping words between 2 songs, I realized I had 2 lists of top words in the dataframe. So, I could convert them to sets and use the built-in set intersection to find the overlap. The intersection is itself a set, so I could simply return the 'len' of the result to find the number of overlapping words. This task may not have been as easy if I had been using numpy arrays or a queue/stack. Furthermore, when determining vocabulary richness, I realized that the length of my dictionary, with words as keys and their frequencies as values, was also the number of unique words, so I could use the 'len' function again.

Throughout the project, I learned to write more generalized code. Early on, I was struggling to write code that could work for any number of artists and was hard-coding solutions with the prior knowledge that I would likely have 2 artists as input. However, as I kept working, I discovered techniques (like using dictionaries within for-each loops) to generalize my code (comparison, lyric generation, and song recommendation) to work for any number of artists or songs.

I also learned to take a wholly different approach when debugging rather than making small changes. For example, the n-gram method of lyric generation produced somewhat incomprehensible songs, so instead of trying to optimize it I implemented a Markov chaining method of lyric generation. I also believe it makes slightly better songs because it does not get 'stuck' in loops or repeat lyrics as often as n-grams.

For future iterations of this project, I would recommend adding web scraping to look for more information on each artist. I briefly attempted to implement it to look up lyrics for artists that were not in the dataset. I was able to scrape the age of individual artists from Wikipedia, but unfortunately, I could not find a generalized site with extensive data that I could use for any artist, so it didn't work well enough. However, in the future, perhaps there could be a site that lists the genre, age, ethnicity, usual tempo, etc. of each artist so there is more data to compare them.
