In [15]:
import os  # Import the os library for reading environment variables (e.g. the Last.fm API key)

import pickle  # Import the pickle library for saving and loading Python objects

import pandas as pd  # Import the pandas library for data manipulation (working with DataFrames and Series)

import numpy as np  # Import the numpy library for numerical computations (arrays, matrices, etc.)

from sklearn.feature_extraction.text import TfidfVectorizer  # Import TfidfVectorizer from scikit-learn for creating TF-IDF vector representations of text data

from sklearn.metrics.pairwise import cosine_similarity  # Import cosine_similarity function from scikit-learn for calculating similarity between vectors

import nltk  # Import the Natural Language Toolkit (NLTK) library for text processing tasks

from nltk.stem.porter import PorterStemmer  # Import PorterStemmer class from NLTK for stemming words (reducing them to their base form)

import requests  # Import the requests library for making HTTP requests (potentially for fetching song data)

from google.colab import drive  # Import functionality from Google Colab to mount your Google Drive

drive.mount('/content/drive')  # Mount your Google Drive to access data stored there (assuming you're using Google Colab)


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [16]:
# Tunable settings for this cell.
SAMPLE_SIZE = 20000   # upper bound on the number of songs kept for the similarity model
RANDOM_STATE = 42     # fixed seed so Restart & Run All always draws the same sample

# Read data from CSV file and drop the 'link' column, which is not used below.
df = pd.read_csv('spotify_millsongdata.csv').drop(columns='link')

# Preprocess the 'text' column for better vectorization:
#   - Lowercase all characters
#   - Replace newlines with spaces
# The previous version also chained `.replace(r'\w\s', ' ')` without regex=True.
# Series.replace then only matches cells whose *entire* value equals that literal
# string, so it never changed anything; had it been enabled as a regex it would
# have deleted the last letter of every word. It is therefore removed.
df['text'] = df['text'].str.lower().str.replace('\n', ' ', regex=False)

# Keep at most SAMPLE_SIZE rows (the full pairwise similarity matrix grows
# quadratically with the row count). min() avoids a ValueError when the file
# holds fewer rows than SAMPLE_SIZE; reset_index makes labels equal positions.
df = (
    df.sample(n=min(SAMPLE_SIZE, len(df)), random_state=RANDOM_STATE)
      .reset_index(drop=True)
)


In [17]:
# Tokenization function
stemmer = PorterStemmer()  # Shared Porter stemmer instance reused for every call below


def tokenization(text):
    """
    Tokenize a text string and stem every resulting token.

    Args:
        text (str): Raw text to process.

    Returns:
        str: The stemmed tokens joined by single spaces.
    """
    # Split into word tokens with NLTK, stem each one, and glue them back together.
    return " ".join(stemmer.stem(token) for token in nltk.word_tokenize(text))


In [18]:
# Stem every lyric in place: each 'text' value is run through tokenization().
# NOTE: this overwrites the column, so re-running the cell stems the text again.
df['text'] = df['text'].map(tokenization)

In [19]:
# TF-IDF vectorization: fit a word-level vectorizer on the stemmed lyrics and
# transform them into the document-term matrix in one step.
# NOTE(review): the lyrics were stemmed before this step, so stemmed forms may
# not match entries of the built-in English stop-word list - confirm this is acceptable.
tfidf_vectorizer = TfidfVectorizer(
    stop_words='english',
    analyzer='word',
)
matrix = tfidf_vectorizer.fit_transform(df['text'])

In [20]:
# Pairwise cosine similarity between every pair of rows of the TF-IDF matrix;
# entry [i][j] is the similarity between song i and song j.
similarity = cosine_similarity(X=matrix)

In [21]:
# Recommendation function
def recommendation(song_df, top_n=20):
    """
    Return the titles of the songs most similar to a given song.

    Relies on the module-level `df` (needs a 'song' column) and `similarity`
    (row i holds the similarity scores of the song at position i of `df`).

    Args:
        song_df (str): Title of the query song, matched exactly against df['song'].
            If several rows share the title, the first one is used.
        top_n (int, optional): Maximum number of recommendations. Defaults to 20.

    Returns:
        list: Song titles ordered from most to least similar, never containing
        the query row itself.

    Raises:
        IndexError: If no row of `df` has the requested title.
    """
    # Work with positions rather than index labels so the lookup stays aligned
    # with the rows of `similarity` even if `df` does not have a 0..n-1 index.
    positions = np.flatnonzero((df['song'] == song_df).to_numpy())
    if positions.size == 0:
        raise IndexError(f"Song {song_df!r} was not found in the dataset.")
    idx = int(positions[0])

    scores = np.asarray(similarity[idx])

    # Stable descending sort: ties keep their original row order.
    order = np.argsort(-scores, kind='stable')

    # Drop the query row explicitly. Skipping "the first result" is wrong when
    # another song ties at the top score (e.g. identical lyrics), because the
    # query song would then be recommended to itself.
    order = order[order != idx][:top_n]

    return df['song'].iloc[order].tolist()

In [22]:
# Build a lookup table: song title -> list of (row position, similarity score)
# for its most similar songs, ordered from most to least similar.
#
# The previous version called `.reshape` on a lyrics string (AttributeError on
# the first iteration) and rebound the global `similarity` inside the loop,
# which would have broken `recommendation`. This version reads the rows of the
# precomputed `similarity` matrix and never reassigns it.
similarity_dict = {}
top_n = 20  # number of neighbours stored per song

# `df` was re-indexed to 0..n-1 after sampling, so the enumerate position `i`
# is also the row of `similarity` that belongs to this song.
for i, song_title in enumerate(df['song']):
    row_scores = np.asarray(similarity[i])

    # Stable descending sort of the scores; ties keep their original row order.
    order = np.argsort(-row_scores, kind='stable')

    # Exclude the current song by position instead of assuming it sorts first
    # (another song can tie with it at the top score).
    sim_indices = order[order != i][:top_n]

    # Store plain Python ints/floats so the pickled dictionary does not depend
    # on numpy scalar types when it is loaded elsewhere.
    # NOTE: songs sharing a title overwrite each other; the last one wins.
    similarity_dict[song_title] = [(int(j), float(row_scores[j])) for j in sim_indices]


In [23]:
# Display the entry at position 0 of `similarity` as a quick sanity check.
similarity[0]

array([1.        , 0.00418043, 0.        , ..., 0.00363018, 0.00104286,
       0.01859492])

In [None]:
def get_album_cover(artist_name, song_name, api_key="YOUR_LAST_FM_API_KEY", timeout=10):
    """
    Fetches album cover URL for a given song using Last.fm API.

    Args:
        artist_name (str): Name of the artist.
        song_name (str): Title of the song.
        api_key (str, optional): Your Last.fm API key. Defaults to "YOUR_LAST_FM_API_KEY".
        timeout (float, optional): Seconds to wait for the HTTP request before
            giving up. Defaults to 10.

    Returns:
        str: URL of the album cover image (if found), or error message otherwise.
        Network failures and non-200 responses both yield
        "Error retrieving data from Last.fm API."; the function does not raise
        for them, so it is safe to use inside a row-wise DataFrame apply.
    """

    # Pass the query as `params` so requests URL-encodes every value. Splicing
    # the names into the URL by hand breaks for artists/titles containing
    # characters such as '&', '#', '?' or '+'.
    params = {
        "method": "track.getInfo",
        "api_key": api_key,
        "artist": artist_name,
        "track": song_name,
        "format": "json",
    }

    try:
        # Without a timeout a single stalled connection would hang the notebook.
        response = requests.get(
            "https://ws.audioscrobbler.com/2.0/", params=params, timeout=timeout
        )
    except requests.RequestException as e:
        print(f"Error retrieving data from Last.fm API: {e}")
        return "Error retrieving data from Last.fm API."

    if response.status_code != 200:
        print(f"Error retrieving data from Last.fm API. Status code: {response.status_code}")
        return "Error retrieving data from Last.fm API."

    try:
        data = response.json()
        if 'track' in data and 'album' in data['track'] and 'image' in data['track']['album']:
            # Take the last entry of the image list from the Last.fm response
            album_cover_url = data['track']['album']['image'][-1]['#text']
            print(album_cover_url)
            return album_cover_url

        print("Album cover not found.")
        return "Album cover not found."
    except Exception as e:
        # Deliberately broad: one malformed response must not abort a bulk lookup.
        print(f"Error parsing JSON: {e}")
        return "Error parsing JSON."

# Read the Last.fm API key from the LASTFM_API_KEY environment variable so the
# real key never has to be written into (and committed with) the notebook.
# The placeholder fallback keeps the previous default when the variable is unset.
api_key = os.environ.get("LASTFM_API_KEY", "YOUR_LAST_FM_API_KEY")

# Add a new column 'img_url' to the DataFrame containing album cover URLs.
# This issues one HTTP request per row, in row order.
df['img_url'] = [
    get_album_cover(artist, song, api_key)
    for artist, song in zip(df['artist'], df['song'])
]


In [24]:
# Persist both artifacts with pickle: first the similarity dictionary, then the dataframe.
artifacts = (
    ('similarity_dict.pkl', similarity_dict),
    ('data.pkl', df),
)
for filename, payload in artifacts:
    with open(filename, 'wb') as file:
        pickle.dump(payload, file)