In [15]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import nltk
from nltk.stem.porter import PorterStemmer
import pickle
from google.colab import drive
# Mount Google Drive at /content/drive (uses the Colab-only `google.colab` helper imported above).
drive.mount('/content/drive')


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [16]:
# Read data
SAMPLE_SIZE = 20000  # number of songs kept for the similarity model
RANDOM_SEED = 42     # fixed seed so the sampled rows (and the pickled artifacts) are reproducible

# Load the raw table and discard the 'link' column.
df = pd.read_csv('spotify_millsongdata.csv').drop(columns='link')

# Lower-case the lyrics and turn newlines into spaces.
# NOTE(review): punctuation is NOT stripped here. An earlier `.replace(r'\w\s', ' ')`
# step ran without regex=True, so it only matched cells equal to that literal string
# and never changed anything; it has been removed rather than guessed at.
df['text'] = df['text'].str.lower().replace(r'\n', ' ', regex=True)

# Down-sample; cap at the number of available rows so a smaller CSV does not
# make `sample` raise ValueError. The index is reset so that row labels equal
# row positions.
df = df.sample(min(SAMPLE_SIZE, len(df)), random_state=RANDOM_SEED).reset_index(drop=True)


In [17]:
# Tokenization helper: a single shared Porter stemmer plus the function that uses it
stemmer = PorterStemmer()
def tokenization(text):
    """Split ``text`` into NLTK word tokens, Porter-stem each token, and
    return the stems joined back together with single spaces."""
    stemmed_words = (stemmer.stem(token) for token in nltk.word_tokenize(text))
    return " ".join(stemmed_words)

In [18]:
# Apply tokenization
# Overwrites df['text'] with its stemmed form, so this cell is not idempotent:
# running it a second time would stem the already-stemmed text again.
df['text'] = df['text'].apply(tokenization)

In [19]:
# Build the TF-IDF representation of the lyrics:
# word-level analysis with the built-in English stop-word list.
tfidf_vectorizer = TfidfVectorizer(
    stop_words='english',
    analyzer='word',
)
# One row per row of df (in the same order), one column per vocabulary term.
matrix = tfidf_vectorizer.fit_transform(df['text'])

In [20]:
# Compute cosine similarity
# Pairwise cosine similarity between every pair of rows of `matrix`:
# similarity[i][j] compares row i with row j.
# NOTE(review): the result is a dense n x n array (n = number of rows in `matrix`),
# so memory grows quadratically with the sample size — confirm it fits in RAM.
similarity = cosine_similarity(matrix)

In [21]:
# Recommendation function
def recommendation(song_df, top_n=20):
    """Return the titles of the songs most similar to the given song.

    Reads the module-level ``df`` (song table) and ``similarity`` (pairwise
    similarity array whose row/column positions line up with ``df``).

    Parameters
    ----------
    song_df : str
        Title to look up in ``df['song']``. Despite the name this is a title
        string, not a DataFrame. If several rows share the title, the first
        matching row is used.
    top_n : int, default 20
        Maximum number of recommendations to return.

    Returns
    -------
    list of str
        Up to ``top_n`` song titles, most similar first. Ties keep row order.
        The queried row itself is never part of the result.

    Raises
    ------
    IndexError
        If no row of ``df`` has the requested title.
    """
    matches = df.index[df['song'] == song_df]
    if len(matches) == 0:
        # Same exception type the bare `.index[0]` lookup raised, but with a
        # message that says what actually went wrong.
        raise IndexError(f"song {song_df!r} not found in df['song']")
    idx = matches[0]

    scores = np.asarray(similarity[idx])
    # Stable sort on the negated scores: highest similarity first, ties in row order.
    ranked = np.argsort(-scores, kind='stable')
    # Remove the query row explicitly. Slicing off position 0 is not enough:
    # when another row ties with the query (e.g. identical lyrics) or the
    # query's vector is all zeros, the query is not guaranteed to sort first.
    ranked = ranked[ranked != idx][:max(top_n, 0)]

    return df['song'].iloc[ranked].tolist()

In [22]:
# Store similarity information in a dictionary
# Maps song title -> list of (row position, cosine similarity) pairs for its
# most similar songs, best match first.
# NOTE(review): the dictionary is keyed by title only, so when several rows share
# a title the entry from the last such row wins.
TOP_N_SIMILAR = 20  # neighbours stored per song
similarity_dict = {}
for i, title in enumerate(df['song']):
    scores = similarity[i]
    # Highest similarity first; the stable sort keeps ties in row order.
    order = np.argsort(-scores, kind='stable')
    # Exclude the song itself explicitly instead of assuming it is ranked first:
    # a tied score (duplicate lyrics) or an all-zero TF-IDF row would otherwise
    # let a song list itself as its own neighbour.
    order = order[order != i][:TOP_N_SIMILAR]
    similarity_dict[title] = list(zip(order, scores[order]))

In [23]:
# Inspect the first row of the similarity matrix (row 0 compared with every row).
similarity[0]

array([1.        , 0.00418043, 0.        , ..., 0.00363018, 0.00104286,
       0.01859492])

In [24]:
# Persist both artifacts with pickle: first the similarity dictionary,
# then the sampled/preprocessed dataframe.
for path, payload in (('similarity_dict.pkl', similarity_dict), ('data.pkl', df)):
    with open(path, 'wb') as file:
        pickle.dump(payload, file)

In [None]:
# Demo: recommend songs for a title that is guaranteed to exist in df.
# (An empty-string title can never match: read_csv loads empty fields as NaN,
# so the lookup inside `recommendation` would raise IndexError.)
recommendation(df['song'].iloc[0])