In [1]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel

In [2]:
tracks = pd.read_csv(r"datasets/data.csv")

In [3]:
tfidf = TfidfVectorizer( stop_words='english')

#Replace NaN with an empty string
tracks['name'] = tracks['name'].fillna('')

tracks = tracks.head(25000)

#Construct the required TF-IDF matrix by fitting and transforming the data
tfidf_matrix = tfidf.fit_transform(tracks['name'])

#Output the shape of tfidf_matrix
tfidf_matrix.shape


(25000, 15701)

In [4]:
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)

In [5]:

indices = pd.Series(tracks.index, index=tracks['name']).drop_duplicates()


In [6]:
def get_recommendations(title, cosine_sim=cosine_sim):
    idx = indices[title]

    sim_scores = list(enumerate(cosine_sim[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:11]

    track_indices = [i[0] for i in sim_scores]
    return tracks['name'].iloc[track_indices].tolist()

In [7]:
get_recommendations("I Might Fall Back On You")

['Fall Back Down',
 'Fall For You',
 'I Might Fall Back On You',
 'When I Fall In Love',
 'When I Fall In Love',
 'When I Fall In Love',
 'When I Fall In Love',
 'If I Ever Fall In Love',
 'If I Ever Fall In Love',
 'I Could Fall In Love']

In [8]:
from scipy.spatial.distance import cdist
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans

In [9]:
tracks.to_csv('tracks_with_cluster.csv')

In [10]:
import spotipy
from spotipy.oauth2 import SpotifyClientCredentials
from collections import defaultdict

sp = spotipy.Spotify(auth_manager=SpotifyClientCredentials(client_id="677a46ed628944af94b5cdb96e3e25ee",
                                                           client_secret="dfc3a58da7e34b009001ff1794592060"))

def find_song(name):
    song_data = defaultdict()
    results = sp.search(q= 'track: {}'.format(name), limit=1)
    if results['tracks']['items'] == []:
        return None

    results = results['tracks']['items'][0]
    track_id = results['id']
    audio_features = sp.audio_features(track_id)[0]

    song_data['name'] = [name]
    song_data['explicit'] = [int(results['explicit'])]
    song_data['duration_ms'] = [results['duration_ms']]
    song_data['popularity'] = [results['popularity']]

    for key, value in audio_features.items():
        song_data[key] = value

    return pd.DataFrame(song_data)


ModuleNotFoundError: No module named 'spotipy'

In [11]:
def get_song_data(song, spotify_data):
    
    """
    Gets the song data for a specific song. The song argument takes the form of a dictionary with 
    key-value pairs for the name and release year of the song.
    """
    
    try:
        song_data = spotify_data[(spotify_data['name'] == song['name']) ].iloc[0]
        return song_data
    
    except IndexError:
        return find_song(song['name'])

In [33]:
def PCA_algorithm(tracks):
    num_datatypes = tracks.select_dtypes(np.number)      

    pca_pipeline = Pipeline([('scaler', StandardScaler()), ('PCA', PCA(n_components=2))])
    song_embedding = pca_pipeline.fit_transform(num_datatypes)
    return song_embedding

In [34]:
def KMeans_with_PCA_algorithm(song_embedding):
    kmeans_pca = KMeans(n_clusters = 3, init = 'k-means++', random_state=42)    
    return kmeans_pca.fit(song_embedding)

In [35]:
def filter_based_on_segment(df_segm_pca_kmeans,segment_val):    
    #get list of songs with above song id
    #print(df_segm_pca_kmeans[df_segm_pca_kmeans['Segment K-means PCA']==segment_val].values)
    
    filtered_data_per_segment = df_segm_pca_kmeans[df_segm_pca_kmeans['Segment K-means PCA']==segment_val][['id', 'name','com1','com2','Segment K-means PCA']]

    #save to csv file
    filtered_data_per_segment.to_csv('filtered_data_per_segment.csv')
    return filtered_data_per_segment

In [36]:
def filter_based_on_cluster_centroid(kmeans_pca,filtered_data_per_segment,segment_val,tracks):
    #calculate distance based on cluster centroid
    scaled_data = kmeans_pca.transform( filtered_data_per_segment[['com1','com2']])
    scaled_song_center = kmeans_pca.transform(kmeans_pca.cluster_centers_[segment_val].reshape(1, -1))
    distances = cdist(scaled_song_center, scaled_data, 'cosine')

    #sort based on distance
    index = list(np.argsort(distances)[0])
    rec_songs = filtered_data_per_segment.iloc[index]
    
    #recommend based on segment data
    # rec_songs = rec_songs[~rec_songs['name'].isin(filtered_data_per_segment['name'])]
    return rec_songs

In [70]:
from sklearn.decomposition import PCA

def recommend_songs(song_id, n_songs=10):
  
    """
    Recommends songs based on a list of previous songs that a user has listened to.
    """
    song_embedding = PCA_algorithm(tracks)
    
    kmeans_pca = KMeans_with_PCA_algorithm(song_embedding)

    #get Segment K-means PCA from song_id argument
    df_segm_pca_kmeans = pd.concat([tracks.reset_index(drop=True), pd.DataFrame(song_embedding)],axis=1)
    df_segm_pca_kmeans.columns.values[-2:] = ['com1','com2']
    df_segm_pca_kmeans['Segment K-means PCA'] = kmeans_pca.labels_

    #get segment value of song_id
    segment_val = df_segm_pca_kmeans[df_segm_pca_kmeans['id'] == song_id]['Segment K-means PCA'].values[0]
    
    filtered_data_per_segment = filter_based_on_segment(df_segm_pca_kmeans,segment_val)   
    
    X = filtered_data_per_segment['com1']
    y = filtered_data_per_segment['com2']
    print(filtered_data_per_segment)

    rec_songs = filter_based_on_cluster_centroid(kmeans_pca,filtered_data_per_segment,segment_val,tracks)

    #recommend top n songs
    return rec_songs.head(n_songs)['name'].tolist(), X, y

In [71]:
rec_song, X, y = recommend_songs("3w3cxwYuR7ThpE8KVSys5x")



                           id                             name      com1  \
1      7xPhfUan2yNtyFG0cUWkt8          Clancy Lowered the Boom  1.050906   
9      08zfJvRLp7pjAb94MA9JmF                Il Etait Syndiqué  1.881899   
10     0BMkRpQtDoKjcgzCpnqLNa  Dans La Vie Faut Pas S'en Faire  2.171411   
14     0MJZ4hh60zwsYleWWxT5yW                   Power Is Power -0.690042   
18     0QQmUf4aPFaN9U2yRko595                      When We Die  0.335485   
...                       ...                              ...       ...   
24989  6MMF38qO03XJ7puIxVl8Gg         I Might Fall Back On You  0.777417   
24990  6SUXMuSUM01Ou6FRwKXyIc             Ella...La Que Se Fue  0.084604   
24994  6kQ6pAESTDPwq1ZWrNHU42         Life On the Wicked Stage  1.696521   
24998  7CZhyjk2lDW7A0lRsig45j    Where Will I Shelter My Sheep  0.999716   
24999  7DvMNdb2kACIW4mO3MUqa3                         Cherokee  0.874801   

           com2  Segment K-means PCA  
1     -2.821883                    1  
9     -2.

In [75]:
def listFrom1toN(n):
    return list(range(1,n+1))

In [100]:
X = np.array(X)
y = np.array(y)
print(len(X))
print(len(y))
result = np.vstack((X, y)).T

X = result

7662
7662


ValueError: all the input array dimensions for the concatenation axis must match exactly, but along dimension 1, the array at index 0 has size 2 and the array at index 1 has size 7662

In [90]:
print(result)
n = len(result)
print(n)
y = listFrom1toN(n)

[[ 1.05090562 -2.82188307]
 [ 1.88189933 -2.38520321]
 [ 2.17141059 -0.90915789]
 ...
 [ 1.69652077 -0.22653163]
 [ 0.99971613 -0.68764005]
 [ 0.87480095 -0.05408667]]
7662


In [18]:
# import plotly.express as px
# fig = px.scatter(projection, x='x', y='y', color='cluster', hover_data=['x', 'y', 'title'])
# fig.show()

In [19]:
# df = pd.read_csv(r"C:/Users/shris/Desktop/profiles.csv")
# print(df.gender)
# df.columns = ['userid', 'gender', 'age', 'country']

In [20]:
# df.head()

In [21]:
# songs = df.gender.tolist()
# output = {}
# outputs = []
# for song in songs:
#     k = recommend_songs(song)
#     output.update( {song : k})


In [22]:
# output

In [23]:
# df1 = pd.DataFrame.from_dict(output, orient='index')

In [None]:
from sklearn.datasets import make_classification
from sklearn.cluster import KMeans
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Specify the number of clusters
n_clusters = 4

# Initialize the K-means model
km = KMeans(n_clusters=n_clusters, random_state=42)

# Fit the model to the training data
km.fit(X_train)

# Predict the cluster labels for the training data
train_labels = km.predict(X_train)

# Predict the cluster labels for the test data
test_labels = km.predict(X_test)

# Initialize the SVM model
svm = SVC(kernel='linear', random_state=42)

# Fit the SVM model to the training data using the cluster labels as features
svm.fit(train_labels.reshape(-1, 1), y_train)

# Predict the class labels for the test data using the cluster labels as features
y_pred = svm.predict(test_labels.reshape(-1, 1))

# Calculate the accuracy of the SVM model
accuracy = accuracy_score(y_test, y_pred)

print("The accuracy of the SVM model is {:.3f}".format(accuracy))