In [2]:
import pandas as pd
import numpy as np
import scipy.stats as stats
import seaborn as sns
from scipy.sparse import csr_matrix
from sklearn.metrics.pairwise import cosine_similarity

In [4]:
merged_data = pd.read_csv('merged_data.csv')

In [14]:
merged_data.head()

Unnamed: 0,user_id,song_id,listen_count,title,release,artist_name,year
0,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOAKIMP12A8C130995,1.0,The Cove,Thicker Than Water,Jack Johnson,0.0
1,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOBBMDR12A8C13253B,2.0,Entre Dos Aguas,Flamenco Para Niños,Paco De Lucia,1976.0
2,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOBXHDL12A81C204C0,1.0,Stronger,Graduation,Kanye West,2007.0
3,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOBXHDL12A81C204C0,1.0,Stronger,Graduation,Kanye West,2007.0
4,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SODDNQT12A6D4F5F7E,5.0,Apuesta Por El Rock 'N' Roll,Antología Audiovisual,Héroes del Silencio,2007.0


In [5]:
user_song_matrix = merged_data.pivot_table(index='user_id', columns='song_id', values='listen_count', fill_value=0)

In [6]:
user_song_matrix = merged_data.pivot_table(index='user_id', columns='song_id', values='listen_count', fill_value=0)
user_song_sparse = csr_matrix(user_song_matrix)

In [7]:
def get_top_similar_users(user_idx, sparse_matrix, top_n=2000):
    user_vector = sparse_matrix[user_idx]
    similarities = cosine_similarity(user_vector, sparse_matrix).flatten()
    top_indices = np.argsort(-similarities)[1:top_n + 1]
    return top_indices, similarities[top_indices]

In [17]:
def recommend_songs(user_id, user_song_matrix, num_recommendations=5):
    if user_id not in user_song_matrix.index:
        return f"User ID '{user_id}' not found."

    user_idx = user_song_matrix.index.get_loc(user_id)
    similar_users, _ = get_top_similar_users(user_idx, user_song_sparse)

    similar_users_songs = user_song_sparse[similar_users].sum(axis=0).A1
    user_songs = user_song_sparse[user_idx].toarray().flatten()
    recommendations = np.argsort(-similar_users_songs + user_songs)[:num_recommendations]
    recommended_song_ids = user_song_matrix.columns[recommendations]


    return merged_data[merged_data['song_id'].isin(recommended_song_ids)][['title', 'artist_name']].drop_duplicates()

In [18]:
user_id = 'b80344d063b5ccb3212f76538f3d9e43d87dca9e'
recommendations = recommend_songs(user_id, user_song_matrix)
print("Recommended Songs:")
print(recommendations)

Recommended Songs:
                                                title  \
7                                       Sehr kosmisch   
81                                               Undo   
84                                     You're The One   
87  Horn Concerto No. 4 in E flat K495: II. Romanc...   
97                                            Revelry   

                                          artist_name  
7                                            Harmonia  
81                                              Björk  
84                                      Dwight Yoakam  
87  Barry Tuckwell/Academy of St Martin-in-the-Fie...  
97                                      Kings Of Leon  


In [11]:
def user_top_listens(user_id, merged_data):

    user_data = merged_data[merged_data['user_id'] == user_id]


    top_songs = user_data.groupby(['title', 'artist_name'])['listen_count'].sum().reset_index()
    top_songs = top_songs.sort_values(by='listen_count', ascending=False).head(10)

    return top_songs

In [15]:
user_id = 'b80344d063b5ccb3212f76538f3d9e43d87dca9e'
top_listens = user_top_listens(user_id, merged_data)


print(f"Top listens for user {user_id}:")
print(top_listens)

Top listens for user b80344d063b5ccb3212f76538f3d9e43d87dca9e:
                                                title          artist_name  \
22                                          Moonshine         Jack Johnson   
4                    Behind The Sea [Live In Chicago]   Panic At The Disco   
2                        Apuesta Por El Rock 'N' Roll  Héroes del Silencio   
17  I'll Be Missing You (Featuring Faith Evans & 1...           Puff Daddy   
11                                    Entre Dos Aguas        Paco De Lucia   
15                                       High and dry        Jorge Drexler   
31                                           Stronger           Kanye West   
29                                      Sehr kosmisch             Harmonia   
24                                              Oh No          Andrew Bird   
25                    Our Swords (Soundtrack Version)       Band Of Horses   

    listen_count  
22           8.0  
4            6.0  
2            5.0  
17

In [19]:
def evaluate_recommendations(user_id, recommendations, merged_data):

    user_history = merged_data[merged_data['user_id'] == user_id]
    user_listened_songs = user_history['title'].unique()

    # Check if recommendations overlap with user's listening history
    recommended_titles = recommendations['title'].values
    relevant_recommendations = [title for title in recommended_titles if title in user_listened_songs]

    # Precision: Fraction of recommendations that are relevant
    precision = len(relevant_recommendations) / len(recommended_titles) if len(recommended_titles) > 0 else 0

    # Recall: Fraction of user's listened songs that are recommended
    recall = len(relevant_recommendations) / len(user_listened_songs) if len(user_listened_songs) > 0 else 0

    # F1-Score: Harmonic mean of Precision and Recall
    f1_score = (2 * precision * recall / (precision + recall)) if (precision + recall) > 0 else 0

    return {
        'precision': precision,
        'recall': recall,
        'f1_score': f1_score,
        'relevant_recommendations': relevant_recommendations
    }

user_id = '39045c552b71dc90bad828df1801676b82901923'
recommendations = recommend_songs(user_id, user_song_matrix)


# Load merged_data (make sure it's preloaded)
results = evaluate_recommendations(user_id, recommendations, merged_data)

# Display results
print(f"Accuracy for User {user_id}:")
print(f"Precision: {results['precision']:.2f}")
print(f"Recall: {results['recall']:.2f}")
print(f"F1-Score: {results['f1_score']:.2f}")
print(f"Relevant Recommendations: {results['relevant_recommendations']}")


TypeError: string indices must be integers