# IMPORTED LIB

In [56]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import joblib
import spotipy
from spotipy.oauth2 import SpotifyClientCredentials
from tqdm import tqdm
from spotipy.oauth2 import SpotifyOAuth



# LOADING CSV FILES

In [55]:

songs_df = pd.read_csv('D:\\snap_tune_final\\snap_tune_project\\archive\\songs_with_audio_feature.csv')  # Song data
artists_df = pd.read_csv('D:\\snap_tune_final\\snap_tune_project\\archive\\artists.csv')  # Artist data
albums_df = pd.read_csv('D:\\snap_tune_final\\snap_tune_project\\archive\\albums.csv')  # Album data

# SONGS CSVS ANALYSIS

In [3]:
songs_df.head()

Unnamed: 0,track_id,track_name,album_id,artist_ids,artist_names,valence,year,acousticness,danceability,duration_ms,...,explicit,instrumentalness,key,liveness,loudness,mode,popularity,release_date,speechiness,tempo
0,4BJqT0PrAfrxzMOxytFOIz,"Piano Concerto No. 3 in D Minor, Op. 30: III. ...",6UKhQXIboybG7JPka28bZR,"['0Kekt6CKSo0m5mivKcoH51', '4qFQgEF1rg6a9WvJM0...","['Sergei Rachmaninoff', 'James Levine', 'Berli...",0.0594,1921,0.982,0.279,831667,...,0,0.878,10,0.665,-20.096,1,4,1921,0.0366,80.954
1,7xPhfUan2yNtyFG0cUWkt8,Clancy Lowered the Boom,4LePw1NceheT4selvgH0cE,['0y60qXGlDixlFJcjUU6pwY'],['Dennis Day'],0.963,1921,0.732,0.819,180533,...,0,0.0,7,0.16,-12.441,1,5,1921,0.415,60.936
2,1o6I8BglA6ylDMrIELygv1,Gati Bali,3kjFZyMpkcTARMXuLrpipj,['7u5sCPrTVbLMZMlogVRZc0'],['KHP Kridhamardawa Karaton Ngayogyakarta Hadi...,0.0394,1921,0.961,0.328,500062,...,0,0.913,3,0.101,-14.85,1,5,1921,0.0339,110.339
3,3ftBPsC5vPBKxYSee08FDH,Danny Boy,4LePw1NceheT4selvgH0cE,['1gY7W6RY3EHWRWSv9Dhjqa'],['Frank Parker'],0.165,1921,0.967,0.275,210000,...,0,2.8e-05,5,0.381,-9.316,1,3,1921,0.0354,100.109
4,4d6HGyGT8e121BsdKmw9v6,When Irish Eyes Are Smiling,4LePw1NceheT4selvgH0cE,['08egOYfLsuZvkG0jFN7nUj'],['Phil Regan'],0.253,1921,0.957,0.418,166693,...,0,2e-06,3,0.229,-10.096,1,2,1921,0.038,101.665


In [4]:
songs_df.columns

Index(['track_id', 'track_name', 'album_id', 'artist_ids', 'artist_names',
       'valence', 'year', 'acousticness', 'danceability', 'duration_ms',
       'energy', 'explicit', 'instrumentalness', 'key', 'liveness', 'loudness',
       'mode', 'popularity', 'release_date', 'speechiness', 'tempo'],
      dtype='object')

In [5]:
songs_df['artist_ids'].apply(type).value_counts()


artist_ids
<class 'str'>    35200
Name: count, dtype: int64

In [6]:
songs_df['artist_names']

0        ['Sergei Rachmaninoff', 'James Levine', 'Berli...
1                                           ['Dennis Day']
2        ['KHP Kridhamardawa Karaton Ngayogyakarta Hadi...
3                                         ['Frank Parker']
4                                           ['Phil Regan']
                               ...                        
35195                                      ['Evanescence']
35196                                  ['Alberto Pedraza']
35197                                  ['Marques Houston']
35198                                      ['Linkin Park']
35199                                     ['Jack Johnson']
Name: artist_names, Length: 35200, dtype: object

In [7]:
songs_df.shape

(35200, 21)

In [7]:
songs_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 35200 entries, 0 to 35199
Data columns (total 21 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   track_id          35200 non-null  object 
 1   track_name        35200 non-null  object 
 2   album_id          35200 non-null  object 
 3   artist_ids        35200 non-null  object 
 4   artist_names      35200 non-null  object 
 5   valence           35200 non-null  float64
 6   year              35200 non-null  int64  
 7   acousticness      35200 non-null  float64
 8   danceability      35200 non-null  float64
 9   duration_ms       35200 non-null  int64  
 10  energy            35200 non-null  float64
 11  explicit          35200 non-null  int64  
 12  instrumentalness  35200 non-null  float64
 13  key               35200 non-null  int64  
 14  liveness          35200 non-null  float64
 15  loudness          35200 non-null  float64
 16  mode              35200 non-null  int64 

In [None]:
#check for null values 

print(songs_df.isnull())

# Step 3: Get the count of nulls in each column
print(songs_df.isnull().sum())

       track_id  track_name  album_id  artist_ids  artist_names  valence  \
0         False       False     False       False         False    False   
1         False       False     False       False         False    False   
2         False       False     False       False         False    False   
3         False       False     False       False         False    False   
4         False       False     False       False         False    False   
...         ...         ...       ...         ...           ...      ...   
35195     False       False     False       False         False    False   
35196     False       False     False       False         False    False   
35197     False       False     False       False         False    False   
35198     False       False     False       False         False    False   
35199     False       False     False       False         False    False   

        year  acousticness  danceability  duration_ms  ...  explicit  \
0      False   

# Artists csvs 

In [8]:
artists_df

Unnamed: 0,artist_id,artist_name,genres,followers,popularity
0,0Kekt6CKSo0m5mivKcoH51,Sergei Rachmaninoff,"['classical', 'post-romantic era', 'russian ro...",753124,65
1,4qFQgEF1rg6a9WvJM0MQIa,James Levine,"['classical performance', 'opera']",9206,53
2,6uRJnvQ3f8whVnmeoecv5Z,Berliner Philharmoniker,"['classical', 'classical performance', 'german...",223580,73
3,2DG9aIMzcln3w7SIVGGnmg,Arcadi Volodos,"['classical piano', 'russian classical piano']",17186,47
4,0y60qXGlDixlFJcjUU6pwY,Dennis Day,[],684,7
...,...,...,...,...,...
14634,1eNJZfAHoq5zu1sUukbhAe,Carlos Lafuente,['orquesta tipica'],84,3
14635,0mhWWtpVIh60Ney5hbeR93,Orquesta Víctor Popular,[],0,0
14636,01eq7T47JHOrYY0nfpdDYT,Serafim Gerotheodorou,[],2,0
14637,0nRShN4m4GvQv1O4igorb6,Ernesto Fama,['vintage tango'],1363,19


In [9]:
artists_df.columns

Index(['artist_id', 'artist_name', 'genres', 'followers', 'popularity'], dtype='object')

In [81]:
artists_df['artist_id']

0        0Kekt6CKSo0m5mivKcoH51
1        4qFQgEF1rg6a9WvJM0MQIa
2        6uRJnvQ3f8whVnmeoecv5Z
3        2DG9aIMzcln3w7SIVGGnmg
4        0y60qXGlDixlFJcjUU6pwY
                  ...          
14634    1eNJZfAHoq5zu1sUukbhAe
14635    0mhWWtpVIh60Ney5hbeR93
14636    01eq7T47JHOrYY0nfpdDYT
14637    0nRShN4m4GvQv1O4igorb6
14638    4klba9nOUmD5XsUVjlAFM5
Name: artist_id, Length: 14639, dtype: object

In [13]:
artists_df.shape

(14639, 5)

In [14]:
artists_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14639 entries, 0 to 14638
Data columns (total 5 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   artist_id    14639 non-null  object
 1   artist_name  14639 non-null  object
 2   genres       14639 non-null  object
 3   followers    14639 non-null  int64 
 4   popularity   14639 non-null  int64 
dtypes: int64(2), object(3)
memory usage: 572.0+ KB


In [15]:
artists_df.describe()

Unnamed: 0,followers,popularity
count,14639.0,14639.0
mean,4038699.0,54.615479
std,11389020.0,23.378139
min,0.0,0.0
25%,35269.0,45.0
50%,634550.0,60.0
75%,2978840.0,71.0
max,123544800.0,100.0


In [16]:
print(artists_df.isnull())

# Step 3: Get the count of nulls in each column
print(artists_df.isnull().sum())

       artist_id  artist_name  genres  followers  popularity
0          False        False   False      False       False
1          False        False   False      False       False
2          False        False   False      False       False
3          False        False   False      False       False
4          False        False   False      False       False
...          ...          ...     ...        ...         ...
14634      False        False   False      False       False
14635      False        False   False      False       False
14636      False        False   False      False       False
14637      False        False   False      False       False
14638      False        False   False      False       False

[14639 rows x 5 columns]
artist_id      0
artist_name    0
genres         0
followers      0
popularity     0
dtype: int64


# ALBUMS CSVS

In [17]:
albums_df

Unnamed: 0,album_id,album_name,release_date,total_tracks,album_type,album_cover_64x64,album_cover_640x640
0,6UKhQXIboybG7JPka28bZR,The Rachmaninoff Collection,2006-06-27,11,compilation,https://i.scdn.co/image/ab67616d000048515a7c6e...,https://i.scdn.co/image/ab67616d000048515a7c6e...
1,4LePw1NceheT4selvgH0cE,Original Irish Tenors: The Legendary Voices Of...,2006-02-20,21,compilation,https://i.scdn.co/image/ab67616d00004851ab0547...,https://i.scdn.co/image/ab67616d00004851ab0547...
2,3kjFZyMpkcTARMXuLrpipj,Gendhing Gati,1921,17,album,https://i.scdn.co/image/ab67616d000048512f1bfe...,https://i.scdn.co/image/ab67616d000048512f1bfe...
3,59DmiBZi588gd8yklEgIlS,Rachmaninoff: Greatest Hits,2009-03-27,11,compilation,https://i.scdn.co/image/ab67616d0000485125dab7...,https://i.scdn.co/image/ab67616d0000485125dab7...
4,4PSwdsOtNdinZE8mijH4E6,"Colección Completa, Vol. 2 (Remasterizado)",1921-03-20,21,album,https://i.scdn.co/image/ab67616d00004851fc669e...,https://i.scdn.co/image/ab67616d00004851fc669e...
...,...,...,...,...,...,...,...
11663,5nk0pXd9S0igSZGjudpB4p,"Música Boliviana, 78 Rpm Recordings, Vol. 1 (1...",1930-01-01,26,compilation,https://i.scdn.co/image/ab67616d000048510a30a5...,https://i.scdn.co/image/ab67616d0000b2730a30a5...
11664,6d48OUlyh79FQDhbwWgKxF,"Colección Completa, Vol. 53 (Remasterizado)",1930-12-05,17,album,https://i.scdn.co/image/ab67616d00004851a88f55...,https://i.scdn.co/image/ab67616d0000b273a88f55...
11665,47dahsZ5pIBo8mNH0qeL0C,Borg Mesch - EP,1930-04-01,5,single,https://i.scdn.co/image/ab67616d000048514e542e...,https://i.scdn.co/image/ab67616d0000b2734e542e...
11666,2jfKd4aGugEQmaPzXZ5L0d,"Colección Completa, Vol. 52 (Remasterizado)",1930-12-03,18,album,https://i.scdn.co/image/ab67616d00004851fe9c4e...,https://i.scdn.co/image/ab67616d0000b273fe9c4e...


In [18]:
albums_df.columns

Index(['album_id', 'album_name', 'release_date', 'total_tracks', 'album_type',
       'album_cover_64x64', 'album_cover_640x640'],
      dtype='object')

In [19]:
albums_df.shape

(11668, 7)

In [20]:
albums_df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11668 entries, 0 to 11667
Data columns (total 7 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   album_id             11668 non-null  object
 1   album_name           11668 non-null  object
 2   release_date         11668 non-null  object
 3   total_tracks         11668 non-null  int64 
 4   album_type           11668 non-null  object
 5   album_cover_64x64    11668 non-null  object
 6   album_cover_640x640  11668 non-null  object
dtypes: int64(1), object(6)
memory usage: 638.2+ KB


In [21]:
print(albums_df.isnull())

# Step 3: Get the count of nulls in each column
print(albums_df.isnull().sum())

       album_id  album_name  release_date  total_tracks  album_type  \
0         False       False         False         False       False   
1         False       False         False         False       False   
2         False       False         False         False       False   
3         False       False         False         False       False   
4         False       False         False         False       False   
...         ...         ...           ...           ...         ...   
11663     False       False         False         False       False   
11664     False       False         False         False       False   
11665     False       False         False         False       False   
11666     False       False         False         False       False   
11667     False       False         False         False       False   

       album_cover_64x64  album_cover_640x640  
0                  False                False  
1                  False                False  
2  

# PROCESSED SONGS  DATA SETS 
  1) NO NULL VALUES 
  2) ADD MOOD COLUMNS
  

In [94]:
# Import libraries
import pandas as pd  # for data manipulation
import numpy as np   # for numerical operations

# Load datasets
songs_df = pd.read_csv('D:\\snap_tune_final\\snap_tune_project\\archive\\songs_with_audio_feature.csv')
artists_df = pd.read_csv('D:\\snap_tune_final\\snap_tune_project\\archive\\artists.csv')
albums_df = pd.read_csv('D:\\snap_tune_final\\snap_tune_project\\archive\\albums.csv')

# Convert artist_ids from string to list
songs_df['artist_ids'] = songs_df['artist_ids'].apply(eval)

# Aggregate genres from all artists involved in a song
def aggregate_genres(group):
    genres = artists_df[artists_df['artist_id'].isin(group['artist_ids'])]['genres'].tolist()
    flat_list = [g for sublist in [eval(g) if isinstance(g, str) else g for g in genres] for g in sublist]
    return ', '.join(set(flat_list)) if flat_list else 'Unknown'

# Group by track_id to avoid duplicates and keep relevant audio features and metadata
songs_df = songs_df.groupby('track_id').agg({
    'track_name': 'first',
    'artist_names': 'first',
    'artist_ids': 'first',
    'valence': 'first',
    'danceability': 'first',
    'energy': 'first',
    'album_id': 'first'  #  merging with album info
}).reset_index()

# Add genre info to each track
songs_df['genres'] = songs_df.apply(aggregate_genres, axis=1)

# Assign mood labels based on valence (positiveness of a track)
def assign_mood(valence):
    if valence > 0.6:
        return 'Upbeat'
    elif valence < 0.4:
        return 'Sad'
    else:
        return 'Neutral'

songs_df['mood'] = songs_df['valence'].apply(assign_mood)

# Merge with album data to get album name, release date, and cover image
songs_df = pd.merge(
    songs_df,
    albums_df[['album_id', 'album_name', 'release_date', 'album_cover_640x640']],
    on='album_id',
    how='left'
)

# Final dataset structure
final_df = songs_df[[
    'track_id', 'track_name', 'artist_names', 'genres',
    'valence', 'danceability', 'energy', 'mood',
    'album_name', 'release_date', 'album_cover_640x640'
]]

# Save the cleaned and enriched dataset
final_df.to_csv('D:\\snap_tune_final\\snap_tune_project\\proceesed_songs\\processed_songs_final.csv', index=False)

# Output confirmation
print(f" Processed dataset shape: {final_df.shape}")
print(final_df.head())


 Processed dataset shape: (56580, 11)
                 track_id                            track_name  \
0  001ZmOPuWEW5czwun7nkha                Would You? (End Title)   
1  001ZmOPuWEW5czwun7nkha                Would You? (End Title)   
2  001ZmOPuWEW5czwun7nkha                Would You? (End Title)   
3  003FTlCpBTM4eSqYSWPv4H                          Swing, Swing   
4  003JzPprzThp8SHUctgXnn  Willow Weep For Me - Remastered 1998   

                    artist_names  \
0  ['Gene Kelly', 'Betty Noyes']   
1  ['Gene Kelly', 'Betty Noyes']   
2  ['Gene Kelly', 'Betty Noyes']   
3   ['The All-American Rejects']   
4            ['Thelonious Monk']   

                                              genres  valence  danceability  \
0    movie tunes, adult standards, vintage hollywood    0.162         0.160   
1    movie tunes, adult standards, vintage hollywood    0.162         0.160   
2    movie tunes, adult standards, vintage hollywood    0.162         0.160   
3  alternative metal, pop 

In [10]:

# Load processed dataset
processed_songs_df = pd.read_csv('D:\\snap_tune_final\\snap_tune_project\\proceesed_songs\\processed_songs_final.csv')

In [11]:
processed_songs_df.shape

(56580, 11)

In [12]:
# Check for duplicate songs (based on track_id)
duplicate_tracks = processed_songs_df[processed_songs_df.duplicated(subset=['track_id'], keep=False)]
print(f"Number of rows with duplicate track_id: {len(duplicate_tracks)}")
print(f"Unique track_id count: {processed_songs_df['track_id'].nunique()}")

Number of rows with duplicate track_id: 36914
Unique track_id count: 35200


# CHECKED FOR DUPLICATES 

In [13]:


# Load processed dataset
processed_songs_df = pd.read_csv('D:\\snap_tune_final\\snap_tune_project\\proceesed_songs\\processed_songs_final.csv')

# Check for duplicate songs (based on track_id)
duplicate_tracks = processed_songs_df[processed_songs_df.duplicated(subset=['track_id'], keep=False)]
print(f"Number of rows with duplicate track_id: {len(duplicate_tracks)}")
print(f"Unique track_id count: {processed_songs_df['track_id'].nunique()}")

# Example of duplicates (only if duplicates exist)
if len(duplicate_tracks) > 0:
    print("\nSample duplicates:")
    print(duplicate_tracks[['track_id', 'track_name', 'artist_names', 'genres']].head(10))
else:
    print("\nNo duplicates found.")

Number of rows with duplicate track_id: 36914
Unique track_id count: 35200

Sample duplicates:
                  track_id                               track_name  \
0   001ZmOPuWEW5czwun7nkha                   Would You? (End Title)   
1   001ZmOPuWEW5czwun7nkha                   Would You? (End Title)   
2   001ZmOPuWEW5czwun7nkha                   Would You? (End Title)   
4   003JzPprzThp8SHUctgXnn     Willow Weep For Me - Remastered 1998   
5   003JzPprzThp8SHUctgXnn     Willow Weep For Me - Remastered 1998   
6   003vvx7Niy0yvhvHt4a68B                           Mr. Brightside   
7   003vvx7Niy0yvhvHt4a68B                           Mr. Brightside   
9   004TG0nRHejwSKisvwTcAB               The Garden of Gethesemanie   
10  004TG0nRHejwSKisvwTcAB               The Garden of Gethesemanie   
14  006fbuXS6rRAWlUEaklCmt  Kapitel 297 - Der Page und die Herzogin   

                           artist_names  \
0         ['Gene Kelly', 'Betty Noyes']   
1         ['Gene Kelly', 'Betty Noyes

# DROP DUPLICATES AND KEEPING THE FIRST OCCURENCES 

In [14]:
# Drop duplicates based on track_id and keep the first occurrence
deduped_df = processed_songs_df.drop_duplicates(subset='track_id', keep='first')

# Save cleaned version
deduped_df.to_csv('D:\\snap_tune_final\\snap_tune_project\\proceesed_songs\\processed_songs_deduped.csv', index=False)

print(f" Deduplicated dataset shape: {deduped_df.shape}")

 Deduplicated dataset shape: (35200, 11)


In [32]:

# Check for duplicate songs (based on track_id)
duplicate_tracks = deduped_df[deduped_df.duplicated(subset=['track_id'], keep=False)]
print(f"Number of rows with duplicate track_id: {len(duplicate_tracks)}")
print(f"Unique track_id count: {deduped_df['track_id'].nunique()}")

# Example of duplicates (only if duplicates exist)
if len(duplicate_tracks) > 0:
    print("\nSample duplicates:")
    print(duplicate_tracks[['track_id', 'track_name', 'artist_names', 'genres']].head(10))
else:
    print("\nNo duplicates found.")

Number of rows with duplicate track_id: 0
Unique track_id count: 35200

No duplicates found.


In [15]:
# Check for duplicates on track_id
duplicates_by_id = deduped_df[deduped_df['track_id'].duplicated()]
print("Duplicates by track_id:", len(duplicates_by_id))

# Check for duplicates on track_name + artist_names
duplicates_by_name_artist = deduped_df[deduped_df[['track_name', 'artist_names']].duplicated()]
print("Duplicates by name and artist:", len(duplicates_by_name_artist))

Duplicates by track_id: 0
Duplicates by name and artist: 1966


In [33]:
duplicates_by_name_artist = deduped_df[deduped_df.duplicated(subset=['track_name', 'artist_names'], keep=False)]
print(duplicates_by_name_artist[['track_name', 'artist_names', 'valence', 'energy', 'album_name']].head(10))


                                           track_name  \
3                                      Mr. Brightside   
14                                    Tight Like This   
16                                    Ya Es Muy Tarde   
24  Gloomy Sunday (with Teddy Wilson & His Orchest...   
74                                            Changes   
77                                Big Girls Don't Cry   
82                         Papa's Got A Brand New Bag   
89                                       Boogie Shoes   
93                  Suicidal Thoughts - 2005 Remaster   
94                               You Made Me Love You   

                                     artist_names  valence  energy  \
3                                 ['The Killers']   0.2360  0.9110   
14  ['Louis Armstrong & His Savoy Ballroom Five']   0.4380  0.1990   
16                                ['Los Panchos']   0.7820  0.3140   
24             ['Billie Holiday', 'Teddy Wilson']   0.2230  0.0433   
74                    

In [35]:
deduped_df['release_date'] = pd.to_datetime(deduped_df['release_date'], errors='coerce')

# Sort to keep most recent
deduped_sorted = deduped_df.sort_values(by='release_date', ascending=False)

# Drop duplicates keeping most recent
refined_df = deduped_sorted.drop_duplicates(subset=['track_name', 'artist_names'], keep='first')
print(f"Refined dataset shape: {refined_df.shape}")


Refined dataset shape: (33234, 11)


In [36]:
refined_df

Unnamed: 0,track_id,track_name,artist_names,genres,valence,danceability,energy,mood,album_name,release_date,album_cover_640x640
8439,1LmN9SSHISbtp9LoaR5ZVJ,Payphone,"['Maroon 5', 'Wiz Khalifa']","rap, southern hip hop, pop, trap, pop rap, pit...",0.523,0.739,0.7560,Neutral,Overexposed,2021-06-20,https://i.scdn.co/image/ab67616d0000b2733119f4...
33329,7IfdPERQJutZVYZemjrPBG,Peligrosa,"['J Balvin', 'Wisin & Yandel']","trap latino, electro latino, reggaeton, reggae...",0.680,0.769,0.7590,Upbeat,DJ Home Office Vol. 1,2020-11-20,https://i.scdn.co/image/ab67616d0000b273beccbe...
11850,249gnXrbfmV8NG6jTEMSwD,Life Goes On,['BTS'],"pop, k-pop, k-pop boy group",0.450,0.566,0.7160,Neutral,BE,2020-11-20,https://i.scdn.co/image/ab67616d0000b2733deb4b...
30774,6d97KmrQGLdE6TVfGTW3PD,Mami,['J Balvin'],"reggaeton colombiano, urbano latino, trap lati...",0.924,0.727,0.7760,Upbeat,Playa Reggae,2020-11-20,https://i.scdn.co/image/ab67616d0000b273b1bd4d...
23319,4j9bUBWorNXVrmT2fppDgv,Safari,"['J Balvin', 'Pharrell Williams', 'BIA', 'Sky']","rap latina, trap latino, trap queen, pop, danc...",0.604,0.689,0.6750,Upbeat,Baila el dembow,2020-11-20,https://i.scdn.co/image/ab67616d0000b273c41c25...
...,...,...,...,...,...,...,...,...,...,...,...
35188,7zrxGPR1UVK2iSK793vLPl,Spanish Pipedream,['John Prine'],"folk, new americana, singer-songwriter, roots ...",0.854,0.671,0.4010,Upbeat,John Prine,NaT,https://i.scdn.co/image/ab67616d0000485103c6a7...
35193,7zto61V8ySp03Qi6X1LU2X,Tin Pan Alley (AKA Roughest Place in Town),['Stevie Ray Vaughan'],"classic rock, instrumental rock, blues, electr...",0.168,0.576,0.0968,Sad,Couldn't Stand The Weather (Legacy Edition),NaT,https://i.scdn.co/image/ab67616d0000b273eb72e9...
35194,7zty9mmmuqQsn8s4zDH4nk,メリッサ,['PornoGraffitti'],Unknown,0.799,0.538,0.9280,Upbeat,,NaT,
35196,7zwNt8YuTz0wXYJEW64jbF,Freak Scene,['Dinosaur Jr.'],"slacker rock, lo-fi, power pop, noise rock, al...",0.222,0.290,0.9390,Sad,,NaT,


In [37]:
refined_df.to_csv('D:\\snap_tune_final\\snap_tune_project\\proceesed_songs\\processed_songs_refined.csv', index=False)
print("Refined dataset shape:", refined_df.shape)


Refined dataset shape: (33234, 11)


# check for few duplicates rows


In [38]:


duplicate_rows = refined_df[refined_df.duplicated()]
print("Duplicate rows (keeping the first occurrence as unique):")
print(duplicate_rows)

Duplicate rows (keeping the first occurrence as unique):
Empty DataFrame
Columns: [track_id, track_name, artist_names, genres, valence, danceability, energy, mood, album_name, release_date, album_cover_640x640]
Index: []


In [39]:
refined_df.columns  #check for the columns in the dataframe


Index(['track_id', 'track_name', 'artist_names', 'genres', 'valence',
       'danceability', 'energy', 'mood', 'album_name', 'release_date',
       'album_cover_640x640'],
      dtype='object')

In [40]:
refined_df['mood'].unique()  #check for the unique values

array(['Neutral', 'Upbeat', 'Sad'], dtype=object)

In [41]:
refined_df.isnull().sum()  #chec for null columns 

track_id                   0
track_name                 0
artist_names               0
genres                     0
valence                    0
danceability               0
energy                     0
mood                       0
album_name              5883
release_date           14554
album_cover_640x640     5883
dtype: int64

In [42]:
print(refined_df['genres'].head(10)) #print first 10 generes
print(refined_df['artist_names'].head(10)) #print first 10 artists names 

8439     rap, southern hip hop, pop, trap, pop rap, pit...
33329    trap latino, electro latino, reggaeton, reggae...
11850                          pop, k-pop, k-pop boy group
30774    reggaeton colombiano, urbano latino, trap lati...
23319    rap latina, trap latino, trap queen, pop, danc...
20872    reggaeton colombiano, urbano latino, trap lati...
18561    trap latino, latin hip hop, reggaeton, reggaet...
30413    reggaeton colombiano, urbano latino, trap lati...
30831    r&b, trap latino, pop, pop dance, reggaeton, r...
22012    reggaeton colombiano, urbano latino, trap lati...
Name: genres, dtype: object
8439                         ['Maroon 5', 'Wiz Khalifa']
33329                     ['J Balvin', 'Wisin & Yandel']
11850                                            ['BTS']
30774                                       ['J Balvin']
23319    ['J Balvin', 'Pharrell Williams', 'BIA', 'Sky']
20872                                       ['J Balvin']
18561                       ['J Balvin',

In [43]:
df_temp = deduped_df.copy()
df_temp['mood_encoded'] = df_temp['mood'].astype('category').cat.codes

print(df_temp.corr(numeric_only=True)['mood_encoded'].sort_values(ascending=False))


mood_encoded    1.000000
valence         0.574561
danceability    0.271144
energy          0.172178
Name: mood_encoded, dtype: float64


In [45]:
deduped_df['mood'].value_counts()   #check for the count of the values  OR count the unique values



mood
Upbeat     16264
Sad        10594
Neutral     8342
Name: count, dtype: int64

# MUSIC SENTIMENT DATA SETS

In [27]:
#load the data sets
music_sentiment_df=pd.read_csv('D:\\snap_tune_final\\snap_tune_project\\datas\\music_sentiment_data-set\\music_sentiment_dataset.csv')

In [28]:
music_sentiment_df

Unnamed: 0,User_ID,User_Text,Sentiment_Label,Recommended_Song_ID,Song_Name,Artist,Genre,Tempo (BPM),Mood,Energy,Danceability
0,U1,Way ball purpose public experience recently re...,Sad,S1,Someone Like You,Adele,Pop,67,Melancholic,Low,Low
1,U2,Save officer two myself a.,Happy,S2,Happy,Pharrell Williams,Pop,160,Joyful,High,High
2,U3,Decade ahead everyone environment themselves a...,Relaxed,S3,Clair de Lune,Debussy,Classical,60,Soothing,Low,Low
3,U4,Best change letter citizen try ask quality pro...,Happy,S4,Happy,Pharrell Williams,Pop,160,Joyful,High,High
4,U5,Worker player chance kind actually.,Happy,S5,Happy,Pharrell Williams,Pop,160,Joyful,High,High
...,...,...,...,...,...,...,...,...,...,...,...
995,U996,Where carry sit argue through street.,Relaxed,S996,Clair de Lune,Debussy,Classical,60,Soothing,Low,Low
996,U997,Agreement state up hope free manage outside.,Happy,S997,Happy,Pharrell Williams,Pop,160,Joyful,High,High
997,U998,Particular else challenge ball.,Relaxed,S998,Clair de Lune,Debussy,Classical,60,Soothing,Low,Low
998,U999,Much level someone.,Relaxed,S999,Weightless,Marconi Union,Ambient,50,Calm,Low,Low


In [29]:
#MUSIC SENTIMENT DATASET
music_sentiment_df.shape

(1000, 11)

In [30]:
music_sentiment_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 11 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   User_ID              1000 non-null   object
 1   User_Text            1000 non-null   object
 2   Sentiment_Label      1000 non-null   object
 3   Recommended_Song_ID  1000 non-null   object
 4   Song_Name            1000 non-null   object
 5   Artist               1000 non-null   object
 6   Genre                1000 non-null   object
 7   Tempo (BPM)          1000 non-null   int64 
 8   Mood                 1000 non-null   object
 9   Energy               1000 non-null   object
 10  Danceability         1000 non-null   object
dtypes: int64(1), object(10)
memory usage: 86.1+ KB


In [31]:
music_sentiment_df.columns

Index(['User_ID', 'User_Text', 'Sentiment_Label', 'Recommended_Song_ID',
       'Song_Name', 'Artist', 'Genre', 'Tempo (BPM)', 'Mood', 'Energy',
       'Danceability'],
      dtype='object')

In [46]:
repeated_rows = music_sentiment_df[music_sentiment_df.duplicated()]
print(f"Number of repeated rows: {len(repeated_rows)}")

Number of repeated rows: 0


In [47]:
music_sentiment_df.isnull().sum()  #check for null columns

User_ID                0
User_Text              0
Sentiment_Label        0
Recommended_Song_ID    0
Song_Name              0
Artist                 0
Genre                  0
Tempo (BPM)            0
Mood                   0
Energy                 0
Danceability           0
dtype: int64

# SPOTIFY API DATA EXTRACTIONS 

In [67]:
import spotipy
from spotipy.oauth2 import SpotifyOAuth

sp = spotipy.Spotify(auth_manager=SpotifyOAuth(
    client_id='1a2f9e1a1da04977a83af40d5fa04a33',
    client_secret='32fab8c0c8a145e78a0a6e3bd9308fde',
    redirect_uri='http://127.0.0.1:5000/callback',
    scope='user-library-read user-read-private playlist-read-private'
))

In [68]:
test_track = "63fzyQDlfUvWsWqwCgZU7H"
try:
    features = sp.audio_features([test_track])
    print(f"Test track features: {features}")
except Exception as e:
    print(f"Error on single track: {e.http_status}, {e.msg}, {e.reason}")
    print(f"Full error response: {e.__dict__}")

HTTP Error for GET to https://api.spotify.com/v1/audio-features/?ids=63fzyQDlfUvWsWqwCgZU7H with Params: {} returned 403 due to None


Error on single track: 403, https://api.spotify.com/v1/audio-features/?ids=63fzyQDlfUvWsWqwCgZU7H:
 None, None
Full error response: {'http_status': 403, 'code': -1, 'msg': 'https://api.spotify.com/v1/audio-features/?ids=63fzyQDlfUvWsWqwCgZU7H:\n None', 'reason': None, 'headers': {'content-type': 'application/json; charset=utf-8', 'cache-control': 'private, max-age=0', 'access-control-allow-origin': '*', 'access-control-allow-headers': 'Accept, App-Platform, Authorization, Content-Type, Origin, Retry-After, Spotify-App-Version, X-Cloud-Trace-Context, client-token, content-access-token', 'access-control-allow-methods': 'GET, POST, OPTIONS, PUT, DELETE, PATCH', 'access-control-allow-credentials': 'true', 'access-control-max-age': '604800', 'content-encoding': 'gzip', 'strict-transport-security': 'max-age=31536000', 'x-content-type-options': 'nosniff', 'alt-svc': 'h3=":443"; ma=2592000,h3-29=":443"; ma=2592000, h3=":443"; ma=2592000,h3-29=":443"; ma=2592000', 'date': 'Fri, 11 Jul 2025 13:2

In [66]:
try:
    track_info = sp.track(test_track)
    print(f"Track name: {track_info['name']}, Available markets: {track_info['available_markets']}")
except Exception as e:
    print(f"Error fetching track info: {e}")

Track name: Mosang Ege, Available markets: ['AR', 'AU', 'AT', 'BE', 'BO', 'BR', 'BG', 'CA', 'CL', 'CO', 'CR', 'CY', 'CZ', 'DK', 'DO', 'DE', 'EC', 'EE', 'SV', 'FI', 'FR', 'GR', 'GT', 'HN', 'HK', 'HU', 'IS', 'IE', 'IT', 'LV', 'LT', 'LU', 'MY', 'MT', 'MX', 'NL', 'NZ', 'NI', 'NO', 'PA', 'PY', 'PE', 'PH', 'PL', 'PT', 'SG', 'SK', 'ES', 'SE', 'CH', 'TW', 'TR', 'UY', 'US', 'GB', 'AD', 'LI', 'MC', 'ID', 'JP', 'TH', 'VN', 'RO', 'IL', 'ZA', 'SA', 'AE', 'BH', 'QA', 'OM', 'KW', 'EG', 'MA', 'DZ', 'TN', 'LB', 'JO', 'PS', 'IN', 'BY', 'KZ', 'MD', 'UA', 'AL', 'BA', 'HR', 'ME', 'MK', 'RS', 'SI', 'KR', 'BD', 'PK', 'LK', 'GH', 'KE', 'NG', 'TZ', 'UG', 'AG', 'AM', 'BS', 'BB', 'BZ', 'BT', 'BW', 'BF', 'CV', 'CW', 'DM', 'FJ', 'GM', 'GE', 'GD', 'GW', 'GY', 'HT', 'JM', 'KI', 'LS', 'LR', 'MW', 'MV', 'ML', 'MH', 'FM', 'NA', 'NR', 'NE', 'PW', 'PG', 'PR', 'WS', 'SM', 'ST', 'SN', 'SC', 'SL', 'SB', 'KN', 'LC', 'VC', 'SR', 'TL', 'TO', 'TT', 'TV', 'VU', 'AZ', 'BN', 'BI', 'KH', 'CM', 'TD', 'KM', 'GQ', 'SZ', 'GA', 'GN', 'K