Spotify Recommendation System - Data Preprocessing

In [None]:
# Import Libraries
import os
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objs as go

Load data

In [None]:
# Read the raw track and artist tables from CSV; the paths are relative to
# the directory the notebook is run from.
data = pd.read_csv('../data/tracks.csv')
# NOTE(review): artist_data is not used in any of the visible cells —
# confirm it is needed further down before keeping this load.
artist_data = pd.read_csv('../data/artists.csv')

In [None]:
# Show the (rows, columns) dimensions of the tracks table.
data.shape

In [None]:
# Print a summary of the tracks table: column names, dtypes and non-null counts.
data.info()

In [None]:
# Show descriptive statistics (count, mean, std, quartiles, ...) for the tracks table.
data.describe()

In [None]:
# Descriptive (non-audio) columns of a track.
track_features = [
    'name',
    'artists',
    'release_date',
    'popularity',
    'duration_ms',
    'explicit',
    'id',
    'id_artists',
]
# Preview those columns for the last five rows of the table.
data[track_features].tail(5)

In [None]:
# Audio-analysis columns of a track.
audio_features = [
    'acousticness',
    'danceability',
    'energy',
    'instrumentalness',
    'key',
    'liveness',
    'loudness',
    'mode',
    'speechiness',
    'tempo',
    'valence',
]
# Preview those columns for the last five rows of the table.
data[audio_features].tail(5)

Verify and clean data

In [None]:
def display_missing_values(df):
    """Print the number of missing values found in each column of ``df``.

    One line is printed per column, in column order, followed by a blank
    separator.

    Args:
        df: The pandas DataFrame to inspect.

    Returns:
        None. The report is written to standard output.
    """
    for column_name in df.columns:
        missing_count = df[column_name].isna().sum()
        print(f'{column_name} column missing values: {missing_count}')
    # print() appends its own newline, so this emits two line breaks.
    print('\n')

In [None]:
# Report the per-column missing-value counts of the tracks table.
display_missing_values(data)

In [None]:
# Drop every row that has at least one missing value, then print the report
# again to check the counts after the drop.
data = data.dropna()
display_missing_values(data)

In [None]:
def check_duplicates(df):
    """Print how many rows of ``df`` are duplicated.

    Every row that has an identical copy elsewhere in the frame is counted,
    including the first occurrence (``keep=False``).

    Args:
        df: The pandas DataFrame to inspect.

    Returns:
        None. The count is written to standard output.
    """
    duplicate_mask = df.duplicated(keep=False)
    duplicate_total = duplicate_mask.sum()
    print(f"Total number of duplicates is: {duplicate_total}")
    # print() appends its own newline, so this emits two line breaks.
    print('\n')

In [None]:
# Report how many rows of the tracks table are exact duplicates of another row.
check_duplicates(data)

Group the data by year

In [None]:
# Build a per-year table holding the mean of each numeric track feature.
# 'year' is listed here as well as being the grouping key, so it is averaged
# like the other columns and comes back from mean() as a float.
year_data_features = ['year', 'popularity', 'duration_ms'] + audio_features

# Convert 'release_date' to datetime; values that cannot be parsed become NaT.
# The deprecated `infer_datetime_format` flag is intentionally not passed:
# pandas >= 2.0 ignores it and emits a warning, and older versions produce
# the same result without it.
raw_release_dates = data['release_date']
parsed_release_dates = pd.to_datetime(raw_release_dates, errors='coerce')

# pandas >= 2.0 infers a single format from the first value and, with
# errors='coerce', turns every value written in another format into NaT.
# NOTE(review): presumably the column mixes full dates with year-only or
# year-month strings — confirm against tracks.csv. Re-parse the rejected
# values on their own (each distinct string only once) so that such rows
# keep their year instead of silently dropping out of the yearly means.
rejected = parsed_release_dates.isna() & raw_release_dates.notna()
if rejected.any():
    reparsed = {
        value: pd.to_datetime(value, errors='coerce')
        for value in raw_release_dates[rejected].unique()
    }
    # Assign by position (to_numpy) so the write does not depend on index labels.
    parsed_release_dates.loc[rejected] = pd.to_datetime(
        raw_release_dates[rejected].map(reparsed)
    ).to_numpy()
data['release_date'] = parsed_release_dates

# Extract year from 'release_date' (NaT rows get a missing year).
data['year'] = data['release_date'].dt.year

# Group data by year and average every selected column.
year_data = data.groupby('year', as_index=False)[year_data_features].mean()

# The averaged 'year' is a float; store it as an integer again.
year_data['year'] = year_data['year'].astype('int64')

# Keep only the years 1921 through 2020 (both inclusive).
year_data = year_data[(year_data['year'] >= 1921) & (year_data['year'] <= 2020)]

In [None]:
# Show the (rows, columns) dimensions of the yearly table.
year_data.shape

In [None]:
# Preview the first rows of the yearly table.
year_data.head()

In [None]:
# Save year_data to csv (without the index column).
# to_csv does not create missing parent directories, so make sure the
# output folder exists before writing.
year_data_path = '../data/processed/year_data.csv'
os.makedirs(os.path.dirname(year_data_path), exist_ok=True)
year_data.to_csv(year_data_path, index=False)