# Initial EDA

In [1]:
# Importing the necessary libraries to conduct EDA
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA
import plotly.express as px
from sklearn.manifold import TSNE
import spotipy
from spotipy.oauth2 import SpotifyClientCredentials
from collections import defaultdict
from sklearn.metrics import euclidean_distances
from scipy.spatial.distance import cdist
import difflib
from spotipy.oauth2 import SpotifyOAuth

import warnings
warnings.filterwarnings('ignore')

In [None]:
# Load the track-level feature table from the local CSV file into song_df;
# the path is relative, so the file must sit in the notebook's working directory
song_df = pd.read_csv('tracks_features.csv')

In [None]:
# Preview the first rows of song_df as a quick sanity check that the load worked
song_df.head()

In [None]:
# List the column labels of song_df as a further sanity check
# (no columns have been renamed at this point; these are the labels as read from the CSV)
song_df.columns

In [None]:
# Show the dtype pandas inferred for each column of song_df
song_df.dtypes

In [None]:
# Show the (rows, columns) dimensions of song_df
song_df.shape

In [None]:
# Summarising the df (column names, non-null counts, dtypes, memory usage) to sense check further.
# DataFrame.info is a method and has to be called; it prints its report itself,
# so no print() wrapper is needed (printing the uncalled method only shows its repr).
song_df.info()

In [None]:
# Display summary statistics of song_df (pandas' default describe() output) as a sanity check
song_df.describe()

In [None]:
# Keep only the numeric-dtype columns of song_df
numerical_columns = song_df.select_dtypes(include=['number'])

# Labels of those numeric columns
numerical_column_names = numerical_columns.columns

# Draw one 20-bin histogram per numeric column, each on its own 10x6 figure
for column in numerical_column_names:
    fig, ax = plt.subplots(figsize=(10, 6))
    numerical_columns[column].hist(bins=20, ax=ax)
    ax.set_title(f'Histogram for {column}')
    ax.set_xlabel(column)
    ax.set_ylabel('Frequency')
    plt.show()

We want to see enough variability in the features of the songs in our dataset. Without sufficient variability there would be no way to differentiate between songs, and we would not be able to construct a particularly good recommendation engine!

In [None]:
# Re-plot 'year' on its own: the 20-bin histogram above is too coarse to read
fig, ax = plt.subplots(figsize=(10, 6))
song_df['year'].hist(bins=800, ax=ax)
ax.set_title('Histogram for Year')
ax.set_xlabel('Year')
ax.set_ylabel('Frequency')

# Restrict the visible x-axis to the 1900-2024 window
ax.set_xlim(1900, 2024)
plt.show()

From the above we can see that the data is somewhat limited, in that we have no data prior to 2020. However, we have sufficient data to create a good recommendation engine. With further resources we could use cloud services to host a much larger dataset.

In [None]:
# Bar chart of how many rows fall into each value of the 'explicit' column
fig, ax = plt.subplots(figsize=(10, 6))
sns.countplot(x='explicit', data=song_df, ax=ax)
ax.set_title('Count Plot for Explicit Songs')
ax.set_xlabel('Explicit')
ax.set_ylabel('Frequency')
plt.show()

### Null Values

In [None]:
# Count the missing entries in each column of song_df
song_df.isna().sum()

The data is incredibly clean, with only 3 null values in the 'name' column and 11 null values in the 'album' column. We will address the null values later; since there are so few of them, there shouldn't be a huge impact on the accuracy of the model.

### Multicollinearity

Plotting a correlation heatmap to understand correlation between numerical variables and give an idea of multicollinearity.

In [None]:
# Pairwise correlations between the numerical columns
correlation_matrix = numerical_columns.corr()

# Explicit 12x10 figure/axes pair to draw the heatmap on
fig, ax = plt.subplots(figsize=(12, 10))

# Annotated heatmap (two-decimal cell labels, thin separators between cells)
sns.heatmap(
    correlation_matrix,
    annot=True,
    cmap='coolwarm',
    fmt='.2f',
    linewidths=0.5,
    ax=ax,
)

# Title and render
ax.set_title('Correlation Heatmap of Numerical Columns in song_df')
plt.show()

# Genres Dataset

In [None]:
# Load the genre-level table from the local CSV file into genre_df;
# the path is relative, so the file must sit in the notebook's working directory
genre_df = pd.read_csv('data_by_genres.csv')

In [None]:
# Preview the first rows of genre_df as a quick sanity check that the load worked
genre_df.head()

In [None]:
# List the column labels of genre_df
genre_df.columns

In [None]:
# Show the (rows, columns) dimensions of genre_df
genre_df.shape

In [None]:
# Counting the distinct values in the 'genres' column to understand the likelihood of meaningful clusters
unique_genres_count = genre_df['genres'].nunique()
# The message names the column exactly as it is spelled in genre_df ('genres', not 'genre')
print("Number of unique values in the 'genres' column:", unique_genres_count)