In [None]:
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import requests
import seaborn as sns

# Matplotlib and seaborn graphs style

sns.set(style="darkgrid", context="paper")
plt.style.use("dark_background")
plt.rcParams.update({"grid.linewidth": 0.5, "grid.alpha": 0.5})

%matplotlib inline

In [None]:
# Spotify app keys
#
# Read from the environment so real credentials never get saved in the
# notebook. When the variables are unset the "-" placeholders are kept and
# must be replaced by hand before requesting a token.

client_id = os.environ.get("SPOTIFY_CLIENT_ID", "-")
client_secret = os.environ.get("SPOTIFY_CLIENT_SECRET", "-")

In [None]:
# Get the access token (client-credentials grant)

AUTH_URL = 'https://accounts.spotify.com/api/token'

auth_response = requests.post(
    AUTH_URL,
    data={
        'grant_type': 'client_credentials',
        'client_id': client_id,
        'client_secret': client_secret,
    },
    timeout=30,  # never hang the kernel on a stalled connection
)

# Fail here with the HTTP status instead of a KeyError on 'access_token' below
auth_response.raise_for_status()

auth_response_data = auth_response.json()

access_token = auth_response_data['access_token']

In [None]:
# Root of the Spotify Web API; endpoint paths are appended to it
BASE_URL = 'https://api.spotify.com/v1/'

# Bearer-token header passed to the API requests below
headers = {'Authorization': 'Bearer {}'.format(access_token)}

In [None]:
# Get the playlist's tracks

playlist_id = '-'

playlist_items = requests.get(BASE_URL + 'playlists/' + playlist_id + '/tracks', headers=headers)

In [None]:
playlist_items = playlist_items.json() # Convert the data to json

In [None]:
# Build one row per playlist track: name, artists, popularity and audio features.
records = []
for item in playlist_items['items']:
  track = item.get('track')

  # The API may return a null track (e.g. unavailable items) and entries
  # without a Spotify id (e.g. local files); there is nothing to look up for
  # those, so skip them instead of crashing or requesting a bogus id.
  if not track or not track.get('id'):
    continue

  # Get the song audio features
  features_response = requests.get(BASE_URL + 'audio-features/' + track['id'], headers=headers, timeout=30)
  features_response.raise_for_status()  # do not build rows from an error payload
  song = features_response.json()

  records.append({
      'trackname': track['name'],
      'artists': ', '.join(artist['name'] for artist in track['artists']),
      'popularity': track['popularity'],
      **song
  })

# Construct the frame once; concatenating inside the loop is quadratic
df = pd.DataFrame(records)

In [None]:
# Show every column name as a NumPy array
df.columns.to_numpy()

In [None]:
# Don't need these columns.
# Reassign instead of dropping in place, and ignore labels that are already
# gone, so re-running this cell does not raise a KeyError.

df = df.drop(
    columns=['type', 'id', 'uri', 'track_href', 'time_signature', 'analysis_url'],
    errors='ignore',
)

In [None]:
# Per-column null counts, plus the percentage of missing rows for the
# columns whose percentage is non-zero.
null_counts = df.isna().sum()
missing_percentage = null_counts / len(df) * 100
missing_percentage = missing_percentage.loc[missing_percentage.ne(0)]
print(null_counts, missing_percentage)

In [None]:
# Print a summary of df: column dtypes, non-null counts and memory usage
df.info()

In [None]:
# Add a `duration` column: `duration_ms` converted from milliseconds to seconds

df['duration'] = df['duration_ms'] / 1000

In [None]:
# Column groups used by the plots and the model: numeric fields vs. text fields

numerical = [
  'popularity',
  'danceability',
  'energy',
  'key',
  'loudness',
  'mode',
  'speechiness',
  'acousticness',
  'instrumentalness',
  'liveness',
  'valence',
  'tempo',
  'duration',
]
categorical = ['trackname', 'artists']

In [None]:
# Histogram of the `popularity` column

sns.histplot(df, x='popularity', color='orange')

In [None]:
# One 20-bin histogram per numerical field, laid out on a 4-row grid
# with one colour of the "rocket" palette per panel.

n_rows = 4
n_cols = len(numerical) // n_rows + 1
shades = sns.color_palette("rocket", n_colors=len(numerical))

plt.figure(figsize=(14, 12))
for position, (column, shade) in enumerate(zip(numerical, shades), start=1):
    plt.subplot(n_rows, n_cols, position)
    sns.histplot(data=df, x=str(column), bins=20, color=shade)
plt.tight_layout()

In [None]:
# Annotated heatmap of the pairwise correlations between numeric columns

plt.figure(figsize=(10, 6))
correlation_matrix = df.corr(numeric_only=True)
axis_labels = correlation_matrix.columns
sns.heatmap(
    correlation_matrix,
    annot=True,
    xticklabels=axis_labels,
    yticklabels=axis_labels,
)

In [None]:
# Scatterplots for three feature pairs, one panel per (x, y) pair

scatterplot_columns = [
    ('energy', 'loudness'),
    ('key', 'tempo'),
    ('duration', 'instrumentalness')
]
panel_colors = sns.color_palette("rocket", n_colors=3)

plt.figure(figsize=(12, 4))

for panel, ((x_col, y_col), panel_color) in enumerate(zip(scatterplot_columns, panel_colors), start=1):
    plt.subplot(1, 3, panel)
    sns.scatterplot(data=df, x=x_col, y=y_col, color=panel_color)
plt.tight_layout()

In [None]:
from sklearn import preprocessing
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import StratifiedKFold
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression

# Candidate classifiers, each paired with the name printed next to its scores.
# Every estimator gets the same fixed seed so that re-running the notebook
# reproduces the same scores.
RANDOM_STATE = 42

models = [
    ('Random Forest', RandomForestClassifier(random_state=RANDOM_STATE)),
    ('Decision Tree', DecisionTreeClassifier(random_state=RANDOM_STATE)),
    ('SVM', SVC(random_state=RANDOM_STATE)),
    ('Logistic Regression', LogisticRegression(random_state=RANDOM_STATE))
]

In [None]:
def fit_score_model(model, model_name, X_train, y_train, X_test, y_test):
    """Fit ``model`` on the training split, then print and return its test metrics.

    Parameters
    ----------
    model : object
        Estimator exposing ``fit``, ``predict`` and ``score``.
    model_name : str
        Label used in the printed summary and in the returned dict.
    X_train, X_test
        Feature matrices, passed to the estimator unchanged.
    y_train, y_test
        Targets; flattened to 1-D before use.

    Returns
    -------
    dict
        ``{"model": model_name, "rmse": <float>, "score": <model.score result>}``.
    """
    # np.ravel handles Series, arrays and lists alike; calling .ravel() on a
    # pandas Series is deprecated.
    y_train = np.ravel(y_train)
    y_test = np.ravel(y_test)

    model.fit(X_train, y_train)  # Fit the model
    predictions = model.predict(X_test)  # Predict the values

    # Root Mean Square Error. The `squared=False` keyword is deprecated/removed
    # in recent scikit-learn releases, so take the root of the plain MSE,
    # which works on every version.
    test_rmse = float(mean_squared_error(y_test, predictions) ** 0.5)

    score = model.score(X_test, y_test)  # Calculate the score

    print(f"Model Name: {model_name}, RMSE: {test_rmse}, Score: {score}")

    return {"model": model_name, "rmse": test_rmse, "score": score}

In [None]:
# Modelling frame: drop the text columns and binarise popularity into the target.
POPULARITY_THRESHOLD = 66.5  # popularity at or above this value is labelled 1, otherwise 0

df_model = df.drop(categorical, axis=1)

df_model["popularity"] = (df_model["popularity"] >= POPULARITY_THRESHOLD).astype(int)
# Printed explicitly: a bare expression in the middle of a cell displays nothing
print(df_model["popularity"].value_counts())

X = df_model.drop(['popularity'], axis=1)
y = df_model['popularity']

# Fixed seed so the split, and therefore the reported scores, are reproducible
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Scale the features the models are actually trained and scored on. The scaler
# is fit on the training rows only so that no test-set statistics leak into
# training, then applied to the test rows and to the full feature matrix.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)
X_normalized = scaler.transform(X)


for model_name, model in models:
  fit_score_model(model, model_name, X_train, y_train, X_test, y_test)