In [None]:
import pandas as pd
import matplotlib.pyplot as plt
from pycaret.classification import *
from pycaret.clustering import *
import numpy as np

In [None]:
# Merge the two Spotify feature snapshots into a single CSV file.
# read_csv already returns a DataFrame, so no extra wrapping is required.
data1 = pd.read_csv(r'./spotify_data/feature_nov2018.csv')
data2 = pd.read_csv(r'./spotify_data/feature_apr2019.csv')
snapshots = [data1, data2]
combined_audio_features = pd.concat(snapshots)
# index=False: the row labels are not written to the output file.
combined_audio_features.to_csv(r'./spotify_data/audio_features.csv', index=False)

In [None]:
# Load the merged audio features and keep only the columns used further down.
songs = pd.read_csv(r'./spotify_data/audio_features.csv')
print("Before Renaming Columns\n\n", songs.columns)

# Columns removed from the frame before clustering.
unused_columns = ['duration_ms', 'energy', 'instrumentalness', 'key', 'tempo',
                  'mode', 'time_signature', 'valence', 'popularity']
songs = (
    songs
    .drop(columns=unused_columns)
    .rename(columns={'artist_name': 'artist', 'track_id': 'id', 'track_name': 'name'})
)
print("\n\nAfter Editing Columns\n\n", songs.columns)
# DataFrame.info() writes its report itself and returns None, so wrapping it
# in print() only adds a stray "None" line to the output.
songs.info()

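Min-Max Scaler formula (as implemented by scikit-learn's `MinMaxScaler`):

X_std = (X - X.min(axis=0)) / (X.max(axis=0) - X.min(axis=0))

X_scaled = X_std * (max - min) + min

Here `min` and `max` are the bounds of the target feature range (0 and 1 by default), so each column is rescaled independently into that range.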

In [None]:
from sklearn import preprocessing

# Rescale the `loudness` column with a default-configured MinMaxScaler and
# print the first rows before and after so the effect is visible.
loudness = songs[['loudness']].values  # 2-D (n_rows, 1) array, as the scaler expects
print(songs['loudness'].head())
min_max_scaler = preprocessing.MinMaxScaler()
loudness_scaled = min_max_scaler.fit_transform(loudness)
# Assign the flattened array positionally. Wrapping it in a new DataFrame would
# make the assignment depend on that frame's fresh 0..n-1 index matching the
# index of `songs`, and any mismatch would silently produce NaN values.
songs['loudness'] = loudness_scaled.ravel()
print(songs['loudness'].head())

In [None]:
# Discard rows containing any missing value, then keep one row per track name.
# Note: the index is not reset here, so it is no longer contiguous afterwards.
# songs['track_id'].value_counts()
songs = songs.dropna()
songs = songs.drop_duplicates(subset=['name'])
songs.info()

In [None]:
# Show the dtype of every column, then draw 50-bin histograms for the frame.
column_types = songs.dtypes
print(column_types)
songs.hist(column=None, figsize=(20, 15), bins=50)

In [None]:
# Build a feature-only frame: drop the identifying text columns while leaving
# `songs` itself untouched (DataFrame.drop returns a new frame).
songs_copied = songs.drop(columns=['name', 'artist', 'id'])
print(songs_copied.head())
# info() prints its own report and returns None; do not wrap it in print().
songs_copied.info()

In [None]:
# Cast the five audio-feature columns to float32 in a new frame.
float32_columns = ('danceability', 'loudness', 'speechiness', 'acousticness', 'liveness')
dtype_map = dict.fromkeys(float32_columns, 'float32')
songs_features = songs_copied.astype(dtype_map, copy=True)
songs_features.info()
print(type(songs_features))

The PyCaret module is now used to train the model and generate predictions.

Setup: 

This function initializes the training environment and creates the transformation pipeline. Setup function must be called before executing any other function. It takes one mandatory parameter: data. All the other parameters are optional.

In [None]:
# Initialise the PyCaret experiment on the feature-only frame.
# NOTE(review): `setup` is presumably the clustering version here, because
# pycaret.clustering is star-imported after pycaret.classification — confirm.
feature_setup = setup(
    data=songs_features,
    preprocess=True,
    n_jobs=-1,
    use_gpu=True,
    imputation_type='iterative',
    iterative_imputation_iters=10,
    numeric_imputation='mean',
    numeric_iterative_imputer='lightgbm',
    # Fixed seed so the clustering result is reproducible on Restart & Run All.
    session_id=42,
)

In [None]:
# Train a K-Means model with four clusters; reported metrics use 4 decimals.
kmeans_options = {'model': 'kmeans', 'num_clusters': 4, 'round': 4}
kmeans = create_model(**kmeans_options)

In [None]:
# Assign cluster labels using the trained model; the returned frame carries a
# `Cluster` column alongside the (transformed) features.
kmeans_df = assign_model(model=kmeans, transformation=True)
# info() prints its own report and returns None; do not wrap it in print().
kmeans_df.info()
print(kmeans_df['Cluster'].value_counts())

In [None]:
# Preview the first rows of the labelled frame.
labelled_preview = kmeans_df.head()
print(labelled_preview)

In [None]:
# Draw the cluster plot for the trained K-Means model.
# Passing save=True would additionally save the image as HTML in the root directory.
plot_model(plot='cluster', model=kmeans)

In [None]:
# Run PyCaret's evaluate_model on the trained K-Means model.
evaluate_model(kmeans)

In [None]:
# tuned_clustering=tune_model(data=kmeans_df,model=kmeans,supervised_target='Cluster',
#                             supervised_type='classification',supervised_estimator='rf',optimize='Accuracy')

# This won't work because the main CSV file has no target column, so we write a new CSV file that includes the cluster (target) column and use it to train and predict with classifiers.

In [None]:
# Run the trained K-Means model over the feature frame and print the result.
pred_cluster_model = predict_model(data=songs_features, model=kmeans)
print(pred_cluster_model)

In [None]:
# Attach the cluster labels to the full song table.
# Rows were dropped from `songs` earlier without resetting its index, so a plain
# `songs['Cluster'] = kmeans_df['Cluster']` is only correct if PyCaret returned
# the labels under that same non-contiguous index. Re-index the labels with the
# index of `songs_features` (the frame handed to setup, derived from `songs`):
# label order is taken positionally from PyCaret, while the assignment to
# `songs` stays label-aligned and therefore safe to re-run after the shuffle.
# The Series constructor raises ValueError if the row counts differ.
cluster_labels = pd.Series(kmeans_df['Cluster'].to_numpy(), index=songs_features.index)
songs['Cluster'] = cluster_labels
# Shuffle the rows; the fixed seed keeps the order (and the CSV written later)
# reproducible between runs.
songs = songs.sample(frac=1, random_state=42)
songs.head(5)

In [None]:
# Report how many songs were assigned to each cluster.
cluster_sizes = songs['Cluster'].value_counts()
print("Number of Songs in Each Cluster: \n\n", cluster_sizes)

In [None]:
# Print the mean of every numeric feature for each of the four clusters
# (four matches num_clusters used when the K-Means model was created).
# numeric_only=True is required because `songs` also holds text columns
# (name, artist, id, Cluster); without it recent pandas versions raise a
# TypeError instead of silently skipping those columns.
for cluster_number in range(4):
    cluster_label = 'Cluster ' + str(cluster_number)
    members = songs[songs['Cluster'] == cluster_label]
    print("\n" + cluster_label + ":\n\n", members.mean(numeric_only=True))

In [None]:
# Show up to 50 songs that were assigned to cluster 0.
in_cluster_0 = songs['Cluster'].eq('Cluster 0')
print(songs.loc[in_cluster_0].head(50))

In [None]:
# Show up to 50 songs that were assigned to cluster 1.
in_cluster_1 = songs['Cluster'].eq('Cluster 1')
print(songs.loc[in_cluster_1].head(50))

In [None]:
# Show up to 50 songs that were assigned to cluster 2.
in_cluster_2 = songs['Cluster'].eq('Cluster 2')
print(songs.loc[in_cluster_2].head(50))

In [None]:
# Show up to 50 songs that were assigned to cluster 3.
in_cluster_3 = songs['Cluster'].eq('Cluster 3')
print(songs.loc[in_cluster_3].head(50))

**Classification of the Dataset Starting from here!**

In [None]:
import os

# Persist the labelled songs for the classification stage.
# to_csv does not create missing folders, so make sure the target directory
# exists first; exist_ok=True makes this a no-op on re-runs.
os.makedirs('./final_data', exist_ok=True)
songs.to_csv('./final_data/finalized.csv', index=False)

In [None]:
# Reload the labelled data set from disk and report its exact memory footprint.
# read_csv already returns a DataFrame, so no extra wrapping is required.
finalized_path = './final_data/finalized.csv'
final_audio_analysis = pd.read_csv(finalized_path)
final_audio_analysis.info(memory_usage='deep')

In [None]:
# final_audio_analysis=audio_analysis.astype({'danceability':'float16','loudness':'float16',
#                     'speechiness':'float16','acousticness':'float16','liveness':'float16','Cluster':'category'},copy=False)
# print(final_audio_analysis.info(memory_usage='deep'))
# print(type(final_audio_analysis))

In [None]:
# final_audio_analysis.dropna(inplace=True)
# # songs['track_id'].value_counts()
# final_audio_analysis.drop_duplicates(subset=['name'],inplace=True)
# final_audio_analysis.info()

In [None]:
# final_audio_analysis=final_audio_analysis.astype({'Cluster':'category'},copy=True)
# print(final_audio_analysis.info())

In [None]:
# Re-import the classification API so that names shared with the clustering
# module (e.g. `setup`, `create_model`) refer to the classification versions
# from this cell onwards.
from pycaret.classification import *


# Initialise the classification experiment with `Cluster` as the target.
# The text columns name/id/artist are excluded from the features.
classify_setup = setup(
    data=final_audio_analysis,
    target="Cluster",
    train_size=0.75,
    preprocess=True,
    transformation=True,
    imputation_type='iterative',
    iterative_imputation_iters=20,
    numeric_imputation='mean',
    numeric_iterative_imputer='lightgbm',
    n_jobs=-1,
    use_gpu=True,
    ignore_features=["name", "id", "artist"],
    normalize=True,
    normalize_method='robust',
    remove_outliers=True,
    outliers_threshold=0.08,
    # Fixed seed so the train/test split and model results are reproducible.
    session_id=42,
)

In [None]:
# Cross-validate the candidate classifiers (10 folds) and keep the three best
# by accuracy.
candidate_models = ['lightgbm', 'rf', 'svm', 'lr', 'dt', 'et']
top3 = compare_models(
    include=candidate_models,
    sort='Accuracy',
    fold=10,
    round=4,
    n_select=3,
)

In [None]:
# Train a LightGBM classifier with cross-validation; metrics use 4 decimals.
lightgbm = create_model(
    'lightgbm',
    cross_validation=True,
    round=4,
)

In [None]:
# Tune the LightGBM model's hyper-parameters for accuracy (20 iterations).
tuned_light = tune_model(
    estimator=lightgbm,
    optimize='Accuracy',
    n_iter=20,
    round=4,
)

In [None]:
# Run PyCaret's evaluate_model on the tuned LightGBM classifier.
evaluate_model(tuned_light)

In [None]:
# Score the complete labelled data set with the tuned model and preview the
# output. This is the same frame that was handed to setup, so it includes the
# rows the model was trained on.
predict_light = predict_model(data=final_audio_analysis, estimator=tuned_light)
prediction_preview = predict_light.head()
print(prediction_preview)

In [None]:
# Finalize the tuned LightGBM model before it is saved.
# NOTE(review): finalize_model presumably refits on the full data set,
# including the hold-out split — confirm against the PyCaret documentation.
finalized_light=finalize_model(estimator=tuned_light)

In [None]:
# Persist the finalized model under the name 'songanalysis_model'.
# NOTE(review): the value returned by save_model is kept as-is; check the
# PyCaret docs for its exact shape before using it downstream.
model_name = 'songanalysis_model'
songanalysis_model = save_model(finalized_light, model_name)