In [111]:
###################################################
##          Missing Data                         ##
###################################################
#import warnings
#warnings.filterwarnings('ignore')

import pandas as pd
import numpy as np

music_df = pd.read_csv("music_clean.csv")
#music_df = music_df.drop("Unnamed: 0", axis=1)

display(music_df.head(5))
print(music_df.shape)

print(music_df.isna().sum().sort_values())  

Unnamed: 0.1,Unnamed: 0,popularity,acousticness,danceability,duration_ms,energy,instrumentalness,liveness,loudness,speechiness,tempo,valence,genre
0,36506,60.0,0.896,0.726,214547.0,0.177,2e-06,0.116,-14.824,0.0353,92.934,0.618,1
1,37591,63.0,0.00384,0.635,190448.0,0.908,0.0834,0.239,-4.795,0.0563,110.012,0.637,1
2,37658,59.0,7.5e-05,0.352,456320.0,0.956,0.0203,0.125,-3.634,0.149,122.897,0.228,1
3,36060,54.0,0.945,0.488,352280.0,0.326,0.0157,0.119,-12.02,0.0328,106.063,0.323,1
4,35710,55.0,0.245,0.667,273693.0,0.647,0.000297,0.0633,-7.787,0.0487,143.995,0.3,1


(1000, 13)
Unnamed: 0          0
popularity          0
acousticness        0
danceability        0
duration_ms         0
energy              0
instrumentalness    0
liveness            0
loudness            0
speechiness         0
tempo               0
valence             0
genre               0
dtype: int64


In [112]:
#Enter NaN values in the clean dataset
music_df.iloc[np.arange(1,92).tolist(),4] = np.nan
music_df.iloc[np.arange(1,92).tolist(),6] = np.nan
music_df.iloc[np.arange(1,53).tolist(),9] = np.nan
music_df.iloc[np.arange(1,127).tolist(),3] = np.nan
music_df.iloc[np.arange(1,178).tolist(),5] = np.nan
music_df.iloc[np.arange(1,8).tolist(),12] = np.nan
music_df.iloc[np.arange(1,31).tolist(),1] = np.nan
display(music_df.head(5))

print(music_df.isna().sum().sort_values())

Unnamed: 0.1,Unnamed: 0,popularity,acousticness,danceability,duration_ms,energy,instrumentalness,liveness,loudness,speechiness,tempo,valence,genre
0,36506,60.0,0.896,0.726,214547.0,0.177,2e-06,0.116,-14.824,0.0353,92.934,0.618,1.0
1,37591,,0.00384,,,,,0.239,-4.795,,110.012,0.637,
2,37658,,7.5e-05,,,,,0.125,-3.634,,122.897,0.228,
3,36060,,0.945,,,,,0.119,-12.02,,106.063,0.323,
4,35710,,0.245,,,,,0.0633,-7.787,,143.995,0.3,


Unnamed: 0            0
acousticness          0
liveness              0
loudness              0
tempo                 0
valence               0
genre                 7
popularity           30
speechiness          52
duration_ms          91
instrumentalness     91
danceability        126
energy              177
dtype: int64


In [113]:
##############################################################################
#Dropping missing data
#We're going to tidy the dataset. 
#We'll create a pipeline to impute missing values and build a KNN classifier model, 
#then use it to predict whether a song is of the "Rock" genre.
##############################################################################
print(music_df.shape) # 1000 rows | 5% of 1000 = 50
#Remove values where less than 5% are missing | for all columns with 50 or fewer missing values
music_df = music_df.dropna(subset=["genre","popularity"])

# Convert genre to a binary feature
#music_df["genre"] = np.where(music_df["genre"] == "Rock", 1, 0)

print(music_df.isna().sum().sort_values())
print("Shape of the `music_df`: {}".format(music_df.shape))

#The dataset has gone from 1000 observations down to 970, 
#but it is now in the correct format for binary classification 
#and the remaining missing values can be imputed as part of a pipeline.


(1000, 13)
Unnamed: 0            0
popularity            0
acousticness          0
liveness              0
loudness              0
tempo                 0
valence               0
genre                 0
speechiness          22
duration_ms          61
instrumentalness     61
danceability         96
energy              147
dtype: int64
Shape of the `music_df`: (970, 13)


In [114]:
######################################################################################
#Pipeline for song genre prediction
#contain steps to impute missing values using the MEAN for each feature 
#and build a KNN model for the classification of song genre.
#Pipelines are so incredibly useful because of the simple interface that they provide.
######################################################################################
#music_df = pd.read_csv("music_clean.csv")

from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix

# Instantiate Imputer object using the default MEAN for strategy and 
# missing_values type for imputation
imputer = SimpleImputer()

# Instantiate a knn model
knn = KNeighborsClassifier(n_neighbors=3)

# Build steps for the pipeline
steps = [("imputer", imputer), 
         ("knn", knn)]

#Now we are ready to build and evaluate a song genre classification model.
#Use the pipeline on the dataset to classify the genre of songs. 

X = music_df.drop("genre",axis=1).values
y =  music_df["genre"].values
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,random_state=12)

#Always split data before imputing to avoid data leakage

# Create the pipeline
pipeline = Pipeline(steps)

# Fit the pipeline to the training data
pipeline.fit(X_train, y_train)

# Make predictions on the test set
y_pred = pipeline.predict(X_test)

# Print the confusion matrix
print(confusion_matrix(y_test, y_pred))

#It's easy to scale our model building workflow using pipelines. 
#In this case, the confusion matrix highlights that the model had 92 true positives and 88 true negatives!


[[92 12]
 [ 2 88]]
