In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn import metrics
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score, GridSearchCV
import copy
import warnings
from sklearn import preprocessing
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn import svm

## Suppress warnings on console
warnings.filterwarnings("default", category=DeprecationWarning)

In [2]:
# Read input file

input_file = pd.read_csv('./Final_Files/sample_refined_updated.csv')

# Shuffle data frame

input_file = input_file.sample(frac=1)
input_file.head()

Unnamed: 0,acousticness,analysis_url,danceability,duration_ms,energy,track_id,instrumentalness,key,liveness,loudness,...,track_href,type,uri,valence,artist_name,track_name,popularity,bbhot,artist_popularity,artist_followers
1147,0.725,https://api.spotify.com/v1/audio-analysis/3kyw...,0.581,184893,0.298,3kywzyEr7V106Un5cDyh12,0.0,11,0.116,-9.54,...,https://api.spotify.com/v1/tracks/3kywzyEr7V10...,audio_features,spotify:track:3kywzyEr7V106Un5cDyh12,0.168,Sasha Sloan,Older,69,0,73,142897
3551,0.0588,https://api.spotify.com/v1/audio-analysis/6rsm...,0.795,190347,0.7,6rsm9NTgl9kKPatf7S1yCS,0.0,6,0.0977,-5.221,...,https://api.spotify.com/v1/tracks/6rsm9NTgl9kK...,audio_features,spotify:track:6rsm9NTgl9kKPatf7S1yCS,0.798,Why Don't We,Talk,71,1,81,1267297
2528,0.0428,https://api.spotify.com/v1/audio-analysis/3b5L...,0.69,192112,0.876,3b5Li4QKDVBx1x7fQuu54a,0.0,2,0.147,-4.159,...,https://api.spotify.com/v1/tracks/3b5Li4QKDVBx...,audio_features,spotify:track:3b5Li4QKDVBx1x7fQuu54a,0.493,Alan Walker,Tired,73,1,88,9121047
173,0.0142,https://api.spotify.com/v1/audio-analysis/2EIM...,0.861,220286,0.418,2EIMkm48UVEdYyFSXD99Fc,0.0,1,0.117,-8.258,...,https://api.spotify.com/v1/tracks/2EIMkm48UVEd...,audio_features,spotify:track:2EIMkm48UVEdYyFSXD99Fc,0.384,Shoreline Mafia,Dirty,49,1,76,283800
4348,0.00387,https://api.spotify.com/v1/audio-analysis/67EU...,0.608,329050,0.528,67EUBsc4qosGOVbaYy6TeX,1.4e-05,1,0.106,-11.722,...,https://api.spotify.com/v1/tracks/67EUBsc4qosG...,audio_features,spotify:track:67EUBsc4qosGOVbaYy6TeX,0.454,Cavetown,Boys Will Be Bugs,73,0,72,418180


In [3]:
## Dropping fields (strings/objects)

'''
drop_list = ['analysis_url','track_id', 'track_href', 'type', 'uri', 
             'artist_name', 'track_name']
'''
drop_list = ['analysis_url','track_id', 'track_href', 'type', 'uri', 'energy', 'instrumentalness',
             'artist_name', 'track_name', 'key', 'mode', 'time_signature', 'popularity']

train = input_file.drop(drop_list, axis = 1)

train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5000 entries, 1147 to 52
Data columns (total 11 columns):
acousticness         5000 non-null float64
danceability         5000 non-null float64
duration_ms          5000 non-null int64
liveness             5000 non-null float64
loudness             5000 non-null float64
speechiness          5000 non-null float64
tempo                5000 non-null float64
valence              5000 non-null float64
bbhot                5000 non-null int64
artist_popularity    5000 non-null int64
artist_followers     5000 non-null int64
dtypes: float64(7), int64(4)
memory usage: 468.8 KB


In [4]:
## Making a copy of dataframe target variable and assigning it to Y
Y = copy.deepcopy(train.bbhot)
Y.shape

(5000,)

In [5]:
## Dropping the target attribute and making new dataframe

train_1 = train.drop("bbhot", axis=1)
train_1.head()

Unnamed: 0,acousticness,danceability,duration_ms,liveness,loudness,speechiness,tempo,valence,artist_popularity,artist_followers
1147,0.725,0.581,184893,0.116,-9.54,0.0698,63.761,0.168,73,142897
3551,0.0588,0.795,190347,0.0977,-5.221,0.107,120.002,0.798,81,1267297
2528,0.0428,0.69,192112,0.147,-4.159,0.059,123.994,0.493,88,9121047
173,0.0142,0.861,220286,0.117,-8.258,0.419,140.087,0.384,76,283800
4348,0.00387,0.608,329050,0.106,-11.722,0.0338,89.221,0.454,72,418180


In [6]:
## Creating standard scaler object

warnings.filterwarnings("ignore")

#scaler = StandardScaler()
scaler = MinMaxScaler()

## Fit scaler to features

scaler.fit(train_1)

# Transform features to scaled version

scaled_features = scaler.transform(train_1)

#Convert the scaled features to a dataframe and check the head of this dataframe to make sure the scaling worked.

df_feat = pd.DataFrame(scaled_features,columns=train_1.columns)
df_feat.head()

Unnamed: 0,acousticness,danceability,duration_ms,liveness,loudness,speechiness,tempo,valence,artist_popularity,artist_followers
0,0.727911,0.597737,0.113314,0.100479,0.759266,0.072257,0.289692,0.169697,0.73,0.003271
1,0.059033,0.817901,0.117443,0.081405,0.861753,0.110766,0.545218,0.806061,0.81,0.029009
2,0.042969,0.709877,0.11878,0.132791,0.886954,0.061077,0.563356,0.49798,0.88,0.208783
3,0.014254,0.885802,0.140112,0.101522,0.789687,0.433747,0.636473,0.387879,0.76,0.006496
4,0.003882,0.625514,0.222466,0.090056,0.707489,0.03499,0.405368,0.458586,0.72,0.009572


In [7]:
#Function to evaluate my model with Cross validation

from sklearn.model_selection import cross_val_score

def testingModel(model, X_train, Y_train):
    scores = cross_val_score(model, X_train, Y_train, cv=10, scoring = "roc_auc")
    print("Scores:", scores)
    print("Mean:", scores.mean())
    print("Standard Deviation:", scores.std())
    return scores.mean()

In [9]:
## Implementing SVM with cross validation

from sklearn.svm import SVC, LinearSVC
linear_svc = LinearSVC()
linear_svc.fit(df_feat, Y)

acc_linear_svc = testingModel(linear_svc, df_feat, Y)
print("Prediction Accuracy With 10-Fold Cross Validation(Mean Value): %.2f%%" % (acc_linear_svc * 100.0))

Scores: [0.610304 0.62104  0.632    0.651504 0.657264 0.661488 0.639504 0.65416
 0.64448  0.669072]
Mean: 0.6440816
Standard Deviation: 0.01759791034867493
Prediction Accuracy With 10-Fold Cross Validation(Mean Value): 64.41%


In [10]:
## Training a SVM model without cross validation

from sklearn.metrics import classification_report,confusion_matrix
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split

warnings.filterwarnings("ignore")

svc_model = LinearSVC()

X = df_feat
y = Y
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30)

svc_model.fit(X_train,y_train)

predictions = svc_model.predict(X_test)

accuracy = svc_model.score(X_test, y_test)

print("Prediction Accuracy Without Cross Validation: %.2f%%" % (accuracy * 100.0))

print("Confusion Matrix: ")
print(confusion_matrix(y_test,predictions))

print("Classification Report: ")
print(classification_report(y_test,predictions))

Prediction Accuracy Without Cross Validation: 59.87%
Confusion Matrix: 
[[476 252]
 [350 422]]
Classification Report: 
              precision    recall  f1-score   support

           0       0.58      0.65      0.61       728
           1       0.63      0.55      0.58       772

   micro avg       0.60      0.60      0.60      1500
   macro avg       0.60      0.60      0.60      1500
weighted avg       0.60      0.60      0.60      1500



In [11]:
#sns.pairplot(train,hue='bbhot',palette='Dark2')


In [12]:
## Gridsearch practice

from sklearn.model_selection import GridSearchCV

# Create a dictionary called param_grid and fill out some parameters for C and gamma.

param_grid = {'C': [0.1,1, 10, 100], 'gamma': [1,0.1,0.01,0.001]} 

# ** Create a GridSearchCV object and fit it to the training data.**

grid = GridSearchCV(SVC(),param_grid,refit=True,verbose=2)
grid.fit(X_train,y_train)

Fitting 3 folds for each of 16 candidates, totalling 48 fits
[CV] C=0.1, gamma=1 ..................................................


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV] ................................... C=0.1, gamma=1, total=   0.2s
[CV] C=0.1, gamma=1 ..................................................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.3s remaining:    0.0s


[CV] ................................... C=0.1, gamma=1, total=   0.2s
[CV] C=0.1, gamma=1 ..................................................
[CV] ................................... C=0.1, gamma=1, total=   0.2s
[CV] C=0.1, gamma=0.1 ................................................
[CV] ................................. C=0.1, gamma=0.1, total=   0.2s
[CV] C=0.1, gamma=0.1 ................................................
[CV] ................................. C=0.1, gamma=0.1, total=   0.2s
[CV] C=0.1, gamma=0.1 ................................................
[CV] ................................. C=0.1, gamma=0.1, total=   0.2s
[CV] C=0.1, gamma=0.01 ...............................................
[CV] ................................ C=0.1, gamma=0.01, total=   0.2s
[CV] C=0.1, gamma=0.01 ...............................................
[CV] ................................ C=0.1, gamma=0.01, total=   0.2s
[CV] C=0.1, gamma=0.01 ...............................................
[CV] .

[Parallel(n_jobs=1)]: Done  48 out of  48 | elapsed:   14.0s finished


GridSearchCV(cv='warn', error_score='raise-deprecating',
       estimator=SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
  kernel='rbf', max_iter=-1, probability=False, random_state=None,
  shrinking=True, tol=0.001, verbose=False),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid={'C': [0.1, 1, 10, 100], 'gamma': [1, 0.1, 0.01, 0.001]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=2)

In [13]:
# ** Taking grid model and create some predictions using the test set 
# and create classification reports and confusion matrices for them

grid_predictions = grid.predict(X_test)

print(confusion_matrix(y_test,grid_predictions))

print(classification_report(y_test,grid_predictions))

accuracy = grid.score(X_test, y_test)

print("Prediction Accuracy with Grid Search: %.2f%%" % (accuracy * 100.0))
print("Best Hyperparameters for Grid Search: ",grid.best_estimator_)

[[463 316]
 [278 443]]
              precision    recall  f1-score   support

           0       0.62      0.59      0.61       779
           1       0.58      0.61      0.60       721

   micro avg       0.60      0.60      0.60      1500
   macro avg       0.60      0.60      0.60      1500
weighted avg       0.61      0.60      0.60      1500

Prediction Accuracy with Grid Search: 60.40%
Best Hyperparameters for Grid Search:  SVC(C=10, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma=1, kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)


In [16]:
## Implementing SGD Classifier (Stochastic Gradient Descent)

from sklearn import linear_model
sgd = linear_model.SGDClassifier(max_iter=5, tol=None)
acc_sgd = testingModel(sgd, df_feat, Y)  

print("Accuracy: %.2f%%" % (acc_linear_svc * 100.0))

Scores: [0.617984 0.575152 0.627616 0.63456  0.59904  0.606096 0.570512 0.599008
 0.607776 0.652528]
Mean: 0.6090272000000001
Standard Deviation: 0.024106528164793868
Accuracy: 62.81%


In [17]:
svc_model

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
  kernel='rbf', max_iter=-1, probability=False, random_state=None,
  shrinking=True, tol=0.001, verbose=False)

In [21]:
grid.best_estimator_

SVC(C=1, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma=1, kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)