In [1]:
import pandas as pd
import numpy as np
#-----------------------------------------Scikit----------------------------------------------
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA
from sklearn.metrics import balanced_accuracy_score
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler
#------------------------------------import feature selection----------------------------------
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2            #score_func, need to normalize to remove negative to use
from sklearn.feature_selection import f_classif
#-----------------------------------------Seaborn----------------------------------------------
import matplotlib.pyplot as plt
import ssl
import seaborn as sns
ssl._create_default_https_context = ssl._create_unverified_context
sns.set(style='darkgrid')

In [2]:
df = pd.read_csv('./Data/features_30_sec.csv')          #read file
df_test = pd.read_csv('./data.csv')

In [3]:
df_test.head()                                            #display first 5 rows of data

Unnamed: 0,filename,chroma_stft,mfcc1,spectral_bandwidth,rolloff,label
0,blues.00065.wav,0.29273,2372.164643,5614.449384,-4.537642,
1,blues.00015.wav,0.270313,1731.192252,2930.540613,-3.679262,
2,blues.00044.wav,0.390212,2375.10212,5198.360233,3.518596,
3,blues.00002.wav,0.363603,1747.165985,3040.514948,1.03348,
4,blues.00098.wav,0.442972,2206.710936,4829.320798,1.836111,


In [4]:
df = df.drop(['length','filename'],axis=1)             #remove the length and filename column since irrelevant
df = df.sample(frac=1)                                 #randomize rows of dataset every run

df_test = df_test.drop(['filename','label'],axis=1)

In [5]:
#sns.scatterplot(data = df, x ='chroma_stft_mean',y='mfcc1_mean',hue='label')

In [6]:
#-----------------------------------------Encode The Genre Into Numbers-----------------------------------------
labelEncoder = LabelEncoder()                              #store encoded labels into variable
le = labelEncoder.fit(df['label'])                         #fit label into the variable
df['label'] = le.transform(df['label'])                    #transform label values into numbers
Y_genre = df['label']                                      #Assign label to Y_genre
X_features = df.drop('label',axis=1)                       #Assign all features to X_feature

In [7]:
#---------------------------------------Normalize the features between 0-1---------------------------------------
scalar = MinMaxScaler()                                    
scalar.fit(X_features)                                     #Fit features into scalar
X_cols = X_features.columns
X_features[X_cols] = scalar.transform(X_features)                 #Transform features into 0-1

scalar2 = MinMaxScaler()
scalar2.fit(df_test)
X_test_cols = df_test.columns
df_test[X_test_cols] = scalar2.transform(df_test)

In [8]:
#------------------------------Scatterplot of Data--------------------------------------------------
#sns.scatterplot(data = df, x ='chroma_stft_var',y='rolloff_mean',hue='label')

In [9]:
#------------------------------------------Feature Reduction-----------------------------------------------------
best_feat = SelectKBest(score_func= f_classif, k=4)          #Auto select 4 features that best differentiate
fit = best_feat.fit(X_features,Y_genre)                      #the genres

In [10]:
feat_scores = pd.DataFrame(fit.scores_)                      #Extract optimum score of each feature
feat_columns = pd.DataFrame(X_features.columns)              #Extract the column names

In [11]:
sel_scores = pd.concat([feat_columns,feat_scores],axis=1)    #Concatenate the two
sel_scores.columns = ['Features','Scores']                   #Name the two new columns

In [12]:
sel_scores.sort_values(by=['Scores'],ascending=False)        #Sort the scores so that highest is displayed first
sel_largest = sel_scores.nlargest(4,'Scores')                #Display the 4 best ones
sel_largest

Unnamed: 0,Features,Scores
0,chroma_stft_mean,176.453282
17,mfcc1_mean,130.371835
6,spectral_bandwidth_mean,116.601879
8,rolloff_mean,110.871317


In [13]:
X_features = X_features[sel_largest['Features'].T]           #Reduce the features to the 4 with best scores

In [14]:
#-------------------------------------Split data into train and test----------------------------------------
#rs = 42
#X_train,X_test,y_train,y_test = train_test_split(X_features,Y_genre,test_size=0.25,random_state=rs)

In [15]:
#-----------------------------------------Import algorithms------------------------------------------------------
from sklearn.ensemble import RandomForestClassifier

In [16]:
# df = df.sample(frac=1)                                #randomize rows of dataset
# row,col = pd.concat([X_features,Y_genre],axis=1).shape  #extract size of dataset
# split = 1.0                                          #3/4 training split
# X_train = df.iloc[:int(row*split),:-1]             #obtain 75% of test data
# y_train = df.iloc[:int(row*split),-1:]            #obtain 75% of genre data
# X_test = df.iloc[int(row*split):,:-1]              #obtain 25% of test data
# y_test = df.iloc[int(row*split):,-1:]             #obtain 25% of genre data

In [22]:
clf = RandomForestClassifier(n_estimators=180, max_depth=None,min_samples_split=4,min_samples_leaf=2,
                             max_features='auto')
clf.fit(X_features,Y_genre)

RandomForestClassifier(min_samples_leaf=2, min_samples_split=4,
                       n_estimators=180)

In [23]:
clf.predict(df_test)

array([7, 9, 7, 8, 4, 7, 1, 5, 8, 8, 7, 7, 2, 2, 4, 4, 8, 3, 8, 8, 4, 2,
       8, 4, 4, 4, 1, 4, 0, 4, 4, 8, 8, 9, 8, 8, 2, 9, 9, 8, 1, 4, 4, 4,
       7, 9, 8, 9, 7, 7, 6, 7, 2, 7, 8, 4, 8, 1, 9, 5, 8, 9, 8, 7, 8, 1,
       4, 9, 4, 8, 4, 4, 8, 2, 8, 8, 7, 7, 1, 4, 9, 9, 8, 7, 8, 4, 4, 7,
       4, 9, 7, 4, 5, 8, 9, 9, 2, 7, 7, 4, 8, 2, 1, 1, 5, 5, 5, 4, 0, 8,
       1, 3, 1, 1, 9, 1, 1, 9, 1, 5, 9, 0, 9, 0, 1, 1, 0, 1, 1, 8, 8, 9,
       5, 5, 1, 1, 1, 9, 1, 9, 5, 1, 1, 5, 1, 5, 1, 1, 1, 1, 1, 1, 5, 1,
       8, 3, 1, 1, 4, 1, 8, 9, 1, 8, 4, 1, 8, 1, 1, 1, 1, 1, 1, 1, 5, 5,
       1, 1, 9, 5, 1, 5])

In [19]:
# from sklearn.model_selection import GridSearchCV
# n_estimators = np.arange(50,200,10)
# max_features = ['auto','sqrt']
# max_depth = [None]
# min_samples_leaf = [1,2,3]
# min_samples_split = [2,4]

# parameters = {'n_estimators':n_estimators,
#               'max_features':max_features,
#               'min_samples_leaf':min_samples_leaf,
#               'min_samples_split':min_samples_split,
#               'max_depth':max_depth}

In [20]:
# clfGridSearch = RandomForestClassifier()
# rf_Grid = GridSearchCV(clfGridSearch,parameters,cv=3,verbose=2,n_jobs=4)

In [21]:
# %%time
# rf_Grid.fit(X_train,y_train.values.ravel())
# print('Best Scores:',rf_Grid.best_score_)
# print('Best Parameters:',rf_Grid.best_params_)