# Data Loading and Previewing



In [85]:
import pandas as pd

# Pull the frog-call MFCC table straight from the GitHub-hosted CSV.
rawData = pd.read_csv(
    filepath_or_buffer='https://raw.githubusercontent.com/skduong/anuran-calls/master/Frogs_MFCCs.csv'
)
# Show the first five rows as this cell's displayed output.
rawData.head(n=5)

Unnamed: 0,MFCCs_ 1,MFCCs_ 2,MFCCs_ 3,MFCCs_ 4,MFCCs_ 5,MFCCs_ 6,MFCCs_ 7,MFCCs_ 8,MFCCs_ 9,MFCCs_10,MFCCs_11,MFCCs_12,MFCCs_13,MFCCs_14,MFCCs_15,MFCCs_16,MFCCs_17,MFCCs_18,MFCCs_19,MFCCs_20,MFCCs_21,MFCCs_22,Family,Genus,Species,RecordID
0,1.0,0.152936,-0.105586,0.200722,0.317201,0.260764,0.100945,-0.150063,-0.171128,0.124676,0.188654,-0.075622,-0.156436,0.082245,0.135752,-0.024017,-0.108351,-0.077623,-0.009568,0.057684,0.11868,0.014038,Leptodactylidae,Adenomera,AdenomeraAndre,1
1,1.0,0.171534,-0.098975,0.268425,0.338672,0.268353,0.060835,-0.222475,-0.207693,0.170883,0.270958,-0.095004,-0.254341,0.022786,0.16332,0.012022,-0.090974,-0.05651,-0.035303,0.02014,0.082263,0.029056,Leptodactylidae,Adenomera,AdenomeraAndre,1
2,1.0,0.152317,-0.082973,0.287128,0.276014,0.189867,0.008714,-0.242234,-0.219153,0.232538,0.266064,-0.072827,-0.237384,0.050791,0.207338,0.083536,-0.050691,-0.02359,-0.066722,-0.025083,0.099108,0.077162,Leptodactylidae,Adenomera,AdenomeraAndre,1
3,1.0,0.224392,0.118985,0.329432,0.372088,0.361005,0.015501,-0.194347,-0.098181,0.270375,0.267279,-0.162258,-0.317084,-0.011567,0.100413,-0.050224,-0.136009,-0.177037,-0.130498,-0.054766,-0.018691,0.023954,Leptodactylidae,Adenomera,AdenomeraAndre,1
4,1.0,0.087817,-0.068345,0.306967,0.330923,0.249144,0.006884,-0.265423,-0.1727,0.266434,0.332695,-0.100749,-0.298524,0.037439,0.219153,0.062837,-0.048885,-0.053074,-0.08855,-0.031346,0.10861,0.079244,Leptodactylidae,Adenomera,AdenomeraAndre,1


In [86]:
# Family classification only: keep the first 23 columns of the raw frame
# (per the preview above, the MFCC features followed by the Family label).
data = rawData.loc[:, rawData.columns[:23]]
# Rows per Family, to check the class balance.
data.loc[:, 'Family'].value_counts()

Leptodactylidae    4420
Hylidae            2165
Dendrobatidae       542
Bufonidae            68
Name: Family, dtype: int64

Rebalancing is needed: undersample Leptodactylidae, oversample Dendrobatidae with SMOTE, and remove Bufonidae.

In [87]:
import numpy as np
from imblearn.over_sampling import SMOTE

# Rebalance the Family classes:
#   * Leptodactylidae is randomly undersampled down to LEPTO_TARGET rows,
#   * Dendrobatidae is oversampled with SMOTE up to the Hylidae count,
#   * Bufonidae is never selected below, so it is dropped from `data`.

# One fixed seed for both SMOTE and the random undersampling, so that
# Restart & Run All reproduces the same rebalanced frame.
SEED = 42
# Rows of Leptodactylidae to keep (4420 in the raw data, i.e. 1420 are dropped).
# Expressing this as a target rather than a drop count keeps the cell safe to
# re-run: `data` is overwritten below, and a second run then drops nothing more.
LEPTO_TARGET = 3000

lepto = data.loc[data['Family'] == 'Leptodactylidae']
hyli = data.loc[data['Family'] == 'Hylidae']
dendro = data.loc[data['Family'] == 'Dendrobatidae']

#oversample dendro with SMOTE to match hyli
subset = pd.concat([hyli, dendro])
# `columns=` / `axis=` are passed by keyword: the positional axis argument
# was deprecated in pandas 1.3 and removed in pandas 2.0.
X = subset.drop(columns=['Family'])
y = subset['Family']
oversample = SMOTE(random_state=SEED)
X, y = oversample.fit_resample(X, y) #X = data matrix, y = labels
oversample_df = pd.concat([pd.DataFrame(X), pd.DataFrame(y)], axis=1)
# Restore the original column names in case fit_resample returned bare arrays.
oversample_df.columns = lepto.columns.values

#randomly undersample lepto down to LEPTO_TARGET rows
rng = np.random.default_rng(SEED)
n_drop = max(len(lepto) - LEPTO_TARGET, 0)
undersample = lepto.drop(rng.choice(lepto.index.to_numpy(), size=n_drop, replace=False))

#rebalanced data
# NOTE(review): index labels are not reset here, so labels coming from
# `undersample` and `oversample_df` may collide -- confirm that later cells
# do not rely on unique index labels before switching to ignore_index=True.
data = pd.concat([undersample, oversample_df])
data['Family'].value_counts()



Leptodactylidae    3000
Dendrobatidae      2165
Hylidae            2165
Name: Family, dtype: int64

Improved balance compared to before. It is not perfect, but it is a compromise between losing too many Leptodactylidae cases and keeping a heavy imbalance in favor of the majority class.

In [88]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric
# columns of `data`, shown as this cell's output.
data.describe()

Unnamed: 0,MFCCs_ 1,MFCCs_ 2,MFCCs_ 3,MFCCs_ 4,MFCCs_ 5,MFCCs_ 6,MFCCs_ 7,MFCCs_ 8,MFCCs_ 9,MFCCs_10,MFCCs_11,MFCCs_12,MFCCs_13,MFCCs_14,MFCCs_15,MFCCs_16,MFCCs_17,MFCCs_18,MFCCs_19,MFCCs_20,MFCCs_21,MFCCs_22
count,7330.0,7330.0,7330.0,7330.0,7330.0,7330.0,7330.0,7330.0,7330.0,7330.0,7330.0,7330.0,7330.0,7330.0,7330.0,7330.0,7330.0,7330.0,7330.0,7330.0,7330.0,7330.0
mean,0.991045,0.319733,0.256475,0.39396,0.152852,0.122001,0.013395,-0.014618,0.084981,0.058318,-0.067914,0.008354,0.083971,0.005654,-0.037758,0.007641,0.030375,0.011087,-0.021658,-0.03841,0.020624,0.057505
std,0.064298,0.208382,0.280706,0.172009,0.172614,0.116889,0.181873,0.119312,0.185871,0.141335,0.18094,0.164276,0.207888,0.162309,0.191475,0.124927,0.151399,0.081136,0.086059,0.08892,0.079459,0.116603
min,-0.251179,-0.673025,-0.436028,-0.472676,-0.539779,-0.410417,-0.452555,-0.501465,-0.51346,-0.952266,-0.901989,-0.799441,-0.644116,-0.59038,-0.717156,-0.498675,-0.405825,-0.759322,-0.680745,-0.361649,-0.430812,-0.375461
25%,1.0,0.185027,0.042899,0.28757,0.062082,0.035193,-0.125782,-0.103382,-0.054405,-0.003163,-0.228529,-0.09915,-0.074867,-0.100792,-0.208734,-0.084383,-0.117474,-0.035657,-0.080865,-0.09601,-0.034753,-0.021585
50%,1.0,0.303629,0.227453,0.392531,0.164039,0.115156,-0.024104,-0.00976,0.131387,0.066891,-0.065091,0.02365,0.072462,-0.016921,-0.048727,0.015535,0.036537,0.01334,-0.013006,-0.038572,0.009706,0.038222
75%,1.0,0.431324,0.395358,0.531132,0.243181,0.198412,0.107384,0.063583,0.235975,0.137497,0.063058,0.099702,0.276193,0.128914,0.138522,0.078673,0.167611,0.061971,0.029016,0.01252,0.072029,0.1649
max,1.0,1.0,1.0,1.0,0.752246,0.96424,0.696887,0.551762,0.738033,0.522768,0.523033,0.66947,0.94571,0.566873,0.668924,0.6707,0.451186,0.614064,0.574209,0.467831,0.387363,0.432207
