# Handling Imbalanced Classes

In [None]:
# Uncomment and run the line below if imbalanced-learn is not installed
# (%pip installs into the environment of the running kernel)
#%pip install -U imbalanced-learn

In [None]:
import pandas as pd
import numpy as np
import os

import librosa as l
import librosa.display
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
# Read the feature table and discard its first column ('Unnamed: 0') in one
# chained expression, so no in-place mutation of the frame is needed.
final_df = pd.read_csv('final_df.csv').drop(columns='Unnamed: 0')

# Quick look at the first rows
final_df.head()

Unnamed: 0,Emotions_with_gender,Path,MFCC0,MFCC1,MFCC2,MFCC3,MFCC4,MFCC5,MFCC6,MFCC7,...,MEL125,MEL126,MEL127,TONZ0,TONZ1,TONZ2,TONZ3,TONZ4,TONZ5,Emotions_without_gender
0,13,Ravdess\03-01-01-01-01-01-01.wav,-697.7926,54.89004,0.663466,12.435786,7.733952,0.53075,-3.216631,-3.159394,...,5e-06,4e-06,3.206722e-07,-0.049044,0.020024,-0.018065,-0.064224,0.014611,0.006371,5
1,5,Ravdess\03-01-01-01-01-01-02.wav,-650.7109,54.477303,-9.090127,8.411754,-4.387536,-3.765706,-5.362752,-8.610381,...,6e-06,4e-06,3.992178e-07,-0.019344,0.012139,0.013491,-0.040532,0.006054,0.002813,5
2,13,Ravdess\03-01-01-01-01-01-03.wav,-614.73914,56.70819,-2.685535,10.650176,4.240806,-2.472097,-12.03572,-6.65451,...,7.1e-05,4.5e-05,4.472179e-06,-0.013746,-0.006525,0.013147,-0.001333,0.005258,-0.001753,5
3,5,Ravdess\03-01-01-01-01-01-04.wav,-695.8503,42.934265,-7.274557,8.977729,-4.170579,-4.92489,-6.53796,-12.679187,...,1.3e-05,6e-06,6.443871e-07,-0.006601,0.012613,-0.023542,0.016175,-0.010311,0.000834,5
4,13,Ravdess\03-01-01-01-01-01-05.wav,-713.4335,68.36094,7.989171,15.139791,11.715775,0.430983,1.002558,-2.773119,...,2e-06,1e-06,7.229193e-08,-0.023409,0.016632,-0.042659,0.019653,0.014472,0.010889,5


In [None]:
# Separate the feature matrix (X) from the target (y) ahead of the train-test split
from sklearn.model_selection import train_test_split

# Work on a copy so final_df itself is left as loaded
final_df_testing = final_df.copy()

# Columns that are not model inputs: the file path and both label encodings
non_feature_cols = ['Path', 'Emotions_without_gender', 'Emotions_with_gender']

X = final_df_testing.drop(columns=non_feature_cols)
y = final_df_testing['Emotions_with_gender']  # target: the gender-aware emotion label
print(X.shape, y.shape)


(11682, 173) (11682,)


> How to Balance Data With the Imbalanced-Learn Python Module?

A number of more sophisticated resampling techniques have been proposed in the scientific literature.

For example, we can cluster the records of the majority class and do the under-sampling by removing records from each cluster, thus seeking to preserve information. In over-sampling, instead of creating exact copies of the minority class records, we can introduce small variations into those copies, creating more diverse synthetic samples.

In [None]:
import imblearn

In [None]:
#%pip install --user imblearn

1. Synthetic Minority Oversampling Technique (SMOTE)

This technique generates synthetic data for the minority class.

SMOTE (Synthetic Minority Oversampling Technique) works by randomly picking a point from the minority class and computing the K-Nearest Neighbours (KNN) for this point. 

The synthetic points are added between the chosen point and its neighbors.


In [None]:
# Handle imbalanced classes with SMOTE
from imblearn.over_sampling import SMOTE
from collections import Counter

# Hold out the test set BEFORE resampling. Oversampling the full dataset first
# leaks information: synthetic points interpolated from rows that later land in
# the test split end up in the training data, and the test split itself gets
# filled with synthetic, artificially balanced rows. stratify=y keeps every
# class (including the rarest ones) represented in both splits.
X_train_orig, X_test, y_train_orig, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Fit SMOTE on the training portion only; X_test / y_test stay untouched,
# real-world-distributed data for evaluation.
smote = SMOTE(random_state=42)
X_smote, y_smote = smote.fit_resample(X_train_orig, y_train_orig)

# Keep the X_train / y_train names: they now refer to the balanced training set.
X_train, y_train = X_smote, y_smote

# Class counts of the full original data vs. the resampled training data
print('Original dataset shape', Counter(y))
print('\n')
print('Resample dataset shape', Counter(y_smote))

Original dataset shape Counter({4: 1096, 6: 1096, 0: 1096, 3: 1096, 2: 1096, 5: 960, 12: 767, 14: 767, 8: 767, 11: 767, 10: 767, 13: 623, 7: 496, 9: 96, 1: 96, 15: 96})


Resample dataset shape Counter({13: 1096, 5: 1096, 9: 1096, 1: 1096, 12: 1096, 4: 1096, 14: 1096, 6: 1096, 8: 1096, 0: 1096, 11: 1096, 3: 1096, 10: 1096, 2: 1096, 15: 1096, 7: 1096})


In [None]:
# Display the training features produced by the cell above
# (pandas abbreviates large frames with "..." rows/columns).
X_train

Unnamed: 0,MFCC0,MFCC1,MFCC2,MFCC3,MFCC4,MFCC5,MFCC6,MFCC7,MFCC8,MFCC9,...,MEL124,MEL125,MEL126,MEL127,TONZ0,TONZ1,TONZ2,TONZ3,TONZ4,TONZ5
4216,-323.188840,120.134110,-0.029831,36.332390,-11.275234,10.805964,-7.628929,4.438397,-12.600879,3.717046,...,1.041317e-10,1.002161e-10,9.765082e-11,9.610462e-11,0.009157,-0.026378,-0.027347,-0.028386,0.031865,-0.000065
13474,-309.427767,122.824340,5.453744,43.087184,-6.050287,3.312886,-18.949052,1.656981,-10.849359,-0.340586,...,7.643657e-09,7.493269e-09,7.395933e-09,7.335680e-09,0.013579,-0.004659,-0.012582,-0.043438,0.025056,0.014691
2344,-402.697360,133.295210,9.329925,44.645172,-0.694235,17.199997,-4.453174,7.496788,-13.071204,6.258895,...,1.118548e-09,1.100589e-09,1.089251e-09,1.082175e-09,-0.007738,-0.006760,-0.092582,-0.071640,0.012599,-0.003080
15889,-352.861733,126.173793,12.009321,48.134889,-2.535434,14.483966,-17.312315,-1.243245,-11.450712,3.725422,...,2.926382e-09,2.864489e-09,2.824459e-09,2.799784e-09,0.003818,-0.020056,-0.048475,-0.027015,0.013133,0.002957
15538,-385.825889,129.543581,17.437670,53.294446,-5.643975,13.338234,-16.266343,4.644759,-11.373107,-0.587393,...,1.716744e-09,1.686194e-09,1.666640e-09,1.654485e-09,-0.009223,-0.014700,-0.027979,-0.024828,0.018854,0.002780
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11284,-360.848500,25.322237,2.389316,-2.253565,-11.852676,-11.226645,-7.295867,-11.598573,-9.987568,4.051367,...,4.507123e-03,6.961903e-03,6.401034e-03,5.810609e-04,-0.013597,-0.005710,-0.005958,-0.005896,0.003284,-0.004198
11964,-643.930602,43.375405,-0.921632,8.834827,-3.503808,-6.425176,-7.176675,-7.848215,-5.222746,-2.534046,...,1.285590e-04,1.153061e-04,8.323990e-05,6.962403e-06,-0.002555,0.007183,0.021119,-0.028216,0.007377,0.011203
5390,-417.746580,144.542300,5.742746,60.546280,-12.731674,27.855953,-18.925367,7.340115,-12.128677,4.467893,...,5.293744e-10,5.191855e-10,5.126571e-10,5.086162e-10,0.008415,-0.026890,-0.072390,-0.061736,0.020704,0.013746
860,-376.013460,37.209137,-20.613798,7.973230,3.630531,-14.364474,-6.549491,-7.637344,-17.275866,-1.930863,...,1.640957e-02,1.839283e-02,8.359325e-03,6.435438e-04,-0.019943,0.008317,-0.031916,-0.014494,0.004247,-0.002384


2. Random Over-Sampling with imblearn

One way to fight imbalanced data is to generate new samples in the minority classes. 

The most naive strategy is to generate new samples by randomly sampling, with replacement, from the currently available samples. The RandomOverSampler offers such a scheme.

In [None]:
# import library
from imblearn.over_sampling import RandomOverSampler

# Hold out the test set BEFORE resampling. Random over-sampling duplicates
# existing rows, so resampling the full dataset first would put exact copies of
# the same recording in both the training and the test split and inflate the
# evaluation scores. stratify=y keeps every class represented in both splits.
X_train_orig, X_test, y_train_orig, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

ros = RandomOverSampler(random_state=42)

# fit predictor and target variable on the training portion only;
# X_test / y_test stay untouched for evaluation
X_ros, y_ros = ros.fit_resample(X_train_orig, y_train_orig)

# Keep the X_train / y_train names: they now refer to the balanced training set.
X_train, y_train = X_ros, y_ros

# Class counts of the full original data vs. the resampled training data
print('Original dataset shape', Counter(y))
print('\n')
print('Resample dataset shape', Counter(y_ros))

Original dataset shape Counter({4: 1096, 6: 1096, 0: 1096, 3: 1096, 2: 1096, 5: 960, 12: 767, 14: 767, 8: 767, 11: 767, 10: 767, 13: 623, 7: 496, 9: 96, 1: 96, 15: 96})


Resample dataset shape Counter({13: 1096, 5: 1096, 9: 1096, 1: 1096, 12: 1096, 4: 1096, 14: 1096, 6: 1096, 8: 1096, 0: 1096, 11: 1096, 3: 1096, 10: 1096, 2: 1096, 15: 1096, 7: 1096})


In [None]:
# Display the training features produced by the cell above
# (pandas abbreviates large frames with "..." rows/columns).
X_train

Unnamed: 0,MFCC0,MFCC1,MFCC2,MFCC3,MFCC4,MFCC5,MFCC6,MFCC7,MFCC8,MFCC9,...,MEL124,MEL125,MEL126,MEL127,TONZ0,TONZ1,TONZ2,TONZ3,TONZ4,TONZ5
4216,-323.18884,120.134110,-0.029831,36.332390,-11.275234,10.805964,-7.628929,4.438397,-12.600879,3.717046,...,1.041317e-10,1.002161e-10,9.765082e-11,9.610462e-11,0.009157,-0.026378,-0.027347,-0.028386,0.031865,-0.000065
13474,-232.05733,106.587940,-31.289646,29.438416,-6.920248,-0.790059,-8.199689,5.632292,-18.137290,-0.410988,...,6.590264e-10,6.491778e-10,6.429625e-10,6.390012e-10,-0.004316,-0.009834,-0.044473,-0.061705,0.017657,0.013565
2344,-402.69736,133.295210,9.329925,44.645172,-0.694235,17.199997,-4.453174,7.496788,-13.071204,6.258895,...,1.118548e-09,1.100589e-09,1.089251e-09,1.082175e-09,-0.007738,-0.006760,-0.092582,-0.071640,0.012599,-0.003080
15889,-407.95074,147.681750,-4.526673,55.816890,-3.070140,21.572863,-11.533528,13.819479,-19.684973,-2.488584,...,3.989871e-09,3.913507e-09,3.864304e-09,3.833858e-09,-0.017550,0.018348,-0.075814,-0.090785,0.017591,-0.012281
15538,-344.89783,120.194480,1.212452,47.359196,0.441234,20.139710,-20.103554,0.283222,-9.962472,-0.421240,...,9.182060e-09,8.992510e-09,8.869655e-09,8.793818e-09,0.003470,0.009708,0.046927,-0.014812,0.004383,-0.010025
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11284,-360.84850,25.322237,2.389316,-2.253565,-11.852676,-11.226645,-7.295867,-11.598573,-9.987568,4.051367,...,4.507123e-03,6.961903e-03,6.401034e-03,5.810609e-04,-0.013597,-0.005710,-0.005958,-0.005896,0.003284,-0.004198
11964,-740.86554,40.050896,-3.229835,3.451206,-3.655160,-5.564048,-11.351340,-13.256773,-9.016317,-6.951468,...,2.665886e-05,3.798963e-05,2.730036e-05,2.603503e-06,-0.022411,0.022289,0.007788,0.083308,-0.002216,0.014698
5390,-417.74658,144.542300,5.742746,60.546280,-12.731674,27.855953,-18.925367,7.340115,-12.128677,4.467893,...,5.293744e-10,5.191855e-10,5.126571e-10,5.086162e-10,0.008415,-0.026890,-0.072390,-0.061736,0.020704,0.013746
860,-376.01346,37.209137,-20.613798,7.973230,3.630531,-14.364474,-6.549491,-7.637344,-17.275866,-1.930863,...,1.640957e-02,1.839283e-02,8.359325e-03,6.435438e-04,-0.019943,0.008317,-0.031916,-0.014494,0.004247,-0.002384


# The difference between SMOTE and Random Over-Sampling with imblearn

1. SMOTE (Synthetic Minority Over-sampling Technique):

>Method:
- SMOTE creates synthetic samples by interpolating between existing minority class instances.
- For each minority class instance, SMOTE selects its k-nearest neighbors and generates synthetic instances along the line segments connecting the instance to its neighbors.
- This results in new instances that lie within the convex hull of the minority class.

>Effect:
- SMOTE introduces diversity to the synthetic samples by creating instances that are not direct duplicates of existing minority class instances.
- It can be particularly effective when the minority class has complex decision boundaries.

2. Random Over-sampling with imblearn:

> Method:
- Random over-sampling involves replicating randomly selected instances from the minority class.
- It creates additional copies of existing minority class instances without considering the relationships between instances.

> Effect:
- Random over-sampling is simpler and quicker to implement compared to SMOTE but may result in duplicated instances that do not introduce much diversity to the minority class.

# Choosing Between SMOTE and Random Over-sampling:

Use SMOTE when you want to introduce diversity to the synthetic samples and create instances that are not direct duplicates of existing minority class instances.

Use random over-sampling when you want a simpler approach and are less concerned about introducing diversity to the synthetic samples.

<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=0b28ef1e-f6a6-4523-8903-70adcffed1c5' target="_blank">
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>