In [1]:
import pandas as pd
import numpy as np
import scipy.stats as sp
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# NOTE: adjust these two paths to wherever the feature CSV files are stored.
TRAIN_CSV = "RavdessAudioOnlyFeatures_TRAIN.csv"
TEST_CSV = "RavdessAudioOnlyFeatures_TEST.csv"

df_train = pd.read_csv(TRAIN_CSV)
df_test = pd.read_csv(TEST_CSV)

In [3]:
# Preview the first five rows of the training frame.
df_train.head(n=5)

Unnamed: 0,modality,vocal_channel,emotion,emotional_intensity,statement,repetition,actor,sex,filename,frame_count,...,stft_max_w4,stft_q01_w4,stft_q05_w4,stft_q25_w4,stft_q50_w4,stft_q75_w4,stft_q95_w4,stft_q99_w4,stft_kur_w4,stft_skew_w4
0,audio-only,speech,neutral,normal,Kids are talking by the door,1st,1,M,03-01-01-01-01-01-01.wav,158558,...,1.0,0.0,0.566462,0.709962,0.799141,0.896606,1.0,1.0,6.74219,-1.706215
1,audio-only,speech,neutral,normal,Kids are talking by the door,2nd,1,M,03-01-01-01-01-02-01.wav,160160,...,1.0,0.368623,0.472736,0.623183,0.744908,0.874713,1.0,1.0,-0.70042,-0.201495
2,audio-only,speech,neutral,normal,Dogs are sitting by the door,1st,1,M,03-01-01-01-02-01-01.wav,156956,...,1.0,0.0,0.417919,0.643636,0.774253,0.899156,1.0,1.0,1.688986,-1.024773
3,audio-only,speech,neutral,normal,Dogs are sitting by the door,2nd,1,M,03-01-01-01-02-02-01.wav,152152,...,1.0,0.30628,0.399641,0.60691,0.755213,0.886474,1.0,1.0,-0.594111,-0.412871
4,audio-only,speech,calm,normal,Kids are talking by the door,1st,1,M,03-01-02-01-01-01-01.wav,169769,...,1.0,0.248765,0.428202,0.634815,0.759914,0.878014,1.0,1.0,0.126535,-0.620782


In [4]:
def spearman_selector(target, df, k):
    """Select the numeric features strongly rank-correlated with a binary target.

    Parameters
    ----------
    target : pd.Series
        Binary label column (exactly two distinct values). The first distinct
        value encountered is encoded as 0 and the second as 1.
    df : pd.DataFrame
        Candidate features. Non-numeric columns are ignored.
    k : float
        Correlation threshold. A column is kept when its Spearman coefficient
        with the encoded target is strictly above ``k`` or strictly below
        ``-k``.

    Returns
    -------
    pd.DataFrame
        A copy of the selected columns of ``df``, in their original order.

    Raises
    ------
    ValueError
        If ``target`` does not contain exactly two distinct values.
    """
    labels = target.unique()
    if len(labels) != 2:
        # A non-binary target would be mapped to NaN and silently select nothing.
        raise ValueError(
            "target must contain exactly two distinct values, found %d" % len(labels)
        )
    encoded = target.map({labels[0]: 0, labels[1]: 1})

    numeric = df.select_dtypes(include="number")
    selected = []
    for col in numeric.columns:
        rho = sp.spearmanr(numeric[col], encoded)[0]
        # A NaN coefficient compares False here, so such columns are dropped.
        if abs(rho) > k:
            selected.append(col)

    # Select all columns at once instead of inserting them one by one, which
    # fragments the frame when many features pass the threshold.
    return numeric[selected].copy()

In [6]:
# Keep the numeric features whose Spearman coefficient with `sex` is above 0.5 or below -0.5.
df_sex = spearman_selector(target=df_train["sex"], df=df_train, k=0.5)

In [8]:
# Add the `sex` label column to the selected-features dataframe.
df_sex = df_sex.assign(sex=df_train["sex"])

In [9]:
# Feature matrix: every column except the label.
X = df_sex.loc[:, df_sex.columns != 'sex']
# Label array; selecting with a list keeps it two-dimensional, shape (n_rows, 1).
y = np.array(df_sex.loc[:, ['sex']])

In [10]:
# Distinct labels and how many rows carry each one.
np.unique(y.ravel(), return_counts=True)

(array(['F', 'M'], dtype=object), array([892, 936]))

#### Vogliamo che la classe F (minoritaria nel dataset originale) rappresenti il 96% dei dati. Avendo 892 casi di F, il totale deve essere pari a 892 / 0.96 ≈ 929. Di conseguenza la classe M (maggioritaria nel dataset originale) sarà rappresentata al 4% e avrà 929 − 892 = 37 casi

In [11]:
# Choose which 'M' rows to drop so that 'F' ends up at TARGET_F_SHARE of the data.
TARGET_F_SHARE = 0.96
n_female = int((df_sex["sex"] == "F").sum())
male_index = df_sex.index[df_sex["sex"] == "M"]
# Total rows needed for the target share, minus the 'F' rows, gives the 'M' rows to keep.
n_male_keep = int(round(n_female / TARGET_F_SHARE)) - n_female
# A dedicated seeded generator makes the sample reproducible on Restart & Run All
# without touching numpy's global random state.
rows_remove = np.random.RandomState(42).choice(
    male_index, len(male_index) - n_male_keep, replace=False
)

In [12]:
# Remove the sampled rows (by index label) and report how many rows remain.
df2_sex = df_sex.drop(index=rows_remove)
print(df2_sex.shape[0])

929


In [13]:
# Features and labels of the imbalanced frame. Both are read from df2_sex itself,
# so they stay row-aligned whatever the index looks like; comparing positional
# offsets of `y` against the index labels in `rows_remove` is only correct for a
# default RangeIndex and scans `rows_remove` once per row.
X2 = df2_sex.loc[:, df2_sex.columns != 'sex']
# Selecting with a list keeps y2 two-dimensional, shape (n_rows, 1).
y2 = np.array(df2_sex[['sex']])

In [14]:
# Distinct labels and how many rows carry each one after dropping rows.
np.unique(y2.ravel(), return_counts=True)

(array(['F', 'M'], dtype=object), array([892,  37]))

In [15]:
# (rows, columns) of the frame left after dropping the sampled rows.
df2_sex.shape

(929, 45)

#### Train and Test dataset for application of oversampling

In [16]:
from sklearn.model_selection import train_test_split

# Stratify on the label so both splits keep the same class proportions: one
# class is only about 4% of the rows, and an unstratified split could leave the
# test set with very few of them. np.ravel flattens the (n_rows, 1) label array.
X_train, X_test, y_train, y_test = train_test_split(
    X2, y2, test_size=0.25, random_state=42, stratify=np.ravel(y2)
)

#### Procediamo ad aumentare le istanze della majority class (M) in modo tale che, quando applichiamo i metodi di undersampling, non avremo classi contenenti meno di 100 valori, in quanto troppo pochi per fare il training di un modello

In [18]:
# Pick the 'M' rows to duplicate in order to build the imbalanced dataset used
# for the undersampling methods. Sampling is with replacement (the default of
# `choice`) and draws 8 extra rows per existing 'M' row.
male_rows = df_sex.index[df_sex["sex"] == "M"]
# A dedicated seeded generator makes the sample reproducible on Restart & Run All
# without touching numpy's global random state.
rows_add = np.random.RandomState(42).choice(male_rows, len(male_rows) * 8)
rows_add_frame = df_sex.loc[rows_add]

In [19]:
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0; pd.concat
# stacks the frames the same way and, like append, keeps the original index labels.
df3_sex = pd.concat([df_sex, rows_add_frame])

In [20]:
# Feature matrix: every column except the label.
X3 = df3_sex.drop(columns=['sex'])
# Label array; selecting with a list keeps it two-dimensional, shape (n_rows, 1).
y3 = np.array(df3_sex[['sex']])

In [21]:
# Distinct labels and how many rows carry each one after adding the duplicated rows.
np.unique(y3.ravel(), return_counts=True)

(array(['F', 'M'], dtype=object), array([ 892, 8424]))

#### Train and Test dataset for application of undersampling