<img src="imagens/logo_oficial.jpg"/>

# Série Fogo no Parquinho: Desbalanceio

#### https://www.kaggle.com/datasets/devansodariya/student-performance-data

### Importações

In [1]:
# Pacotes Python aqui. 
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler
from collections import Counter  


In [2]:
# # Caso precise instalar a biblioteca imblearn
#!pip install imblearn

## Declarações de Funções

## Leitura de Base de Dados

In [3]:
estudantes = pd.read_csv('bases/student_data.csv')

In [4]:
estudantes.shape

(395, 33)

In [5]:
estudantes.columns

Index(['school', 'sex', 'age', 'address', 'famsize', 'Pstatus', 'Medu', 'Fedu',
       'Mjob', 'Fjob', 'reason', 'guardian', 'traveltime', 'studytime',
       'failures', 'schoolsup', 'famsup', 'paid', 'activities', 'nursery',
       'higher', 'internet', 'romantic', 'famrel', 'freetime', 'goout', 'Dalc',
       'Walc', 'health', 'absences', 'G1', 'G2', 'G3'],
      dtype='object')

In [6]:
# Visualizar o tipo de cada dado e nome da coluna
estudantes.dtypes

school        object
sex           object
age            int64
address       object
famsize       object
Pstatus       object
Medu           int64
Fedu           int64
Mjob          object
Fjob          object
reason        object
guardian      object
traveltime     int64
studytime      int64
failures       int64
schoolsup     object
famsup        object
paid          object
activities    object
nursery       object
higher        object
internet      object
romantic      object
famrel         int64
freetime       int64
goout          int64
Dalc           int64
Walc           int64
health         int64
absences       int64
G1             int64
G2             int64
G3             int64
dtype: object

In [7]:
# Visualizando alguns registros
estudantes.head(5)

Unnamed: 0,school,sex,age,address,famsize,Pstatus,Medu,Fedu,Mjob,Fjob,...,famrel,freetime,goout,Dalc,Walc,health,absences,G1,G2,G3
0,GP,F,18,U,GT3,A,4,4,at_home,teacher,...,4,3,4,1,1,3,6,5,6,6
1,GP,F,17,U,GT3,T,1,1,at_home,other,...,5,3,3,1,1,3,4,5,5,6
2,GP,F,15,U,LE3,T,1,1,at_home,other,...,4,3,2,2,3,3,10,7,8,10
3,GP,F,15,U,GT3,T,4,2,health,services,...,3,2,2,1,1,5,2,15,14,15
4,GP,F,16,U,GT3,T,3,3,other,other,...,4,3,2,1,2,5,4,6,10,10


In [8]:
# Verificando a existência de valores NULOS em qualquer coluna
estudantes.isnull().sum()

school        0
sex           0
age           0
address       0
famsize       0
Pstatus       0
Medu          0
Fedu          0
Mjob          0
Fjob          0
reason        0
guardian      0
traveltime    0
studytime     0
failures      0
schoolsup     0
famsup        0
paid          0
activities    0
nursery       0
higher        0
internet      0
romantic      0
famrel        0
freetime      0
goout         0
Dalc          0
Walc          0
health        0
absences      0
G1            0
G2            0
G3            0
dtype: int64

In [9]:
# Vamos trabalhar apenas com algumas
estudantes = estudantes[['school','sex','age','address','studytime','freetime','absences','G1','G2','G3']]

In [10]:
estudantes.columns

Index(['school', 'sex', 'age', 'address', 'studytime', 'freetime', 'absences',
       'G1', 'G2', 'G3'],
      dtype='object')

In [11]:
estudantes.head(5)

Unnamed: 0,school,sex,age,address,studytime,freetime,absences,G1,G2,G3
0,GP,F,18,U,2,3,6,5,6,6
1,GP,F,17,U,2,3,4,5,5,6
2,GP,F,15,U,2,3,10,7,8,10
3,GP,F,15,U,3,2,2,15,14,15
4,GP,F,16,U,2,3,4,6,10,10


## Divisão entre FEATURES e LABEL

In [13]:
def transformar_label(escola):
    if(escola == 'GP'):
        return 0
    return 1 # MS

In [14]:
estudantes['school'] = estudantes['school'].apply(transformar_label)

In [15]:
estudantes['school']

0      0
1      0
2      0
3      0
4      0
      ..
390    1
391    1
392    1
393    1
394    1
Name: school, Length: 395, dtype: int64

In [16]:
# Prever o school do estudante: GP ou MS.
features_x = estudantes.drop(['school'], axis = 1)
label_y = estudantes['school']

In [17]:
features_x

Unnamed: 0,sex,age,address,studytime,freetime,absences,G1,G2,G3
0,F,18,U,2,3,6,5,6,6
1,F,17,U,2,3,4,5,5,6
2,F,15,U,2,3,10,7,8,10
3,F,15,U,3,2,2,15,14,15
4,F,16,U,2,3,4,6,10,10
...,...,...,...,...,...,...,...,...,...
390,M,20,U,2,5,11,9,9,9
391,M,17,U,1,4,3,14,16,16
392,M,21,R,1,5,3,10,8,7
393,M,18,R,1,4,0,11,12,10


In [18]:
label_y

0      0
1      0
2      0
3      0
4      0
      ..
390    1
391    1
392    1
393    1
394    1
Name: school, Length: 395, dtype: int64

## Tratamento das Variáveis Categóricas e Numéricas

In [19]:
var_categoricas = ['sex','address']
features_x_categoricas = pd.get_dummies(estudantes[var_categoricas])
features_x_categoricas

Unnamed: 0,sex_F,sex_M,address_R,address_U
0,1,0,0,1
1,1,0,0,1
2,1,0,0,1
3,1,0,0,1
4,1,0,0,1
...,...,...,...,...
390,0,1,0,1
391,0,1,0,1
392,0,1,1,0
393,0,1,1,0


In [20]:
var_numericas = ['age','studytime','freetime','absences','G1','G2','G3']
features_x_num = estudantes[var_numericas]
features_x_num

Unnamed: 0,age,studytime,freetime,absences,G1,G2,G3
0,18,2,3,6,5,6,6
1,17,2,3,4,5,5,6
2,15,2,3,10,7,8,10
3,15,3,2,2,15,14,15
4,16,2,3,4,6,10,10
...,...,...,...,...,...,...,...
390,20,2,5,11,9,9,9
391,17,1,4,3,14,16,16
392,21,1,5,3,10,8,7
393,18,1,4,0,11,12,10


In [21]:
# JOIN dos dataframes
estudantes_tratado = pd.concat([features_x_categoricas, features_x_num],axis = 1)
estudantes_tratado

Unnamed: 0,sex_F,sex_M,address_R,address_U,age,studytime,freetime,absences,G1,G2,G3
0,1,0,0,1,18,2,3,6,5,6,6
1,1,0,0,1,17,2,3,4,5,5,6
2,1,0,0,1,15,2,3,10,7,8,10
3,1,0,0,1,15,3,2,2,15,14,15
4,1,0,0,1,16,2,3,4,6,10,10
...,...,...,...,...,...,...,...,...,...,...,...
390,0,1,0,1,20,2,5,11,9,9,9
391,0,1,0,1,17,1,4,3,14,16,16
392,0,1,1,0,21,1,5,3,10,8,7
393,0,1,1,0,18,1,4,0,11,12,10


In [22]:
label_y

0      0
1      0
2      0
3      0
4      0
      ..
390    1
391    1
392    1
393    1
394    1
Name: school, Length: 395, dtype: int64

## Divisão entre Dados de Treino e Teste

In [23]:
x_train, x_test, y_train, y_teste = train_test_split(estudantes_tratado,label_y,test_size = 0.30,random_state = 7)

In [24]:
print('Tamanho total: ',len(estudantes))
print('Tamanho total do TREINAMENTO: ',len(x_train))
print('Tamanho total do TESTE: ',len(x_test))

Tamanho total:  395
Tamanho total do TREINAMENTO:  276
Tamanho total do TESTE:  119


In [25]:
print('Tamanho total do TREINAMENTO: ',len(x_train))
print('Qtde de REGISTROS 0 (GP): ',len(y_train.loc[lambda x : x == 0]))
print('Qtde de REGISTROS 1 (MS): ',len(y_train.loc[lambda x : x == 1]))

print('Percentual de Qtde de REGISTROS 0 (GP) %: ',(len(y_train.loc[lambda x : x == 0])/len(y_train))*100)
print('Percentual de Qtde de REGISTROS 1 (MS) %: ',(len(y_train.loc[lambda x : x == 1])/len(y_train))*100)

Tamanho total do TREINAMENTO:  276
Qtde de REGISTROS 0 (GP):  246
Qtde de REGISTROS 1 (MS):  30
Percentual de Qtde de REGISTROS 0 (GP) %:  89.13043478260869
Percentual de Qtde de REGISTROS 1 (MS) %:  10.869565217391305


In [26]:
print('Tamanho total do TESTE: ',len(x_test))
print('Qtde de REGISTROS 0 (GP): ',len(y_teste.loc[lambda x : x == 0]))
print('Qtde de REGISTROS 1 (MS): ',len(y_teste.loc[lambda x : x == 1]))

print('Percentual de Qtde de REGISTROS 0 (GP) %: ',(len(y_teste.loc[lambda x : x == 0])/len(y_teste))*100)
print('Percentual de Qtde de REGISTROS 1 (MS) %: ',(len(y_teste.loc[lambda x : x == 1])/len(y_teste))*100)

Tamanho total do TESTE:  119
Qtde de REGISTROS 0 (GP):  103
Qtde de REGISTROS 1 (MS):  16
Percentual de Qtde de REGISTROS 0 (GP) %:  86.5546218487395
Percentual de Qtde de REGISTROS 1 (MS) %:  13.445378151260504


## Treinamento do Algoritmo

In [27]:
clf = RandomForestClassifier(random_state=7)
clf = clf.fit(x_train, y_train)

## Previsões e Avaliação de Resultados

In [28]:
previsoes = clf.predict(x_test)

In [29]:
print('',classification_report(y_teste, previsoes))

               precision    recall  f1-score   support

           0       0.89      0.98      0.94       103
           1       0.67      0.25      0.36        16

    accuracy                           0.88       119
   macro avg       0.78      0.62      0.65       119
weighted avg       0.86      0.88      0.86       119



## Técnica de REAMOSTRAGEM ALEATÓRIA: OVERSAMPLING e UNDERSAMPLING

Obs: aplique o balanceamente somente nos dados de TREINAMENTO, jamais de TESTE. </br> 
<p> OVER-SAMPLING. Vantagem: nenhuma informação é perdida. Desvantagem: o custo computacional aumenta e pode piorar o desempenho do algoritmo para as classes minoritárias. </br> 
<p> UNDER-SAMPLING. Vantagem: reduz o tempo computacional. Desvantagem: descarta informações da classe minoritária o que pode levar uma performance inferior para as predições dela. </br> 

Lembre-se: "NO FREE LUNCH"

## Teste 1: Avaliando o Modelo aplicando OVERSAMPLING

In [30]:
ros = RandomOverSampler() 
# reamostrando X, y 
X_reamostrado, y_reamostrado = ros.fit_resample(x_train,y_train)

In [31]:
Counter(y_reamostrado)

Counter({0: 246, 1: 246})

In [32]:
# Avaliar modelo com a nova distribuição desses dados
clf = RandomForestClassifier(random_state=7)
clf = clf.fit(X_reamostrado, y_reamostrado)

In [33]:
previsoes = clf.predict(x_test)

In [None]:
#0       0.89      0.98      0.94       103
#1       0.67      0.25      0.36        16

In [34]:
print('',classification_report(y_teste, previsoes))

               precision    recall  f1-score   support

           0       0.90      0.96      0.93       103
           1       0.56      0.31      0.40        16

    accuracy                           0.87       119
   macro avg       0.73      0.64      0.66       119
weighted avg       0.85      0.87      0.86       119



## Teste 2: Avaliando o Modelo aplicando UNDERSAMPLE

In [35]:
ros = RandomUnderSampler() 
# reamostrando X, y 
X_reamostrado, y_reamostrado = ros.fit_resample(x_train,y_train)

In [39]:
Counter(y_reamostrado)

Counter({0: 30, 1: 30})

In [36]:
# Avaliar modelo com a nova distribuição desses dados
clf = RandomForestClassifier(random_state=7)
clf = clf.fit(X_reamostrado, y_reamostrado)

In [37]:
previsoes = clf.predict(x_test)

In [None]:
#0       0.89      0.98      0.94       103
#1       0.67      0.25      0.36        16

In [38]:
print('',classification_report(y_teste, previsoes))

               precision    recall  f1-score   support

           0       1.00      0.75      0.86       103
           1       0.38      1.00      0.55        16

    accuracy                           0.78       119
   macro avg       0.69      0.87      0.70       119
weighted avg       0.92      0.78      0.81       119



## Teste 3: Avaliando ambas as técnicas

In [40]:
over = RandomOverSampler(sampling_strategy=0.5) # razão entre num_amostra_classe_min / num_amostra_classe_maj após reamostragem. 
under = RandomUnderSampler(sampling_strategy=0.8) # razão entre num_amostra_classe_min / num_amostra_classe_maj após reamostragem. 

X_reamostrado_over, y_reamostrado_over = over.fit_resample(x_train,y_train)
print(f"Oversampled: {Counter(y_reamostrado_over)}")


Oversampled: Counter({0: 246, 1: 123})


In [41]:
X_combined_sampling, y_combined_sampling = under.fit_resample(X_reamostrado_over, y_reamostrado_over)
print(f"Oversampled: {Counter(y_combined_sampling)}")

Oversampled: Counter({0: 153, 1: 123})


In [42]:
# Avaliar modelo com a nova distribuição desses dados
clf = RandomForestClassifier(random_state=7)
clf = clf.fit(X_combined_sampling, y_combined_sampling)

In [43]:
previsoes = clf.predict(x_test)

In [None]:
#0       1.00      0.75      0.86       103
#1       0.38      1.00      0.55        16

In [44]:
print('',classification_report(y_teste, previsoes))

               precision    recall  f1-score   support

           0       0.92      0.94      0.93       103
           1       0.54      0.44      0.48        16

    accuracy                           0.87       119
   macro avg       0.73      0.69      0.71       119
weighted avg       0.86      0.87      0.87       119

