# Dados desbalanceados

## UnderSampling

Tecnica que consiste na redução do volume de observações da classe com maior frequência

## Oversampling
Tecnica que consiste na geração de dados da classe com maior menor frequência. A geração pode ser através de uma reamostragem ou utilização de dados sinteticos (SMOTE).

In [98]:
import pandas as pd
from imblearn.over_sampling import RandomOverSampler, SMOTE
from imblearn.under_sampling import RandomUnderSampler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report, confusion_matrix
from xgboost import XGBClassifier
import plotly.express as px

In [99]:
df = pd.read_csv('creditcard.csv')

In [100]:
df.shape

(15936, 31)

In [101]:
df.head()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0.0
1,0,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,2.69,0.0
2,1,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0.0
3,1,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,...,-0.1083,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.5,0.0
4,2,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,...,-0.009431,0.798278,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,69.99,0.0


In [102]:
df = df.dropna()

In [103]:
df.Class.value_counts(normalize=True)

Class
0.0    0.995419
1.0    0.004581
Name: proportion, dtype: float64

In [104]:
X = df.drop('Class', axis=1)
y = df['Class']

In [105]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [131]:
y_train.value_counts()

Class
0.0    12695
1.0       53
Name: count, dtype: int64

In [130]:
y_test.value_counts()

Class
0.0    3167
1.0      20
Name: count, dtype: int64

In [106]:
oversampler = RandomOverSampler(sampling_strategy='auto', random_state=42)
X_train_oversampled, y_train_oversampled = oversampler.fit_resample(X_train, y_train)

In [107]:
y_train_oversampled.value_counts()

Class
0.0    12695
1.0    12695
Name: count, dtype: int64

In [108]:
y_train_oversampled.value_counts(normalize=True)

Class
0.0    0.5
1.0    0.5
Name: proportion, dtype: float64

In [109]:
X_train_oversampled[y_train_oversampled==1].drop_duplicates().shape

(53, 30)

In [110]:
smote = SMOTE(sampling_strategy='auto', random_state=42)
X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)

In [111]:
y_train_smote.value_counts(normalize=True)

Class
0.0    0.5
1.0    0.5
Name: proportion, dtype: float64

In [112]:
X_train_smote[y_train_smote==1].drop_duplicates().shape

(12695, 30)

In [113]:
undersampler = RandomUnderSampler(sampling_strategy='auto', random_state=42)
X_train_undersampled, y_train_undersampled = undersampler.fit_resample(X_train, y_train)

In [114]:
y_train_undersampled.value_counts()

Class
0.0    53
1.0    53
Name: count, dtype: int64

In [115]:
model = XGBClassifier()
model.fit(X_train, y_train)

In [116]:
model_oversampled = XGBClassifier()
model_oversampled.fit(X_train_oversampled, y_train_oversampled)

In [117]:
model_undersampled = XGBClassifier()
model_undersampled.fit(X_train_undersampled, y_train_undersampled)

In [118]:
model_smote = XGBClassifier()
model_smote.fit(X_train_smote, y_train_smote)

In [119]:
print(classification_report(y_test, model.predict(X_test)))

              precision    recall  f1-score   support

         0.0       1.00      1.00      1.00      3167
         1.0       0.90      0.95      0.93        20

    accuracy                           1.00      3187
   macro avg       0.95      0.97      0.96      3187
weighted avg       1.00      1.00      1.00      3187



In [120]:
confusion_matrix(y_test, model.predict(X_test))

array([[3165,    2],
       [   1,   19]])

In [121]:
print(classification_report(y_test, model_oversampled.predict(X_test)))

              precision    recall  f1-score   support

         0.0       1.00      1.00      1.00      3167
         1.0       0.90      0.95      0.93        20

    accuracy                           1.00      3187
   macro avg       0.95      0.97      0.96      3187
weighted avg       1.00      1.00      1.00      3187



In [122]:
confusion_matrix(y_test, model_oversampled.predict(X_test))

array([[3165,    2],
       [   1,   19]])

In [123]:
print(classification_report(y_test, model_undersampled.predict(X_test)))

              precision    recall  f1-score   support

         0.0       1.00      0.98      0.99      3167
         1.0       0.20      0.95      0.32        20

    accuracy                           0.98      3187
   macro avg       0.60      0.96      0.66      3187
weighted avg       0.99      0.98      0.98      3187



In [124]:
confusion_matrix(y_test, model_undersampled.predict(X_test))

array([[3089,   78],
       [   1,   19]])

In [125]:
print(classification_report(y_test, model_smote.predict(X_test)))

              precision    recall  f1-score   support

         0.0       1.00      1.00      1.00      3167
         1.0       0.95      0.95      0.95        20

    accuracy                           1.00      3187
   macro avg       0.97      0.97      0.97      3187
weighted avg       1.00      1.00      1.00      3187



In [126]:
confusion_matrix(y_test, model_smote.predict(X_test))

array([[3166,    1],
       [   1,   19]])

In [127]:
px.box(model.predict_proba(X_test)[:, 1], color=y_test)

In [128]:
px.box(model_oversampled.predict_proba(X_test)[:, 1], color=y_test)

In [132]:
px.box(model_smote.predict_proba(X_test)[:, 1], color=y_test)

In [133]:
px.box(model_undersampled.predict_proba(X_test)[:, 1], color=y_test)