# How to deal Imbalanced Data set

In [29]:
import pandas as pd

In [30]:
#import file
df = pd.read_csv('diabetes_up_down.csv')
df.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,0
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,0
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,0


In [31]:
df['Outcome'].value_counts()

0    720
1     48
Name: Outcome, dtype: int64

In [34]:
X=df.drop("Outcome",axis=1)
y=df.Outcome

# stratify method

- Some classification problems do not have a balanced number of examples for each class label. As such, it is desirable to split the dataset into train and test sets in a way that preserves the same proportions of examples in each class as observed in the original dataset.

  This is called a stratified train-test split.

In [32]:
from sklearn.model_selection import train_test_split

In [35]:
X_train,X_test,y_train,y_test=train_test_split(X,y,train_size=0.7,stratify = y)

# Upsampling

In [36]:
from sklearn.model_selection import train_test_split

X_train,X_test,y_train,y_test=train_test_split(X,y,train_size=0.7)

In [37]:
from sklearn.utils import resample

In [38]:
# concatenate our training data back together
X = pd.concat([X_train, y_train], axis=1)

In [39]:
not_dia = X[X.Outcome==0]
diabetic = X[X.Outcome==1]

In [40]:
not_dia.shape

(504, 9)

In [41]:
diabetic.shape

(33, 9)

In [42]:
# upsample minority
dia_upsampled = resample(diabetic,
                          replace=True, # sample with replacement
                          n_samples=len(not_dia), # match number in majority class
                          random_state=27) # reproducible results

In [43]:
# combine majority and upsampled minority
upsampled = pd.concat([not_dia, dia_upsampled])

In [44]:
# check new class counts
upsampled.Outcome.value_counts()

1    504
0    504
Name: Outcome, dtype: int64

# Downsampling

In [45]:
# downsample majority
not_dia_downsampled = resample(not_dia,
                                replace = False, # sample without replacement
                                n_samples = len(diabetic), # match minority n
                                random_state = 27) # reproducible results

In [46]:
# combine minority and downsampled majority
downsampled = pd.concat([not_dia_downsampled, diabetic])

In [47]:
# checking counts
downsampled.Outcome.value_counts()

1    33
0    33
Name: Outcome, dtype: int64

# Over Sampling

In [None]:
from collections import Counter

In [None]:
Counter(y_train)

In [None]:
from imblearn.combine import SMOTE

In [None]:
ove_smp=SMOTETomek(0.70)

X_train_ns,y_train_ns=ove_smp.fit_sample(X_train,y_train)

print("The number of classes before fit {}".format(Counter(y_train)))
print("The number of classes after fit {}".format(Counter(y_train_ns)))

# Downsampling

In [None]:
from imblearn.under_sampling import NearMiss

In [None]:
ds=NearMiss(0.8)

X_train_ns,y_train_ns=ds.fit_sample(X_train,y_train)
print("The number of classes before fit {}".format(Counter(y_train)))
print("The number of classes after fit {}".format(Counter(y_train_ns)))

# Label Encoder

**BRIDGE-TYPE**
- Arch
- Beam
- Truss
- Cantilever
- Tied Arch
- Suspension
- Cable

In [None]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import LabelEncoder

# creating initial dataframe

bridge_types = ('Arch','Beam','Truss','Cantilever','Tied Arch','Suspension','Cable')

bridge_df = pd.DataFrame(bridge_types, columns=['Bridge_Types'])

# creating instance of labelencoder

labelencoder = LabelEncoder()

# Assigning numerical values and storing in another column

bridge_df['Bridge_Types_Cat'] = labelencoder.fit_transform(bridge_df['Bridge_Types'])
bridge_df

# One Hot Encoder

In [None]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import OneHotEncoder

# creating instance of one-hot-encoder
enc = OneHotEncoder(handle_unknown='ignore')

# passing bridge-types-cat column (label encoded values of bridge_types)

enc_df = pd.DataFrame(enc.fit_transform(bridge_df[['Bridge_Types_Cat']]).toarray())

# merge with main df bridge_df on key values
bridge_df = bridge_df.join(enc_df)

bridge_df