## LOADING AND PREPROCESSING

In [1]:
import os

import numpy as np
import pandas as pd
# Location of the diabetes CSV. Set the DIABETES_CSV environment variable to
# run the notebook on another machine; when it is unset, the original
# author's local path is used, so existing behaviour is unchanged.
DATA_PATH = os.environ.get(
    "DIABETES_CSV", "/Users/saitejatangudu/Desktop/DATASETS/diabetes.csv"
)
data = pd.read_csv(DATA_PATH)
data.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [2]:
# Work on a copy of the frame without 'Outcome' and 'Pregnancies'; the
# remaining columns are the ones whose zeros get turned into NaN in the
# following cell.
Impute_data = data.drop(columns=['Outcome', 'Pregnancies'])
columns = Impute_data.columns
columns

Index(['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI',
       'DiabetesPedigreeFunction', 'Age'],
      dtype='object')

In [3]:
# Replace every literal 0 with NaN in one vectorised pass. The cast to float
# comes first so that all columns end up as float, including those that
# contain no zeros.
Impute_data = Impute_data.astype(float).mask(lambda frame: frame == 0)

In [4]:
# Overwrite the columns of `data` that were cleaned in Impute_data, leaving
# the other columns as they were.
cleaned = [name for name in data.columns if name in columns]
data[cleaned] = Impute_data[cleaned]

# From here on `columns` holds the full column list of `data`.
columns = data.columns
data.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148.0,72.0,35.0,,33.6,0.627,50.0,1
1,1,85.0,66.0,29.0,,26.6,0.351,31.0,0
2,8,183.0,64.0,,,23.3,0.672,32.0,1
3,1,89.0,66.0,23.0,94.0,28.1,0.167,21.0,0
4,0,137.0,40.0,35.0,168.0,43.1,2.288,33.0,1


In [5]:
from sklearn.impute import KNNImputer
# Fill the NaNs from the 5 nearest rows.
# The label column is held out of the imputation: fitting the imputer on a
# frame that still contains 'Outcome' lets the target influence the imputed
# feature values (target leakage), which also contaminates the test split
# exported later.
# NOTE(review): the imputer is still fitted on all rows before the train/test
# split; fitting it on the training rows only would remove the remaining leak.
Imputer = KNNImputer(n_neighbors=5)
feature_columns = [name for name in columns if name != 'Outcome']
imputed_features = pd.DataFrame(
    Imputer.fit_transform(data[feature_columns]),
    columns=feature_columns,
    index=data.index,
)

# Re-attach the untouched label (as float, matching the all-float frame the
# original cell produced) and restore the original column order.
data = imputed_features.assign(Outcome=data['Outcome'].astype(float))[list(columns)]
data.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6.0,148.0,72.0,35.0,169.0,33.6,0.627,50.0,1.0
1,1.0,85.0,66.0,29.0,58.6,26.6,0.351,31.0,0.0
2,8.0,183.0,64.0,25.8,164.6,23.3,0.672,32.0,1.0
3,1.0,89.0,66.0,23.0,94.0,28.1,0.167,21.0,0.0
4,0.0,137.0,40.0,35.0,168.0,43.1,2.288,33.0,1.0


In [6]:
# import module
from sklearn.preprocessing import MinMaxScaler
# Min-max scale every column of `data` (the label column included).
# NOTE(review): the scaler is fitted on all rows before the train/test split,
# so test rows influence the scaling — confirm this is acceptable.
scaler = MinMaxScaler()
model = scaler.fit(data)

# Wrap the transformed array in a DataFrame again so the column names survive.
data = pd.DataFrame(model.transform(data), columns=columns)
data.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,0.352941,0.670968,0.489796,0.304348,0.186298,0.314928,0.234415,0.483333,1.0
1,0.058824,0.264516,0.428571,0.23913,0.053606,0.171779,0.116567,0.166667,0.0
2,0.470588,0.896774,0.408163,0.204348,0.18101,0.104294,0.253629,0.183333,1.0
3,0.058824,0.290323,0.428571,0.173913,0.096154,0.202454,0.038002,0.0,0.0
4,0.0,0.6,0.163265,0.304348,0.185096,0.509202,0.943638,0.2,1.0


## DATA RESAMPLING

### GENERATING TEST DATA

In [7]:
# Features are every column except the label; the label is 'Outcome'.
X = data.drop(columns='Outcome')
y = data['Outcome']

In [8]:
from sklearn.model_selection import train_test_split
# Hold out 15% of the rows as the test set; the fixed seed makes the split
# repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    X, y, test_size=0.15, random_state=42
)

In [9]:
# Class counts per split, in the order:
# train label 1, train label 0, test label 1, test label 0.
split_counts = [
    (labels == label_value).sum()
    for labels in (y_train, y_test)
    for label_value in (1, 0)
]
print(*split_counts)

228 424 40 76


In [10]:
# Ratio of label-1 rows to label-0 rows over the train and test splits combined.
all_labels = pd.concat([y_train, y_test])
(all_labels == 1).sum() / (all_labels == 0).sum()

0.536

In [11]:
# Write the held-out features and labels to separate CSV files, without the
# index column.
for frame, csv_name in (
    (x_test, "pima_test_data.csv"),
    (y_test, "pima_test_label.csv"),
):
    frame.to_csv(csv_name, index=False)

### GENERATING EXT_IMB DATASET(05:95)

In [12]:
from collections import Counter
from sklearn.datasets import make_classification
from imblearn.over_sampling import ADASYN 
from imblearn.under_sampling import RandomUnderSampler

In [13]:
# Start from the training split and report its class balance.
X = x_train
y = y_train
print('Original dataset shape %s' % Counter(y))

Original dataset shape Counter({0.0: 424, 1.0: 228})


In [14]:
# Target class sizes: 5% / 95% of the training-set row count.
class_counts = Counter(y)
n_total = class_counts[1] + class_counts[0]
no_minority, no_majority = n_total*0.05, n_total*0.95

# Step 1: ADASYN is asked to bring class 0 up to the 95% target while class 1
# keeps its current count.
ada = ADASYN(
    random_state=42,
    sampling_strategy={1: class_counts[1], 0: round(no_majority)},
)
X_res, y_res = ada.fit_resample(X, y)
print('Resampled dataset shape After oversampling majority class %s' % Counter(y_res))

# Step 2: class 1 is randomly undersampled to the 5% target; class 0 keeps the
# count it has after step 1.
rnd = RandomUnderSampler(
    random_state=42,
    sampling_strategy={1: round(no_minority), 0: Counter(y_res)[0]},
)
X_final, y_final = rnd.fit_resample(X_res, y_res)
print('Resampled dataset shape with 05:95 ratio%s' % Counter(y_final))

Resampled dataset shape After oversampling majority class Counter({0.0: 604, 1.0: 228})
Resampled dataset shape with 05:95 ratioCounter({0.0: 604, 1.0: 33})




In [15]:
# Save the 05:95 dataset: features and labels in separate CSVs, index omitted.
for frame, csv_name in (
    (X_final, "pima_ext_imb_data.csv"),
    (y_final, "pima_ext_imb_label.csv"),
):
    frame.to_csv(csv_name, index=False)

### GENERATING 15285_IMB DATASET(15:85)

In [16]:
# Rebind X and y to the training split, then show its class counts.
X = x_train
y = y_train
print('Original dataset shape %s' % Counter(y))

Original dataset shape Counter({0.0: 424, 1.0: 228})


In [17]:
# Target class sizes: 15% / 85% of the training-set row count.
no_minority, no_majority = (Counter(y)[1]+Counter(y)[0])*0.15,(Counter(y)[1]+Counter(y)[0])*0.85

# Step 1: ADASYN is asked to bring class 0 up to the 85% target while class 1
# keeps its current count.
ada = ADASYN(random_state=42,sampling_strategy={1:Counter(y)[1],0:round(no_majority)})
X_res, y_res = ada.fit_resample(X, y)
print('Resampled dataset shape After oversampling majority class %s' % Counter(y_res))

# Step 2: class 1 is randomly undersampled to the 15% target; class 0 keeps the
# count it has after step 1.
rnd = RandomUnderSampler(random_state=42,sampling_strategy={1:round(no_minority),0:Counter(y_res)[0]})
X_final, y_final = rnd.fit_resample(X_res, y_res)
# The label previously said "05:95" (copied from the 05:95 cell); this dataset
# targets 15:85.
print('Resampled dataset shape with 15:85 ratio%s' % Counter(y_final))

Resampled dataset shape After oversampling majority class Counter({0.0: 570, 1.0: 228})
Resampled dataset shape with 05:95 ratioCounter({0.0: 570, 1.0: 98})




In [18]:
# Save the 15:85 dataset: one CSV for features, one for labels, no index column.
exports = (
    (X_final, "pima_15285_imb_data.csv"),
    (y_final, "pima_15285_imb_label.csv"),
)
for frame, csv_name in exports:
    frame.to_csv(csv_name, index=False)

### GENERATING MOD_IMB DATASET(30:70)

In [19]:
# Begin again from the training split; print the class counts for reference.
X = x_train
y = y_train
print('Original dataset shape %s' % Counter(y))

Original dataset shape Counter({0.0: 424, 1.0: 228})


In [20]:
# Sizes a 30% / 70% split of the training-set row count would have.
# NOTE(review): neither value is used below — the class-0 target is the
# literal 562 and, unlike the 05:95 and 15:85 cells, there is no undersampling
# step. Confirm that 562 is the intended target for the 30:70 dataset.
class_counts = Counter(y)
n_total = class_counts[1] + class_counts[0]
no_minority, no_majority = n_total*0.3, n_total*0.7

# ADASYN is asked to bring class 0 up to 562 rows while class 1 keeps its count.
ada = ADASYN(
    random_state=42,
    sampling_strategy={1: class_counts[1], 0: 562},
)
X_res, y_res = ada.fit_resample(X, y)
print('Resampled dataset shape After oversampling majority class %s' % Counter(y_res))

Resampled dataset shape After oversampling majority class Counter({0.0: 570, 1.0: 228})




In [21]:
# Save the 30:70 dataset: features and labels go to separate CSVs, index omitted.
for frame, csv_name in (
    (X_res, "pima_mod_imb_data.csv"),
    (y_res, "pima_mod_imb_label.csv"),
):
    frame.to_csv(csv_name, index=False)

### GENERATING NO_IMB DATASET(50:50)

In [22]:
# Point X and y back at the training split and display its class balance.
X = x_train
y = y_train
print('Original dataset shape %s' % Counter(y))

Original dataset shape Counter({0.0: 424, 1.0: 228})


In [23]:
# ADASYN with its default sampling strategy and a fixed seed; the class counts
# after resampling are printed below.
ada = ADASYN(random_state=42)
resampled = ada.fit_resample(X, y)
X_res, y_res = resampled
print('Resampled dataset shape with 50:50 %s' % Counter(y_res))

Resampled dataset shape with 50:50 Counter({1.0: 424, 0.0: 424})


In [24]:
# Save the 50:50 dataset: one CSV for features, one for labels, no index column.
exports = (
    (X_res, "pima_no_imb_data.csv"),
    (y_res, "pima_no_imb_label.csv"),
)
for frame, csv_name in exports:
    frame.to_csv(csv_name, index=False)