In [17]:
import numpy as np
import pandas as pd
names = ["age","year_of_operation","pos_nodes","class"]
data = pd.read_csv("https://archive.ics.uci.edu/ml/machine-learning-databases/haberman/haberman.data",names=names)
data.drop(['year_of_operation'],inplace=True,axis=1)
data.head()

Unnamed: 0,age,pos_nodes,class
0,30,1,1
1,30,3,1
2,30,0,1
3,31,2,1
4,31,4,1


In [18]:
minority_class = data[data["class"]==2]

In [19]:
majority_class = data[data["class"]==1]

In [20]:
data = pd.concat([minority_class,majority_class],ignore_index=True)

In [21]:
data.head()

Unnamed: 0,age,pos_nodes,class
0,34,0,2
1,34,9,2
2,38,21,2
3,39,0,2
4,41,23,2


In [22]:
data["class"] = np.where(data["class"]==2,1,0)

In [23]:
data.head()

Unnamed: 0,age,pos_nodes,class
0,34,0,1
1,34,9,1
2,38,21,1
3,39,0,1
4,41,23,1


In [24]:
print(sum(data['class']==1),
sum(data['class']==0))

81 225


In [25]:
from sklearn.preprocessing import MinMaxScaler

In [26]:
scaler = MinMaxScaler()
model = scaler.fit(data)
data = pd.DataFrame(model.transform(data),columns=["age","pos_nodes","class"])
data.head()

Unnamed: 0,age,pos_nodes,class
0,0.075472,0.0,1.0
1,0.075472,0.173077,1.0
2,0.150943,0.403846,1.0
3,0.169811,0.0,1.0
4,0.207547,0.442308,1.0


### DATA RESAMPLING

### GENERATING TEST DATA

In [27]:
X = data.drop(['class'],axis=1)
y = data['class']

In [28]:
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test = train_test_split(X,y,test_size=0.15,random_state=42)

In [29]:
print(sum(y_train==1)
,sum(y_train==0)
,sum(y_test==1)
,sum(y_test==0)
     )

65 195 16 30


In [30]:
(sum(y_train==1)+sum(y_test==1))/(sum(y_train==0)+sum(y_test==0))
     

0.36

In [31]:
x_test.to_csv("haberman_test_data.csv",index=False)
y_test.to_csv("haberman_test_label.csv",index=False)

In [32]:
x_train.to_csv("haberman_train_data.csv",index=False)
y_train.to_csv("haberman_train_label.csv",index=False)

### GENERATING EXT_IMB DATASET(05:95)

In [16]:
from collections import Counter
from sklearn.datasets import make_classification
from imblearn.over_sampling import ADASYN 
from imblearn.under_sampling import RandomUnderSampler

In [17]:
X, y = x_train,y_train
print('Original dataset shape %s' % Counter(y))

Original dataset shape Counter({0.0: 195, 1.0: 65})


In [18]:
no_minority, no_majority = (Counter(y)[1]+Counter(y)[0])*0.05,(Counter(y)[1]+Counter(y)[0])*0.95
ada = ADASYN(random_state=42,sampling_strategy={1:Counter(y)[1],0:no_majority})
X_res, y_res = ada.fit_resample(X, y)
print('Resampled dataset shape After oversampling majority class %s' % Counter(y_res))

Resampled dataset shape After oversampling majority class Counter({0.0: 234, 1.0: 65})




In [19]:
X_res.to_csv("haberman_ext_imb_data.csv",index=False)
y_res.to_csv("haberman_ext_imb_label.csv",index=False)

### GENERATING 15285_IMB DATASET(15:85)

In [20]:
X, y = x_train,y_train
print('Original dataset shape %s' % Counter(y))

Original dataset shape Counter({0.0: 195, 1.0: 65})


In [21]:
no_minority, no_majority = (Counter(y)[1]+Counter(y)[0])*0.15,(Counter(y)[1]+Counter(y)[0])*0.85
ada = ADASYN(random_state=42,sampling_strategy={1:Counter(y)[1],0:round(no_majority)})
X_res, y_res = ada.fit_resample(X, y)
print('Resampled dataset shape After oversampling minority class %s' % Counter(y_res))
rnd = RandomUnderSampler(random_state=42,sampling_strategy={0:Counter(y_res)[0],1:round(no_minority)})
X_final, y_final = rnd.fit_resample(X_res, y_res)
print('Resampled dataset shape with 15:85 ratio%s' % Counter(y_final))

Resampled dataset shape After oversampling minority class Counter({0.0: 199, 1.0: 65})
Resampled dataset shape with 15:85 ratioCounter({0.0: 199, 1.0: 39})




In [22]:
X_final.to_csv("haberman_15285_imb_data.csv",index=False)
y_final.to_csv("haberman_15285_imb_label.csv",index=False)

### GENERATING MOD_IMB DATASET(30:70)

In [23]:
X, y = x_train,y_train
print('Original dataset shape %s' % Counter(y))

Original dataset shape Counter({0.0: 195, 1.0: 65})


In [24]:
no_minority, no_majority = (Counter(y)[1]+Counter(y)[0])*0.3,(Counter(y)[1]+Counter(y)[0])*0.7
rnd = RandomUnderSampler(random_state=42,sampling_strategy={0:round(no_majority),1:Counter(y_res)[1]})
X_final, y_final = rnd.fit_resample(X_res, y_res)
print('Resampled dataset shape with 30:70 ratio%s' % Counter(y_final))

Resampled dataset shape with 30:70 ratioCounter({0.0: 182, 1.0: 65})


In [25]:
X_final.to_csv("haberman_mod_imb_data.csv",index=False)
y_final.to_csv("haberman_mod_imb_label.csv",index=False)

### GENERATING NO_IMB DATASET(50:50)

In [26]:
X, y = x_train,y_train
print('Original dataset shape %s' % Counter(y))

Original dataset shape Counter({0.0: 195, 1.0: 65})


In [27]:
ada = ADASYN(random_state=42)
X_res, y_res = ada.fit_resample(X, y)
print('Resampled dataset shape with 50:50 %s' % Counter(y_res))

Resampled dataset shape with 50:50 Counter({0.0: 195, 1.0: 190})


In [28]:
X_res.to_csv("haberman_no_imb_data.csv",index=False)
y_res.to_csv("haberman_no_imb_label.csv",index=False)