In [3]:
import numpy as np
import pandas as pd
names = ["Sex", 
"Length",
"Diameter",
"Height",
"whole weight",
"Shucked weight",
"Viscera weight",
"Shell weight",
"Rings"]
data = pd.read_csv("https://archive.ics.uci.edu/ml/machine-learning-databases/abalone/abalone.data",
                  names = names)
data.head()

Unnamed: 0,Sex,Length,Diameter,Height,whole weight,Shucked weight,Viscera weight,Shell weight,Rings
0,M,0.455,0.365,0.095,0.514,0.2245,0.101,0.15,15
1,M,0.35,0.265,0.09,0.2255,0.0995,0.0485,0.07,7
2,F,0.53,0.42,0.135,0.677,0.2565,0.1415,0.21,9
3,M,0.44,0.365,0.125,0.516,0.2155,0.114,0.155,10
4,I,0.33,0.255,0.08,0.205,0.0895,0.0395,0.055,7


In [4]:
from collections import Counter
Counter(data["Sex"])

Counter({'M': 1528, 'F': 1307, 'I': 1342})

In [5]:
df_gender = pd.get_dummies(data['Sex'])
data = pd.concat([data, df_gender], axis=1)
data.drop(['Sex'],axis=1,inplace=True)

In [6]:
minority_class = data[data.Rings==18]

In [7]:
majority_class = data[data.Rings==9]

In [8]:
data = pd.concat([minority_class,majority_class],ignore_index=True)

In [9]:
columns=data.columns
data.head()

Unnamed: 0,Length,Diameter,Height,whole weight,Shucked weight,Viscera weight,Shell weight,Rings,F,I,M
0,0.665,0.525,0.165,1.338,0.5515,0.3575,0.35,18,0,0,1
1,0.71,0.54,0.165,1.959,0.7665,0.261,0.78,18,0,0,1
2,0.725,0.56,0.21,2.141,0.65,0.398,1.005,18,1,0,0
3,0.61,0.5,0.24,1.642,0.532,0.3345,0.69,18,0,0,1
4,0.58,0.455,0.155,0.8365,0.315,0.1385,0.32,18,1,0,0


In [10]:
data["Rings"] = np.where(data["Rings"]==18,1,0)

In [11]:
data.head()

Unnamed: 0,Length,Diameter,Height,whole weight,Shucked weight,Viscera weight,Shell weight,Rings,F,I,M
0,0.665,0.525,0.165,1.338,0.5515,0.3575,0.35,1,0,0,1
1,0.71,0.54,0.165,1.959,0.7665,0.261,0.78,1,0,0,1
2,0.725,0.56,0.21,2.141,0.65,0.398,1.005,1,1,0,0
3,0.61,0.5,0.24,1.642,0.532,0.3345,0.69,1,0,0,1
4,0.58,0.455,0.155,0.8365,0.315,0.1385,0.32,1,1,0,0


In [12]:
print(sum(data['Rings']==1),
sum(data['Rings']==0))

42 689


In [13]:
from sklearn.preprocessing import MinMaxScaler

In [14]:
scaler = MinMaxScaler()
model = scaler.fit(data)
data = pd.DataFrame(model.transform(data),columns=columns)
data.head()

Unnamed: 0,Length,Diameter,Height,whole weight,Shucked weight,Viscera weight,Shell weight,Rings,F,I,M
0,0.827957,0.842105,0.666667,0.550193,0.419048,0.641251,0.321244,1.0,0.0,0.0,1.0
1,0.924731,0.881579,0.666667,0.831634,0.597101,0.45259,0.766839,1.0,0.0,0.0,1.0
2,0.956989,0.934211,0.866667,0.914117,0.500621,0.72043,1.0,1.0,1.0,0.0,0.0
3,0.709677,0.776316,1.0,0.687967,0.402899,0.596285,0.673575,1.0,0.0,0.0,1.0
4,0.645161,0.657895,0.622222,0.32291,0.223188,0.213099,0.290155,1.0,1.0,0.0,0.0


### DATA RESAMPLING

### GENERATING TEST DATA

In [15]:
X = data.drop(['Rings'],axis=1)
y = data.Rings

In [16]:
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test = train_test_split(X,y,test_size=0.15,random_state=42)

In [17]:
print(sum(y_train==1)
,sum(y_train==0)
,sum(y_test==1)
,sum(y_test==0)
     )

36 585 6 104


In [18]:
(sum(y_train==1)+sum(y_test==1))/(sum(y_train==0)+sum(y_test==0))
     

0.06095791001451379

In [19]:
x_test.to_csv("abalone_test_data.csv",index=False)
y_test.to_csv("abalone_test_label.csv",index=False)

In [20]:
x_train.to_csv("abalone_train_data.csv",index=False)
y_train.to_csv("abalone_train_label.csv",index=False)

### GENERATING EXT_IMB DATASET(05:95)

In [18]:
from collections import Counter
from sklearn.datasets import make_classification
from imblearn.over_sampling import ADASYN 
from imblearn.under_sampling import RandomUnderSampler

In [19]:
X, y = x_train,y_train
print('Original dataset shape %s' % Counter(y))

Original dataset shape Counter({0.0: 585, 1.0: 36})


In [20]:
no_minority, no_majority = (Counter(y)[1]+Counter(y)[0])*0.05,(Counter(y)[1]+Counter(y)[0])*0.95
ada = ADASYN(random_state=42,sampling_strategy={1:Counter(y)[1],0:741})
X_res, y_res = ada.fit_resample(X, y)
print('Resampled dataset shape After oversampling majority class %s' % Counter(y_res))

Resampled dataset shape After oversampling majority class Counter({0.0: 743, 1.0: 36})




In [21]:
X_res.to_csv("abalone_ext_imb_data.csv",index=False)
y_res.to_csv("abalone_ext_imb_label.csv",index=False)

### GENERATING 15285_IMB DATASET(15:85)

In [22]:
X, y = x_train,y_train
print('Original dataset shape %s' % Counter(y))

Original dataset shape Counter({0.0: 585, 1.0: 36})


In [23]:
no_minority, no_majority = (Counter(y)[1]+Counter(y)[0])*0.15,(Counter(y)[1]+Counter(y)[0])*0.85
ada = ADASYN(random_state=42,sampling_strategy={0:Counter(y)[0],1:round(no_minority)})
X_res, y_res = ada.fit_resample(X, y)
print('Resampled dataset shape After oversampling minority class %s' % Counter(y_res))
rnd = RandomUnderSampler(random_state=42,sampling_strategy={0:round(no_majority),1:Counter(y_res)[1]})
X_final, y_final = rnd.fit_resample(X_res, y_res)
print('Resampled dataset shape with 15:85 ratio%s' % Counter(y_final))

Resampled dataset shape After oversampling minority class Counter({0.0: 585, 1.0: 90})
Resampled dataset shape with 15:85 ratioCounter({0.0: 528, 1.0: 90})


In [24]:
X_final.to_csv("abalone_15285_imb_data.csv",index=False)
y_final.to_csv("abalone_15285_imb_label.csv",index=False)

### GENERATING MOD_IMB DATASET(30:70)

In [25]:
X, y = x_train,y_train
print('Original dataset shape %s' % Counter(y))

Original dataset shape Counter({0.0: 585, 1.0: 36})


In [26]:
no_minority, no_majority = (Counter(y)[1]+Counter(y)[0])*0.3,(Counter(y)[1]+Counter(y)[0])*0.7
ada = ADASYN(random_state=42,sampling_strategy={0:Counter(y)[0],1:round(no_minority)})
X_res, y_res = ada.fit_resample(X, y)
print('Resampled dataset shape After oversampling minority class %s' % Counter(y_res))
rnd = RandomUnderSampler(random_state=42,sampling_strategy={0:round(no_majority),1:Counter(y_res)[1]})
X_final, y_final = rnd.fit_resample(X_res, y_res)
print('Resampled dataset shape with 30:70 ratio%s' % Counter(y_final))

Resampled dataset shape After oversampling minority class Counter({0.0: 585, 1.0: 190})
Resampled dataset shape with 30:70 ratioCounter({0.0: 435, 1.0: 190})


In [27]:
X_final.to_csv("abalone_mod_imb_data.csv",index=False)
y_final.to_csv("abalone_mod_imb_label.csv",index=False)

### GENERATING NO_IMB DATASET(50:50)

In [28]:
X, y = x_train,y_train
print('Original dataset shape %s' % Counter(y))

Original dataset shape Counter({0.0: 585, 1.0: 36})


In [29]:
ada = ADASYN(random_state=42)
X_res, y_res = ada.fit_resample(X, y)
print('Resampled dataset shape with 50:50 %s' % Counter(y_res))

Resampled dataset shape with 50:50 Counter({1.0: 587, 0.0: 585})


In [30]:
X_res.to_csv("abalone_no_imb_data.csv",index=False)
y_res.to_csv("abalone_no_imb_label.csv",index=False)