In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler

In [2]:
# Load the Dry Bean dataset from the working directory and preview the
# first rows to sanity-check the parsed columns.
csv_path = 'Dry_Bean_Dataset.csv'
data = pd.read_csv(csv_path)
data.head()

Unnamed: 0,Area,Perimeter,MajorAxisLength,MinorAxisLength,AspectRation,Eccentricity,ConvexArea,EquivDiameter,Extent,Solidity,roundness,Compactness,ShapeFactor1,ShapeFactor2,ShapeFactor3,ShapeFactor4,Class
0,28395,610.291,208.178117,173.888747,1.197191,0.549812,28715,190.141097,0.763923,0.988856,0.958027,0.913358,0.007332,0.003147,0.834222,0.998724,SEKER
1,28734,638.018,200.524796,182.734419,1.097356,0.411785,29172,191.272751,0.783968,0.984986,0.887034,0.953861,0.006979,0.003564,0.909851,0.99843,SEKER
2,29380,624.11,212.82613,175.931143,1.209713,0.562727,29690,193.410904,0.778113,0.989559,0.947849,0.908774,0.007244,0.003048,0.825871,0.999066,SEKER
3,30008,645.884,210.557999,182.516516,1.153638,0.498616,30724,195.467062,0.782681,0.976696,0.903936,0.928329,0.007017,0.003215,0.861794,0.994199,SEKER
4,30140,620.134,201.847882,190.279279,1.060798,0.33368,30417,195.896503,0.773098,0.990893,0.984877,0.970516,0.006697,0.003665,0.9419,0.999166,SEKER


In [3]:
# Print the frame summary (row count, per-column non-null counts and dtypes)
# to check for missing values and unexpected column types.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13611 entries, 0 to 13610
Data columns (total 17 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Area             13611 non-null  int64  
 1   Perimeter        13611 non-null  float64
 2   MajorAxisLength  13611 non-null  float64
 3   MinorAxisLength  13611 non-null  float64
 4   AspectRation     13611 non-null  float64
 5   Eccentricity     13611 non-null  float64
 6   ConvexArea       13611 non-null  int64  
 7   EquivDiameter    13611 non-null  float64
 8   Extent           13611 non-null  float64
 9   Solidity         13611 non-null  float64
 10  roundness        13611 non-null  float64
 11  Compactness      13611 non-null  float64
 12  ShapeFactor1     13611 non-null  float64
 13  ShapeFactor2     13611 non-null  float64
 14  ShapeFactor3     13611 non-null  float64
 15  ShapeFactor4     13611 non-null  float64
 16  Class            13611 non-null  object 
dtypes: float64(1

In [4]:
# Count how many samples each bean class has, most frequent first, to see how
# balanced the target is before splitting.
class_counts = data["Class"].value_counts()
class_counts

Class
DERMASON    3546
SIRA        2636
SEKER       2027
HOROZ       1928
CALI        1630
BARBUNYA    1322
BOMBAY       522
Name: count, dtype: int64

In [5]:
# Hold out 20% of the rows as a test set, stratified on "Class" so that both
# partitions keep the class proportions of the full dataset.
split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
# n_splits=1, so the generator yields exactly one (train, test) index pair.
train_index, test_index = next(split.split(data, data["Class"]))
# split() yields positional indices, so rows are selected by position (.iloc);
# label-based .loc is only equivalent while `data` has a default RangeIndex.
strat_train_set = data.iloc[train_index]
strat_test_set = data.iloc[test_index]
# Explore and transform a copy so the training partition itself stays intact.
beans = strat_train_set.copy()

In [6]:
# Replace the string class names in `beans` with integer codes. The fitted
# encoder is kept in `le` so the name <-> code mapping stays available.
le = LabelEncoder()
encoded_class = le.fit_transform(beans["Class"])
beans["Class"] = encoded_class
beans["Class"]

1453    5
6470    4
4639    2
4655    2
1936    5
       ..
2172    0
5291    2
2477    0
6431    4
3600    1
Name: Class, Length: 10888, dtype: int64

In [7]:
# Pairwise correlation between all columns of `beans`, then rank every column
# by its correlation with the "Class" column (strongest positive first).
corr_matrix = beans.corr()
class_corr = corr_matrix["Class"]
class_corr.sort_values(ascending=False)

Class              1.000000
ShapeFactor1       0.389957
roundness          0.385472
ShapeFactor2       0.334449
Solidity           0.323563
ShapeFactor3       0.168400
ShapeFactor4       0.163410
Compactness        0.156856
Extent            -0.031167
AspectRation      -0.116804
Eccentricity      -0.201311
MajorAxisLength   -0.454153
MinorAxisLength   -0.456568
Area              -0.474206
ConvexArea        -0.476492
EquivDiameter     -0.479618
Perimeter         -0.506347
Name: Class, dtype: float64

In [11]:
# Rescale every column of `beans` linearly into the range [-1, 1].
# NOTE(review): `beans` still contains the "Class" column at this point, so
# the target is rescaled together with the features -- confirm this is
# intended before using `beans_scaled` for model training.
min_max = MinMaxScaler(feature_range=(-1, 1))
beans_scaled = min_max.fit_transform(beans)
# fit_transform returns a bare NumPy array; re-attach the original column
# names and row index so the summary below is readable and the rows still
# align with `beans`.
beans_scaled_df = pd.DataFrame(
    beans_scaled, columns=beans.columns, index=beans.index
)
beans_scaled_df.describe()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16
count,10888.0,10888.0,10888.0,10888.0,10888.0,10888.0,10888.0,10888.0,10888.0,10888.0,10888.0,10888.0,10888.0,10888.0,10888.0,10888.0,10888.0
mean,-0.714057,-0.518252,-0.497223,-0.527911,-0.205519,0.536356,-0.713073,-0.542572,0.248863,0.799912,0.531526,-0.081118,-0.012898,-0.257871,-0.173429,0.820094,0.177566
std,0.256911,0.312252,0.315436,0.266335,0.351117,0.265397,0.258114,0.294868,0.315524,0.12442,0.237653,0.355854,0.293833,0.38499,0.350587,0.168462,0.609556
min,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0
25%,-0.860494,-0.739201,-0.743314,-0.684897,-0.419196,0.436334,-0.860899,-0.731719,0.048571,0.761069,0.36664,-0.296019,-0.185188,-0.620526,-0.393183,0.768229,-0.333333
50%,-0.787328,-0.605714,-0.581668,-0.585875,-0.251516,0.5752,-0.787112,-0.614976,0.312563,0.830157,0.57061,-0.072839,0.007899,-0.274078,-0.178771,0.870772,0.0
75%,-0.640879,-0.340435,-0.290889,-0.441732,-0.030709,0.707626,-0.638115,-0.410379,0.487362,0.876272,0.705156,0.115371,0.172142,0.035489,0.010292,0.928503,0.666667
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
