### Import Package

In [20]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

### Import Data

In [21]:
df = pd.read_csv("data_abalone.csv", index_col='Unnamed: 0')
df.head()

Unnamed: 0,sex,length,diameter,height,whole_weight,shucked_weight,viscera_weight,shell_weight,rings
0,M,0.455,0.365,0.095,0.514,0.2245,0.101,0.15,15
1,M,0.35,0.265,0.09,0.2255,0.0995,0.0485,0.07,7
2,F,0.53,0.42,0.135,0.677,0.2565,0.1415,0.21,9
3,M,0.44,0.365,0.125,0.516,0.2155,0.114,0.155,10
4,I,0.33,0.255,0.08,0.205,0.0895,0.0395,0.055,7


mengambil index kolom 'Unnamed: 0' sebagai index pada dataset

### Melihat jumlah baris dan kolom

In [22]:
df.shape

(4177, 9)

terdapat 4177 baris dan 9 kolom<br>
terdapat 8 kolom fitur dan 1 kolom target 

### Melihat data yang kosong

In [23]:
df.replace("?", np.nan, inplace = True)
df.head()

Unnamed: 0,sex,length,diameter,height,whole_weight,shucked_weight,viscera_weight,shell_weight,rings
0,M,0.455,0.365,0.095,0.514,0.2245,0.101,0.15,15
1,M,0.35,0.265,0.09,0.2255,0.0995,0.0485,0.07,7
2,F,0.53,0.42,0.135,0.677,0.2565,0.1415,0.21,9
3,M,0.44,0.365,0.125,0.516,0.2155,0.114,0.155,10
4,I,0.33,0.255,0.08,0.205,0.0895,0.0395,0.055,7


In [24]:
df.isnull().any()

sex               False
length            False
diameter          False
height            False
whole_weight      False
shucked_weight    False
viscera_weight    False
shell_weight      False
rings             False
dtype: bool

In [25]:
df.isnull().sum()

sex               0
length            0
diameter          0
height            0
whole_weight      0
shucked_weight    0
viscera_weight    0
shell_weight      0
rings             0
dtype: int64

terdapat data yang kosong pada kolom horsepower

### Melihat tipe data

In [26]:
df.dtypes

sex                object
length            float64
diameter          float64
height            float64
whole_weight      float64
shucked_weight    float64
viscera_weight    float64
shell_weight      float64
rings               int64
dtype: object

### Merubah tipe data sesuai dengan kolom dan deskripsi

In [27]:
df.head()

Unnamed: 0,sex,length,diameter,height,whole_weight,shucked_weight,viscera_weight,shell_weight,rings
0,M,0.455,0.365,0.095,0.514,0.2245,0.101,0.15,15
1,M,0.35,0.265,0.09,0.2255,0.0995,0.0485,0.07,7
2,F,0.53,0.42,0.135,0.677,0.2565,0.1415,0.21,9
3,M,0.44,0.365,0.125,0.516,0.2155,0.114,0.155,10
4,I,0.33,0.255,0.08,0.205,0.0895,0.0395,0.055,7


karena tipe data sudah sesuai maka melanjutkan proses encoding

tipe data pada setiap kolom sudah sesuai dengan deskripsi

### Melihat Statistik Kolom Numerik

In [28]:
df.describe()

Unnamed: 0,length,diameter,height,whole_weight,shucked_weight,viscera_weight,shell_weight,rings
count,4177.0,4177.0,4177.0,4177.0,4177.0,4177.0,4177.0,4177.0
mean,0.523992,0.407881,0.139516,0.828742,0.359367,0.180594,0.238831,9.933684
std,0.120093,0.09924,0.041827,0.490389,0.221963,0.109614,0.139203,3.224169
min,0.075,0.055,0.0,0.002,0.001,0.0005,0.0015,1.0
25%,0.45,0.35,0.115,0.4415,0.186,0.0935,0.13,8.0
50%,0.545,0.425,0.14,0.7995,0.336,0.171,0.234,9.0
75%,0.615,0.48,0.165,1.153,0.502,0.253,0.329,11.0
max,0.815,0.65,1.13,2.8255,1.488,0.76,1.005,29.0


### Mengisi data yang kosong

In [29]:
df.isnull().any()

sex               False
length            False
diameter          False
height            False
whole_weight      False
shucked_weight    False
viscera_weight    False
shell_weight      False
rings             False
dtype: bool

sudah tidak terdapat data yang kosong pada masing-masing kolom

### Melihat Duplikasi Data

In [30]:
df.duplicated().sum()

0

tidak terdapat data yg duplikat

### Melakukan Label Encoding pada kolom kategorikal

In [31]:
df['sex'].value_counts()

M    1528
I    1342
F    1307
Name: sex, dtype: int64

In [32]:
from sklearn.preprocessing import LabelEncoder

encoding = LabelEncoder()
df['sex'] = encoding.fit_transform(df['sex'])

df.head()

Unnamed: 0,sex,length,diameter,height,whole_weight,shucked_weight,viscera_weight,shell_weight,rings
0,2,0.455,0.365,0.095,0.514,0.2245,0.101,0.15,15
1,2,0.35,0.265,0.09,0.2255,0.0995,0.0485,0.07,7
2,0,0.53,0.42,0.135,0.677,0.2565,0.1415,0.21,9
3,2,0.44,0.365,0.125,0.516,0.2155,0.114,0.155,10
4,1,0.33,0.255,0.08,0.205,0.0895,0.0395,0.055,7


melakukan proses encoding untuk data fitur berupa teks ataupun kategorikal<br>
agar dapat diproses oleh mesin serta memudahkan dalam melakukan labeling pada data kategorikal

### Melihat tipe data setiap kolom

In [33]:
df.dtypes

sex                 int32
length            float64
diameter          float64
height            float64
whole_weight      float64
shucked_weight    float64
viscera_weight    float64
shell_weight      float64
rings               int64
dtype: object

tipe data sudah sesuai untuk semua kolom

### Melakukan Scaling menggunakan MinMax Scaler untuk kolom numerik

In [34]:
df.head()

Unnamed: 0,sex,length,diameter,height,whole_weight,shucked_weight,viscera_weight,shell_weight,rings
0,2,0.455,0.365,0.095,0.514,0.2245,0.101,0.15,15
1,2,0.35,0.265,0.09,0.2255,0.0995,0.0485,0.07,7
2,0,0.53,0.42,0.135,0.677,0.2565,0.1415,0.21,9
3,2,0.44,0.365,0.125,0.516,0.2155,0.114,0.155,10
4,1,0.33,0.255,0.08,0.205,0.0895,0.0395,0.055,7


In [35]:
from sklearn.preprocessing import MinMaxScaler
numeric = ["length", "diameter", "height", "whole_weight", 'shucked_weight', 'viscera_weight', 'shell_weight']
scaler = MinMaxScaler(feature_range=(0, 1))

df_1 = df.copy()

scaler.fit(df_1.loc[:, df_1[numeric].columns])
df_1.loc[:, df_1[numeric].columns] = scaler.transform(df_1.loc[:, df_1[numeric].columns])
df_1.head()

Unnamed: 0,sex,length,diameter,height,whole_weight,shucked_weight,viscera_weight,shell_weight,rings
0,2,0.513514,0.521008,0.084071,0.181335,0.150303,0.132324,0.147982,15
1,2,0.371622,0.352941,0.079646,0.079157,0.066241,0.063199,0.068261,7
2,0,0.614865,0.613445,0.119469,0.239065,0.171822,0.185648,0.207773,9
3,2,0.493243,0.521008,0.110619,0.182044,0.14425,0.14944,0.152965,10
4,1,0.344595,0.336134,0.070796,0.071897,0.059516,0.05135,0.053313,7


melakukan normalisasi minmax untuk semua kolom fitur numerikal 

### Dataset Splitting

In [36]:
X = df_1.drop(columns=["rings"])
y = df_1["rings"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

tidak memakai stratify karena bukan merupakan klasifikasi dan kolom target berupa numerikal<br>
data training 80%, data testing 20%

### Fitting Dataset menggunakan Linear Regression

In [37]:
from sklearn.linear_model import LinearRegression

model = LinearRegression()
model.fit(X_train, y_train)

print('Training Score\t= ', model.score(X_train, y_train), '\nTesting Score\t= ', model.score(X_test, y_test))

Training Score	=  0.5266943382158247 
Testing Score	=  0.5300147524184928


### Melihat nilai Evaluasi data

In [38]:
hasil = model.predict(X_test)
from sklearn.metrics import mean_squared_error, r2_score

print('MSE\t= ', mean_squared_error(y_test, hasil), '\nR2\t= ', r2_score(y_test, hasil))

MSE	=  5.104186010193347 
R2	=  0.5300147524184928


### Menggunakan Column Transformer

In [54]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler

numeric = ["length", "diameter", "height", "whole_weight", 'shucked_weight', 'viscera_weight', 'shell_weight']

df_2 = df.copy()

preprocessor = ColumnTransformer([
    ('numeric', MinMaxScaler(feature_range=(0, 1)), numeric),
])

In [45]:
parameter = {
    'algo__fit_intercept': [True, False],
}

In [56]:
from sklearn.model_selection import GridSearchCV
from imblearn.pipeline import Pipeline

pipeline = Pipeline([
    ('prep', preprocessor),
    ('algo', LinearRegression()),
])

model_linreg = GridSearchCV(pipeline, parameter, cv=3, n_jobs=-1, verbose=1)
model_linreg.fit(X_train, y_train)

print(model_linreg.best_params_)
print(model_linreg.score(X_train, y_train), model_linreg.best_score_, model_linreg.score(X_test, y_test))

Fitting 3 folds for each of 2 candidates, totalling 6 fits
{'algo__fit_intercept': True}
0.5263511332802138 0.5152170944550403 0.530088810705066


In [57]:
hasil = model_linreg.predict(X_test)
from sklearn.metrics import mean_squared_error, r2_score

print('MSE\t= ', mean_squared_error(y_test, hasil), '\nR2\t= ', r2_score(y_test, hasil))

MSE	=  5.103381714160203 
R2	=  0.530088810705066


### Prediksi Data

In [58]:
df.head()

Unnamed: 0,sex,length,diameter,height,whole_weight,shucked_weight,viscera_weight,shell_weight,rings
0,2,0.455,0.365,0.095,0.514,0.2245,0.101,0.15,15
1,2,0.35,0.265,0.09,0.2255,0.0995,0.0485,0.07,7
2,0,0.53,0.42,0.135,0.677,0.2565,0.1415,0.21,9
3,2,0.44,0.365,0.125,0.516,0.2155,0.114,0.155,10
4,1,0.33,0.255,0.08,0.205,0.0895,0.0395,0.055,7


In [62]:
pred_rings_1 = {
    'sex': ['2'],
    'length': [0.455],
    'diameter': [0.365],
    'height': [0.095],
    'whole_weight': [0.5140],
    'shucked_weight': [0.2245],
    'viscera_weight': [0.1010],
    'shell_weight': [0.150],
}

pred_rings_2 = {
    'sex': ['2'],
    'length': [0.350],
    'diameter': [0.265],
    'height': [0.090],
    'whole_weight': [0.2255],
    'shucked_weight': [0.0995],
    'viscera_weight': [0.0485],
    'shell_weight': [0.070],
}

In [64]:
def manual_testing(rings):
    df_pred = pd.DataFrame(rings)
    pred_linreg = model_linreg.predict(df_pred)

    return print('Predicted Rings\t= ', pred_linreg)

In [63]:
manual_testing(pred_rings_1)

Predicted Rings	=  [14.81424294]


In [65]:
manual_testing(pred_rings_2)

Predicted Rings	=  [9.76647638]
