In [1]:
import sys
import os 

import pandas as pd
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import LabelEncoder, MinMaxScaler, StandardScaler

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from sklearn.model_selection import train_test_split

mpl.rc('axes', labelsize=14)
mpl.rc('xtick', labelsize=12)
mpl.rc('ytick', labelsize=12)

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning) 

In [2]:
# Figures are written to <project root>/images.
PROJECT_ROOT_DIR = "."
IMAGES_PATH = os.path.join(PROJECT_ROOT_DIR, "images")
# Create the output folder up front; exist_ok=True keeps the cell re-runnable.
os.makedirs(IMAGES_PATH, exist_ok=True)

def save_fig(fig_id, tight_layout=True, fig_extension="png", resolution=300):
    """Save the current matplotlib figure into IMAGES_PATH.

    Args:
        fig_id: Base file name without extension; also echoed in the log line.
        tight_layout: When true, ``plt.tight_layout()`` is applied before saving.
        fig_extension: File extension, also passed to ``savefig`` as the format.
        resolution: DPI passed to ``savefig``.
    """
    file_name = ".".join((fig_id, fig_extension))
    destination = os.path.join(IMAGES_PATH, file_name)
    print("그림 저장:", fig_id)
    if tight_layout:
        plt.tight_layout()
    plt.savefig(destination, format=fig_extension, dpi=resolution)

In [3]:
# Load the training set, the test set and the sample submission template.
train = pd.read_csv('data/train.csv')
test = pd.read_csv('data/test.csv')
submission = pd.read_csv('data/sample_submission.csv')

In [4]:
# Preview the first rows of the training data.
train.head()

Unnamed: 0,id,Gender,Lenght,Diameter,Height,Whole Weight,Shucked Weight,Viscra Weight,Shell Weight,Target
0,1,M,0.605,0.47,0.115,1.114,0.3925,0.291,0.31,15
1,2,I,0.43,0.315,0.095,0.378,0.175,0.08,0.1045,8
2,3,I,0.58,0.49,0.195,1.3165,0.5305,0.254,0.41,18
3,4,M,0.535,0.405,0.175,1.2705,0.548,0.3265,0.337,13
4,5,I,0.31,0.235,0.09,0.127,0.048,0.031,0.04,6


- id : 샘플 아이디
- Gender : 전복 성별
- Lenght : 전복 길이
- Diameter : 전복 지름
- Height : 전복 키
- Whole Weight : 전복 전체 무게
- Shucked Weight : 껍질을 제외한 무게
- Viscra Weight : 내장 무게
- Shell Weight : 껍질 무게
- Target : 전복 나이

In [5]:
# (rows, columns) of the training data.
train.shape

(1253, 10)

In [6]:
# Missing-value check
def check_missing_col(dataframe):
    """Report which columns of ``dataframe`` contain missing values.

    Prints two lines for every column that has at least one missing value,
    or a single "no missing values" line when no column has any.

    Args:
        dataframe: pandas DataFrame to inspect.

    Returns:
        list: One ``[column_name, dtype]`` pair per column with missing
        values, in column order. Empty when nothing is missing.
    """
    missing_col = []
    for col in dataframe.columns:
        missing_values = int(dataframe[col].isna().sum())
        if missing_values >= 1:
            print (f'결측치가 있는 컬럼은: {col}입니다')
            print (f'해당 컬럼에 총 {missing_values}개의 결측치가 존재합니다')
            missing_col.append([col, dataframe[col].dtype])
    # The summary and the return must run after ALL columns were scanned;
    # inside the loop they ended the scan after the first column.
    if not missing_col:
        print('결측치가 존재하지 않습니다')
    return missing_col

In [7]:
# Print every column name together with its positional index.
for i, col in enumerate(train.columns):
    print (i, col)

0 id
1 Gender
2 Lenght
3 Diameter
4 Height
5 Whole Weight
6 Shucked Weight
7 Viscra Weight
8 Shell Weight
9 Target


In [8]:
# Number of missing values in the 'id' column.
sum(train['id'].isna())

0

In [9]:
# Run the missing-value report on the training data and keep the returned list.
missing_col = check_missing_col(train)

결측치가 존재하지 않습니다


In [10]:
# Missing-value count per column.
train.isnull().sum()

id                0
Gender            0
Lenght            0
Diameter          0
Height            0
Whole Weight      0
Shucked Weight    0
Viscra Weight     0
Shell Weight      0
Target            0
dtype: int64

In [11]:
# Drop the sample id column.
# errors='ignore' keeps the cell re-runnable once 'id' is already gone;
# axis=1 was redundant because columns= is given.
train = train.drop(columns=['id'], errors='ignore')

In [None]:
# Distinct Target values in ascending order.
np.sort(train['Target'].unique())

In [None]:
# Per-Target non-null count of the first remaining column.
train.groupby('Target').count().iloc[:,0]

In [None]:
# Number of samples at each age (Target) value.
fig, ax = plt.subplots(1, figsize=(25,10))
# seaborn >= 0.12 takes `data` as its first positional argument, so the
# column has to be passed as x=...; ax=ax ties the plot to this figure.
sns.countplot(x='Target', data=train, ax=ax)
ax.set_title("Abalone age by count", fontsize=30)
ax.set_xlabel("target(age)", fontsize=15)
ax.set_ylabel("count", fontsize=15)
save_fig("Abalone age by count")
plt.show();

In [None]:
# Basic descriptive statistics
# Categorical data
# Visualise 'Gender' (start with its distinct values)
train['Gender'].unique()

In [None]:
# Per-Gender non-null count of the first remaining column.
train.groupby('Gender').count().iloc[:,0]

In [None]:
# Number of samples per Gender category.
plt.figure(figsize=(8,5))
# The column must be passed by keyword: seaborn >= 0.12 treats the first
# positional argument as `data`.
sns.countplot(x='Gender', data=train)
plt.title("Abalone gender by count", fontsize=30)
save_fig("Abalone gender by count")
plt.show();

In [None]:
# Age (Target) density for each Gender.
plt.figure(figsize=(25, 7))
# The column must be passed by keyword: seaborn >= 0.12 treats the first
# positional argument as `data`.
sns.kdeplot(x='Target', hue='Gender', data=train)
plt.title('Abalone age by gender', fontsize=30)
plt.xlabel('target(age)', fontsize=15)
plt.ylabel('Density', fontsize=15)
save_fig("Abalone age by gender")
plt.show()

In [None]:
# Numeric data: summary statistics (reused by the histogram cell for mean/median lines).
description = train.describe()
description

In [None]:
# Histograms of the numeric columns on a 2x4 grid, with mean and median marked.
interest_coloumns = train.columns[1:]
# plt.style.use('fivethirtyeight')
fig, ax = plt.subplots(2, 4, figsize=(25, 10))
fig.suptitle('Histogram of interesting features', fontsize=40)
# ax.flat walks the grid row by row, matching the column order.
for column_idx, panel in enumerate(ax.flat):
    column_name = interest_coloumns[column_idx]
    column_stats = description[column_name]
    mean_value = column_stats['mean']
    median_value = column_stats['50%']
    panel.hist(train[column_name], bins=30, color='#eaa18a', edgecolor='#7bcabf')
    panel.set_title(column_name)
    panel.axvline(mean_value, c='#f55354', label=f"mean = {round(mean_value, 2)}")
    panel.axvline(median_value, c='#518d7d', label=f"median = {round(median_value, 2)}")
    panel.legend()
save_fig("Histogram of interesting features")

In [None]:
def visualize(axx, yfield):
    """Draw a scatter plot with a fitted trend line of ``yfield`` against Target.

    Reads the notebook-level ``train`` frame, draws onto ``axx`` and uses
    ``yfield`` as the panel title.

    Args:
        axx: Matplotlib axes to draw on.
        yfield: Name of the ``train`` column plotted on the y axis.
    """
    trend_line_style = {'color': '#f55354'}
    sns.regplot(
        data=train,
        x='Target',
        y=yfield,
        color='#eaa18a',
        line_kws=trend_line_style,
        ax=axx,
    )
    axx.set_title(yfield)

In [None]:
# One regression panel per numeric feature against Target, on a 2x4 grid.
figure, axes = plt.subplots(nrows=2, ncols=4)
figure.set_size_inches(20,12)
figure.suptitle('Correlation between target and features', fontsize=40)

# Select the features by name so the cell does not depend on Gender being the
# first column and Target the last one.
feature_columns = [col for col in train.columns if col not in ('Gender', 'Target')]
# Pair panels with features directly instead of resolving 'ax1'..'ax8' through
# eval(); features beyond the eight available panels are not drawn.
for axis, column in zip(axes.flat, feature_columns):
    visualize(axis, column)
save_fig("Correlation between target and features")

In [None]:
# Pearson correlation matrix of the columns that remain after removing Gender.
train_corr = train.drop(columns=['Gender'])
corr = train_corr.corr(method='pearson')

In [None]:
# Annotated heatmap of the correlation matrix (values shown with two decimals).
plt.figure(figsize=(12, 10))
sns.heatmap(data=corr, annot=True, fmt='.2f', linewidths=.5, cmap='Blues')
plt.title('Correlation between features', fontsize=30)
save_fig('Correlation between features')
plt.show()

In [None]:
# Correlation of every column with Target, strongest first.
# unstack() flattens the matrix into a Series keyed by (column, row); selecting
# 'Target' keeps the Target column of the matrix.
unstacked = corr.unstack()
unstacked_df = pd.DataFrame(unstacked['Target'].sort_values(ascending=False), columns=['Target'])
# Colour the cells by value for quicker reading.
unstacked_df.style.background_gradient(cmap='viridis')

In [None]:
# Display the training frame as it stands after dropping 'id'.
train

In [None]:
# Distribution of four size/weight features per age, split by Gender.
# Each feature gets its own panel: drawing all four boxplots on one axes
# stacked them on top of each other and made the figure unreadable.
box_features = ['Lenght', 'Diameter', 'Height', 'Whole Weight']
fig, axes = plt.subplots(2, 2, figsize=(20,10))
for axis, feature in zip(axes.flat, box_features):
    sns.boxplot(x='Target', y=feature, hue='Gender', data=train, ax=axis)
    axis.set_title(feature)
fig.tight_layout()

## Modeling

In [25]:
# Preview the frame before feature engineering. The previous expression
# subscripted the `head` method itself (a TypeError); the recorded output
# is that of a plain head().
train.head()

Unnamed: 0,Gender,Lenght,Diameter,Height,Whole Weight,Shucked Weight,Viscra Weight,Shell Weight,Target
0,M,0.605,0.47,0.115,1.114,0.3925,0.291,0.31,15
1,I,0.43,0.315,0.095,0.378,0.175,0.08,0.1045,8
2,I,0.58,0.49,0.195,1.3165,0.5305,0.254,0.41,18
3,M,0.535,0.405,0.175,1.2705,0.548,0.3265,0.337,13
4,I,0.31,0.235,0.09,0.127,0.048,0.031,0.04,6


In [27]:
# Engineered feature 'water': whole weight minus the shell and shucked weights.
train['water'] = train['Whole Weight'] - (train['Shell Weight'] + train['Shucked Weight'])

In [28]:
# Same 'water' feature for the test data: whole weight minus the shell and shucked weights.
test['water'] = test['Whole Weight'] - (test['Shell Weight'] + test['Shucked Weight'])

In [12]:
# Preprocessors: integer encoder for Gender and a min-max feature scaler.
le = LabelEncoder()
minmax = MinMaxScaler()

In [13]:
# Standard-score feature scaler.
standard = StandardScaler()

In [29]:
# Label-encode Gender and build the model input frames.
# The encoder is fitted on the training data only and reused for the test data.
result = le.fit_transform(train['Gender'])
# pandas >= 2.0 no longer accepts the axis as a positional argument, so the
# columns to drop are named explicitly.
train_new = train.drop(columns=['Gender', 'Target'])
train_new['Gender_encoded'] = result

result = le.transform(test['Gender'])
test_new = test.drop(columns=['id', 'Gender'])
test_new['Gender_encoded'] = result
# Give the test frame exactly the training columns, in the same order, so a
# scaler fitted on train_new can be applied to it.
test_new = test_new[train_new.columns]

In [15]:
# # minmaxscaler
# scaled_train_new = minmax.fit_transform(train_new)
# scaled_test_new = minmax.transform(test_new)

In [30]:
# StandardScaler: fit on the training features, then apply the same scaling to the test features.
scaled_train_new = standard.fit_transform(train_new)
scaled_test_new = standard.transform(test_new)

In [31]:
# #pca
# from sklearn.decomposition import PCA

# pca = PCA(n_components=5)
# train_low = pca.fit_transform(scaled_train_new)
# test_low = pca.transform(scaled_test_new)

In [32]:
# Features are the standardised training matrix; the label is the Target column.
X = scaled_train_new
y = train['Target']

# Two successive splits with a fixed seed: hold out a test set first, then
# carve a validation set out of the remainder.
X_train_full, X_test, y_train_full, y_test = train_test_split(X, y, random_state=42)
X_train, X_valid, y_train, y_valid = train_test_split(X_train_full, y_train_full, random_state=42)

In [None]:
# Build the model: one hidden layer of 30 ReLU units and a single output unit.
# Seed TensorFlow's global RNG so weight initialisation and batch shuffling
# repeat across runs (42 matches the random_state of the data splits).
tf.random.set_seed(42)

model = keras.models.Sequential([
    keras.layers.Dense(30, activation="relu", input_shape=X_train.shape[1:]),
    keras.layers.Dense(1)
])

# Mean squared error with plain SGD; 20 epochs, monitored on the validation split.
model.compile(loss="mse", optimizer=keras.optimizers.SGD(learning_rate=1e-3))
history = model.fit(X_train, y_train, epochs=20, validation_data=(X_valid, y_valid))
# MSE on the held-out split, displayed as the cell output.
mse_test = model.evaluate(X_test, y_test)
mse_test

In [None]:
# Predict on the standardised test features. `test_low` only exists in the
# commented-out PCA experiment; the model was trained on features taken from
# scaled_train_new, so scaled_test_new is the matching input.
y_pred = model.predict(scaled_test_new).flatten()

In [None]:
# Round the predictions to whole numbers, cast to int and write the submission file.
# NOTE(review): every submission cell writes to data/submission.csv, so the last one run wins.
submission['Target'] = np.around(y_pred)
submission['Target'] = submission['Target'].astype('int')
submission.to_csv('data/submission.csv', index=False)

In [33]:
# Build the model: one hidden layer of 60 ReLU units and a single output unit.
# Seed TensorFlow's global RNG so weight initialisation and batch shuffling
# repeat across runs (42 matches the random_state of the data splits).
tf.random.set_seed(42)

model2 = keras.models.Sequential([
    keras.layers.Dense(60, activation="relu", input_shape=X_train.shape[1:]),
    keras.layers.Dense(1)
])

# Mean squared error with plain SGD; 50 epochs, monitored on the validation split.
model2.compile(loss="mse", optimizer=keras.optimizers.SGD(learning_rate=1e-3))
history = model2.fit(X_train, y_train, epochs=50, validation_data=(X_valid, y_valid))
# MSE on the held-out split, displayed as the cell output.
mse_test = model2.evaluate(X_test, y_test)
mse_test

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


4.558427810668945

In [34]:
# Predict on the standardised test features and flatten the result to one dimension.
y_pred = model2.predict(scaled_test_new).flatten()

In [35]:
# Round the predictions to whole numbers, cast to int and write the submission file.
# NOTE(review): every submission cell writes to data/submission.csv, so the last one run wins.
submission['Target'] = np.around(y_pred)
submission['Target'] = submission['Target'].astype('int')
submission.to_csv('data/submission.csv', index=False)

In [None]:
# 단위가 다르기 때문에 스케일링 진행 -> minmaxscaler
# 우선 모델링한 다음에 ols 진행
# 그 다음에 피처 엔지니어링 
# polynomial
# pca 

In [None]:
# 다중공선성

In [None]:
# One-hot encoded training features (Target removed).
# pandas >= 2.0 no longer accepts the axis as a positional argument, so the
# column to drop is named explicitly.
train_new2 = train.drop(columns=['Target'])
train_new2 = pd.get_dummies(train_new2)

In [None]:
# One-hot encoded test features (id removed).
# pandas >= 2.0 no longer accepts the axis as a positional argument, so the
# column to drop is named explicitly.
test_new2 = test.drop(columns=['id'])
test_new2 = pd.get_dummies(test_new2)
# Encoding train and test separately can produce different dummy columns.
# Align the test frame to the training columns: a category missing from the
# test data becomes an all-zero column, and the order matches train_new2.
test_new2 = test_new2.reindex(columns=train_new2.columns, fill_value=0)

In [None]:
# Features are the one-hot encoded frame; the label is the Target column.
X = train_new2
y = train['Target']

# Two successive splits with a fixed seed: hold out a test set first, then
# carve a validation set out of the remainder.
X_train_full, X_test, y_train_full, y_test = train_test_split(X, y, random_state=42)
X_train, X_valid, y_train, y_valid = train_test_split(X_train_full, y_train_full, random_state=42)

# Fit the min-max scaler on the training split only, then reuse it for the
# validation and held-out splits.
X_train = minmax.fit_transform(X_train)
X_valid = minmax.transform(X_valid)
X_test = minmax.transform(X_test)

In [None]:
# Scale the test features with the min-max scaler fitted on X_train.
# NOTE(review): this rebinds test_new2 to its scaled version, so running the
# cell twice scales it twice; re-run the cell that builds test_new2 first.
test_new2 = minmax.transform(test_new2)

In [None]:
# Build the model: one hidden layer of 60 ReLU units and a single output unit.
# Seed TensorFlow's global RNG so weight initialisation and batch shuffling
# repeat across runs (42 matches the random_state of the data splits).
tf.random.set_seed(42)

model = keras.models.Sequential([
    keras.layers.Dense(60, activation="relu", input_shape=X_train.shape[1:]),
    keras.layers.Dense(1)
])

# Mean squared error with plain SGD; 50 epochs, monitored on the validation split.
model.compile(loss="mse", optimizer=keras.optimizers.SGD(learning_rate=1e-3))
history = model.fit(X_train, y_train, epochs=50, validation_data=(X_valid, y_valid))
# MSE on the held-out split, displayed as the cell output.
mse_test = model.evaluate(X_test, y_test)
mse_test

In [None]:
# Predict on the scaled one-hot test features and flatten the result to one dimension.
y_pred = model.predict(test_new2).flatten()

In [None]:
# Round the predictions to whole numbers, cast to int and write the submission file.
# NOTE(review): every submission cell writes to data/submission.csv, so the last one run wins.
submission['Target'] = np.around(y_pred)
submission['Target'] = submission['Target'].astype('int')
submission.to_csv('data/submission.csv', index=False)

In [None]:
# 'water' column 추가 + standardscaling + model2