## 1. 라이브러리 및 데이터 불러오기

In [None]:
# Get familiar with the data by typing the code out by hand.
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler

from sklearn.ensemble import RandomForestRegressor # Random Forest regression model

import warnings
# Suppress all warnings from here on.
warnings.filterwarnings("ignore")
# Default seaborn palette; indexed later to color the bar charts.
color = sns.color_palette()

In [None]:
# Read the zipped competition CSVs and report (rows, columns) of each frame.
train_path = "../input/mercedes-benz-greener-manufacturing/train.csv.zip"
test_path = "../input/mercedes-benz-greener-manufacturing/test.csv.zip"

train_df = pd.read_csv(train_path)
test_df = pd.read_csv(test_path)
print(train_df.shape, test_df.shape)

In [None]:
# Preview the first rows of the training data.
train_df.head()

#### Target feature plot

In [None]:
# Plot the target values in ascending order against their rank, so the
# largest values end up at the right edge of the chart.
sorted_target = np.sort(train_df.y.values)
sample_rank = range(train_df.shape[0])

plt.figure(figsize=(8, 6))
plt.scatter(sample_rank, sorted_target)
plt.title("Target Variable: 'y'", fontsize=15)
plt.xlabel('index', fontsize=12)
plt.ylabel('y', fontsize=12)
plt.show()

In [None]:
# Cap the target at `ulimit`, then draw a 50-bin histogram of the capped values.
ulimit = 180
# Assign through a single .loc call on the DataFrame itself. The chained form
# `train_df['y'].loc[mask] = ...` may write to a temporary object and leave
# train_df untouched (it never updates train_df under pandas copy-on-write).
train_df.loc[train_df['y'] > ulimit, 'y'] = ulimit

plt.figure(figsize=(12, 8))
# NOTE(review): sns.distplot is deprecated in recent seaborn releases;
# consider sns.histplot once the target environment is confirmed.
sns.distplot(train_df.y.values, bins=50, kde=False)
plt.xlabel('y value', fontsize=12)
plt.title("Histogram of Target Feature", fontsize=15)
plt.show()

In [None]:
# Print min / max / mean / std of the target, then how many values exceed 180.
y_values = train_df['y'].values
summary = (min(y_values), max(y_values), y_values.mean(), y_values.std())
print('최소값: {} 최대값: {} 평균값: {} 표준편차: {}'.format(*summary))

over_limit = y_values > 180
print('180보다 큰 숫자들 개수: {}'.format(np.sum(over_limit)))

## 2. EDA

- train 데이터에 대한 EDA를 진행합니다.

- 각 feature들이 어떤 특징을 가지고 있는지 확인하고, 분석합니다.

- 데이터와 친해집니다.

In [None]:
# One row per column of train_df, paired with that column's dtype.
dtype_df = train_df.dtypes.reset_index()
dtype_df.columns = ["Count", "Column Type"]

# Number of columns per dtype.
# object dtype holds strings, so those columns are likely categorical features.
dtype_df.groupby("Column Type").count().reset_index()

In [None]:
# Rows labelled 0 through 10 of the dtype table (.loc slicing includes the end label).
dtype_df.loc[:10,:]

In [None]:
# Count missing values per column and keep only the columns that have any,
# ordered by how many values are missing.
null_counts = train_df.isnull().sum(axis=0)
missing_df = null_counts.reset_index()
missing_df.columns = ['column_name', 'missing_count']

has_missing = missing_df['missing_count'] > 0
missing_df = missing_df.loc[has_missing].sort_values(by='missing_count')
missing_df  # an empty frame means there are no missing values

In [None]:
# Feature columns are the ones whose name contains the letter 'X'.
cols = [name for name in train_df.columns if 'X' in name]

print('Number of features: {}'.format(len(cols)))
print('Feature types:')
# How many feature columns there are of each dtype.
train_df.dtypes[cols].value_counts()

In [None]:
# Sort every feature column into one of three buckets:
#   counts[0] - columns with a single unique value
#   counts[1] - int64 columns with exactly two unique values
#   counts[2] - everything else (treated as categorical below)
counts = [[], [], []]
for col in cols:
    _type = train_df[col].dtype
    unique = len(np.unique(train_df[col]))
    if unique == 1:
        counts[0].append(col)
    elif unique == 2 and _type == np.int64:
        counts[1].append(col)
    else:
        counts[2].append(col)

print('Feature 값이 1개인 경우 : {} Feature 값이 2개인 경우: {} 범주형 Feature 인 경우: {}\n'.format(*[len(c) for c in counts]))

print('Feature 값이 1개인 경우: ', counts[0])
# counts[2] is the categorical bucket, so the label must say so; the original
# label claimed these were the two-valued features.
print('범주형 Feature 인 경우: ', counts[2])

In [None]:
# Group columns by the exact (sorted) set of values they contain, skipping the
# id, the target and the eight categorical columns, then print each group.
excluded = ["ID", "y", "X0", "X1", "X2", "X3", "X4", "X5", "X6", "X8"]

unique_values_dict = {}
for col in train_df.columns:
    if col in excluded:
        continue
    # The stringified sorted value list serves as the grouping key.
    key = str(np.sort(train_df[col].unique()).tolist())
    unique_values_dict.setdefault(key, []).append(col)

for key, members in unique_values_dict.items():
    print("컬럼에 존재하는 유일한 값들 : ", key)
    print(members)
    print("-------------------------------------")

#### Categorical Features

In [None]:
# Take the third bucket built earlier and preview those columns.
cat_feat = counts[2] # the 8 object-typed columns
train_df[cat_feat].head()

In [None]:
# Compute the mean of every two-valued feature, sort the features by that mean
# in ascending order, and draw them as horizontal bars spread over three
# side-by-side panels of near-equal size.
binary_means = [np.mean(train_df[col]) for col in counts[1]]
# Apply one ordering to both names and means so the pairs stay aligned.
order = np.argsort(binary_means)
binary_names = np.array(counts[1])[order]
binary_means = np.asarray(binary_means)[order]

n_panels = 3
fig, ax = plt.subplots(1, n_panels, figsize=(12, 30))
ax[0].set_ylabel("Feature Name")
ax[1].set_title("Mean value of 2 unique values")

# array_split divides whatever number of features exists; a fixed chunk size
# of 119 would silently drop every feature past the 357th.
name_chunks = np.array_split(binary_names, n_panels)
mean_chunks = np.array_split(binary_means, n_panels)
for i, (names, means) in enumerate(zip(name_chunks, mean_chunks)):
    positions = range(len(means))
    ax[i].barh(positions, means, color=color[2])
    ax[i].set_xlabel("Mean value")
    ax[i].set_yticks(positions)
    ax[i].set_yticklabels(names, rotation='horizontal')
plt.show()

#### Label Encoding --> Ordinal Encoding

In [None]:
# Replace each categorical column in train_df with integer codes, using a
# fresh LabelEncoder per column.
for feature in ("X0", "X1", "X2", "X3", "X4", "X5", "X6", "X8"):
    encoder = LabelEncoder()
    raw_values = list(train_df[feature].values)
    train_df[feature] = encoder.fit_transform(raw_values)

In [None]:
# First 10 columns after encoding (ordinal encoding, in alphabetical order).
train_df.iloc[:, :10]

In [None]:
# Separate the target column from the features; the ID column is not a feature.
y_train = train_df["y"]
X_train = train_df.drop(columns=["ID", "y"])

### One-hot Encoding

In [None]:
# Trade-off between the two encodings:
#   one-hot - can greatly increase the number of features
#   ordinal - can introduce a numeric order that was never intended
categorical_columns = ["X0", "X1", "X2", "X3", "X4", "X5", "X6", "X8"]

train_OHE = pd.get_dummies(train_df, columns=categorical_columns)
train_OHE

In [None]:
# Rebuild the feature matrix and target from the one-hot encoded frame.
y_train = train_OHE["y"]
X_train = train_OHE.drop(columns=["ID", "y"])

## 3. Training with RandomForest

- Feature Importance라는 값을 확인하기 위해서!

In [None]:
from sklearn import ensemble

# Hyper-parameters of the forest; random_state is fixed for repeatable fits.
rf_params = dict(
    n_estimators=200,
    max_depth=10,
    min_samples_leaf=4,
    max_features=0.2,
    n_jobs=-1,
    random_state=0,
)
model = ensemble.RandomForestRegressor(**rf_params)
model.fit(X_train, y_train)

#### Feature Importance

- RandomForest 모델이 실제값(target value)을 예측하는 상황에서, 예측에 크게 영향을 주는 변수들을 찾는 방법.

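- 참고: 아래 코드에서 사용하는 scikit-learn의 `feature_importances_`는 트리 분할 시의 불순도 감소량(impurity decrease)을 기반으로 계산된 중요도이다. 어떤 변수(feature)의 값을 위-아래로 뒤섞어(permutation) 보면서 성능 편차가 크게 나는 변수에는 값을 크게, 그렇지 않은 변수에는 값을 작게 부여하는 방식은 permutation importance(`sklearn.inspection.permutation_importance`)라고 부르는 별도의 방법이다.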

In [None]:
# Bar chart of the 20 largest feature importances of the fitted forest.
# (The per-tree standard deviation that used to be computed here was never
# used by the plot, so that pass over all estimators has been removed.)
top_n = 20
feat_names = X_train.columns.values
importances = model.feature_importances_
# Indices of the features, most important first, truncated to the top `top_n`.
indices = np.argsort(importances)[::-1][:top_n]

plt.figure(figsize=(12, 12))
plt.title("Feature importances")
plt.bar(range(len(indices)), importances[indices], color="r", align="center")
plt.xticks(range(len(indices)), feat_names[indices], rotation='vertical')
plt.xlim([-1, len(indices)])
plt.show()

## 4. Perform dimensionality reduction

1) PCA(Principal Component Analysis)



2) AutoEncoder(최근에 많이 사용하는 딥러닝 기법)

### 4-1. Dimensionality Reduction by PCA

In [None]:
# PCA implementation from scikit-learn.
from sklearn.decomposition import PCA

# Feature matrix (high-dimensional data) and target from the one-hot encoded frame.
y = train_OHE["y"]
X = train_OHE.drop(columns=["ID", "y"])

# Reduce to 10 dimensions, i.e. keep 10 eigenvectors as the basis of the
# lower-dimensional space.
pca = PCA(n_components=10)
pca_10 = pca.fit_transform(X)
pca_10

#### PCA를 수행할 때 몇 차원으로 감소시켜야 할까요?


In [None]:
# Reduce the training features with PCA (only the training matrix X is
# transformed in this cell).
# A fractional n_components asks for as many components as are needed to keep
# 90% of the variance of the original data.
pca = PCA(n_components=0.90)
pca.fit(X)        # fit only; fit() returns the estimator itself
pca_090 = pca
X_pca_090 = pca_090.transform(X)  # the actual projection
X_pca_090.shape

In [None]:
# Column labels "PC1", "PC2", ... - one per retained principal component.
n_components_kept = X_pca_090.shape[1]
labels = ["PC" + str(idx + 1) for idx in range(n_components_kept)]

In [None]:
# Cumulative explained variance in percent, rounded to one decimal place.
cumulative_ratio = np.cumsum(pca_090.explained_variance_ratio_)
pca_090_variance = np.round(cumulative_ratio * 100, decimals=1)
pca_090_variance

In [None]:
# Draw a scree plot: one bar per principal component, with the cumulative
# percentage of explained variance as the bar height.
axis_label_style = {'color': 'tab:orange', 'fontsize': 15}
component_positions = range(1, len(pca_090_variance) + 1)

plt.figure(figsize=(25, 5))
plt.bar(x=component_positions, height=pca_090_variance, tick_label=labels)

plt.title('Scree Plot', color='tab:orange', fontsize=25)
plt.xlabel('Principal Components', fontdict=axis_label_style)
plt.ylabel('Cumulative percentage of explained variance ', fontdict=axis_label_style)
plt.xticks(rotation=90, color='indigo', size=15)
plt.yticks(rotation=0, color='indigo', size=15)
plt.show()

In [None]:
# Wrap the projected matrix in a DataFrame with one named column per component.
X_train_pca_df = pd.DataFrame(data=X_pca_090, columns=labels)
X_train_pca_df

In [None]:
# Scatter plot of the samples on the first two principal components, with each
# axis labelled by the share of variance that component explains.
# pca_090_variance is cumulative, so its second entry is PC1 + PC2; the
# per-component percentages are taken from explained_variance_ratio_ instead.
pc_share = np.round(pca_090.explained_variance_ratio_ * 100, decimals=1)
axis_label_style = {'color': 'tab:orange', 'fontsize': 15}

plt.figure(figsize=(12, 12))
plt.title('PCA Plot', color='tab:orange', fontsize=20)
plt.scatter(X_train_pca_df.PC1, X_train_pca_df.PC2)
plt.xticks(rotation=90, color='indigo', size=15)
plt.yticks(rotation=0, color='indigo', size=15)
plt.xlabel('PC1 - {0}%'.format(pc_share[0]), axis_label_style)
plt.ylabel('PC2 - {0}%'.format(pc_share[1]), axis_label_style)
plt.show()

### 4-2. Dimensionality Reduction by AutoEncoder

(with keras)

In [None]:
from time import time
import os
import pickle

import numpy as np
import tensorflow as tf

from tensorflow.keras.datasets import fashion_mnist
from tensorflow.keras.layers import Input, Dense, Flatten, Reshape
from tensorflow.keras.models import Model
from tensorflow.keras.losses import MeanSquaredError
from tensorflow.keras.optimizers import SGD

import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
class Autoencoder(Model):
    """Dense autoencoder with one encoding layer and one decoding layer.

    The encoder maps an ``input_dim``-wide vector to ``latent_dim`` units and
    the decoder maps it back to ``input_dim`` units; both layers use ReLU.

    Args:
        input_dim: Width of the input (and of the reconstruction).
        latent_dim: Width of the encoded representation.
    """

    def __init__(self, input_dim, latent_dim):
        super().__init__()
        self.input_dim = input_dim
        self.latent_dim = latent_dim

        encoder_layers = [
            Input(shape=(input_dim, )),
            Dense(latent_dim, activation="relu"),
        ]
        self.encoder = tf.keras.Sequential(encoder_layers)

        decoder_layers = [Dense(input_dim, activation='relu')]
        self.decoder = tf.keras.Sequential(decoder_layers)

    def call(self, x):
        """Return the reconstruction of ``x``: encode, then decode."""
        return self.decoder(self.encoder(x))

In [None]:
# Shape of the feature matrix; its second entry is passed as the autoencoder's input_dim below.
X.shape

In [None]:
# Build the autoencoder (50-unit bottleneck) and compile it with SGD + momentum
# and a mean-squared-error reconstruction loss.
autoencoder = Autoencoder(input_dim=X.shape[1], latent_dim=50)
# `learning_rate` is the supported keyword; the old `lr` alias is deprecated
# and is rejected by current Keras optimizers.
optimizer = SGD(learning_rate=0.01, momentum=0.9)
autoencoder.compile(optimizer=optimizer, loss='mse')

In [None]:
# Train the autoencoder to reconstruct X, or reuse saved weights when a weights
# file for this epoch count already exists. `start`/`end` bracket the training
# time and are both 0.0 when weights are loaded instead.
epochs = 10  # number of full passes over the training data
batch_size = 32
weights_path = f"model/benz_AE_model({epochs}epochs).h5"

if os.path.exists(weights_path):
    # One silent epoch before loading; presumably this is here so the
    # subclassed model creates its variables before load_weights - confirm.
    # No validation data is passed: the notebook never defines `x_test`, so
    # referencing it here raised NameError.
    autoencoder.fit(X, X,
                    epochs=1,
                    shuffle=True,
                    batch_size=batch_size,
                    verbose=0)
    autoencoder.load_weights(weights_path)
    start = 0.0
    end = 0.0

else:
    start = time()
    # NOTE(review): the name `fmnist_history` looks like a leftover from a
    # Fashion-MNIST example; it is kept unchanged so existing references work.
    fmnist_history = autoencoder.fit(X, X,
                                     epochs=epochs,
                                     shuffle=True,
                                     batch_size=batch_size,
                                     verbose=1)
    end = time()

    # Saving is intentionally left disabled, as in the original notebook:
    # autoencoder.save_weights(weights_path)

In [None]:
# Report the wall-clock training time measured by the training cell.
elapsed = end - start
print("Elapsed Time for Training Deep Autoencoder : %.3f sec." % elapsed)

**References:**
> 1. Dataset and problem statement: https://www.kaggle.com/c/mercedes-benz-greener-manufacturing 
> 2. https://www.kaggle.com/c/mercedes-benz-greener-manufacturing/discussion
> 3. https://medium.com/swlh/greener-manufacturing-with-machine-learning-6ec77d0e7a91
> 4. How to Perform One Hot Encoding for Multi Categorical Variables https://www.youtube.com/watch?v=6WDFfaYtN6s