In [None]:
import numpy as np
import pandas as pd
import os

import matplotlib.pyplot as plt
import seaborn as sns
from statsmodels.graphics.gofplots import qqplot
from statsmodels.stats.outliers_influence import variance_inflation_factor
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error,r2_score
import lightgbm as lgb

In [None]:
# Directory holding the competition CSVs; change it here if the data lives
# elsewhere instead of editing every read_csv call.
DATA_DIR = '../input/petfinder-pawpularity-score'

# Training metadata, test metadata and the sample submission file.
train = pd.read_csv(f'{DATA_DIR}/train.csv')
test = pd.read_csv(f'{DATA_DIR}/test.csv')
submission = pd.read_csv(f'{DATA_DIR}/sample_submission.csv')

# EDA

In [None]:
# Preview the first rows of the training table.
train.head()

In [None]:
train.shape[0] # Train: 9912 images (row count noted by the author)

In [None]:
# Column dtypes and non-null counts of the training table.
train.info()

In [None]:
# Preview the first rows of the test table.
test.head()

In [None]:
test.shape[0] # Test: 8 images (row count noted by the author; the original comment mislabelled this as "Train")

In [None]:
submission.head() # author's note: columns are the id and the popularity score

In [None]:
# Histogram of the target with a kernel-density estimate overlaid.
sns.histplot(data=train, x="Pawpularity", kde=True)

Pawpularity 2~30에 집중되어 있다.
quantile-quantile diagram으로 분포 정규성을 확인하자.

In [None]:
# Q-Q plot of the target against a normal distribution; line='s' draws the
# standardized reference line.
qqplot(train["Pawpularity"], line='s')
# qqplot returns a Figure; without plt.show() the cell would display it once
# as the cell result and once more through the inline backend.
plt.show()

가우시안 분포를 따르지 않는다.
-> 선택한 모델링에 따라 데이터 정규화 작업이 중요하다.

In [None]:
# One count plot per feature: every column except the first and the last.
predictor = train.columns[1:-1]
fig = plt.figure(figsize=(15, 10))
for position, feature in enumerate(predictor, start=1):
    # Panels fill a 3x4 grid in row-major order.
    panel = fig.add_subplot(3, 4, position)
    sns.countplot(data=train, x=feature, ax=panel)

각 요소들은 뚜렷한 특징을 가지고 있다.

상관관계를 분석해보자.

In [None]:
# Heatmap of the pairwise correlations between the predictor columns.
corr_matrix = train[predictor].corr()
fig = plt.figure(figsize=(20, 15))
sns.set_theme(style="white")
# Mask the upper triangle (diagonal included); it only mirrors the lower one.
mask = np.triu(np.ones(corr_matrix.shape, dtype=bool))
cmap = sns.diverging_palette(230, 20, as_cmap=True)
sns.heatmap(
    data=corr_matrix,
    mask=mask,
    cmap=cmap,
    annot=True,
    fmt='.1g',
    square=True,
)
plt.show()

1. Face and Eyes --> 합리적
2. Occlusion and Human (가려짐과 사람: 사람 등이 반려동물의 일부를 가리는 경우)

독립변수인 1번 요소들이 상관관계가 깊어 (multicollinearity) 모델의 성능을 저하시킬 수 있는지 Scatter plot으로 확인해보자.

In [None]:
# Scatter of Face against Eyes to eyeball how the two features co-occur.
sns.scatterplot(data=train, x='Face', y='Eyes')

VIF로 확인해보자

In [None]:
# Variance inflation factor (VIF) of every predictor, highest first.
# NOTE(review): no intercept column is passed to variance_inflation_factor
# here; confirm against the statsmodels documentation whether a constant
# should be added before interpreting these values.
X = train[predictor]
vif_scores = [
    variance_inflation_factor(X.values, column_position)
    for column_position in range(X.shape[1])
]
vif_data = pd.DataFrame({"feature": X.columns, "VIF": vif_scores})
vif_data = vif_data.sort_values("VIF", ascending=False)
vif_data

Face와 Eyes가 10이상이므로 서로 강한 상관관계를 보인다.
이 중 하나만 사용하자.

In [None]:
# Remove "Eyes" from the predictors. X is rebuilt from `train` rather than
# dropped in place: the in-place drop raised a KeyError when the cell was run
# a second time and could trigger SettingWithCopyWarning on the sliced frame.
X = train[predictor].drop(columns="Eyes")
X.columns

Next, check whether any predictor variable has a strong linear (Pearson) correlation with the variable to be predicted (Pawpularity).

In [None]:
# Pearson correlation of each remaining predictor with the target,
# rounded to four decimals.
for feature in X.columns:
    pearson_r = np.corrcoef(train[feature], train["Pawpularity"])[0, 1]
    print(f"Pawpularity / {feature}: {round(pearson_r, 4)}")

strong linear correlations은 확인할 수 없다.

모델의 편의성을 위해 Pawpularity를 0과 1 사이로 바꾼다.

In [None]:
# Image ids, and the target divided by 100 so it lies between 0 and 1.
ids = train["Id"].to_numpy()
y = train["Pawpularity"].to_numpy() / 100

RandomForest를 사용해 feature의 중요도 판단

In [None]:
# Hold out 30% of the rows for validation; the grid search below is fitted
# on the remaining 70%.
X_train, X_valid, y_train, y_valid = train_test_split(
    X, y, test_size=0.3, random_state=42
)

# The printed labels say "test" but refer to the validation split.
split_report = [
    "Train and test split sizes",
    f"X_train : {X_train.shape}",
    f"X_test : {X_valid.shape}",
    f"y_train : {y_train.shape[0]}",
    f"y_test : {y_valid.shape[0]}",
]
print("\n".join(split_report))

In [None]:
# 5-fold grid search over a small random-forest hyperparameter grid
# (3 x 2 x 3 x 2 = 36 combinations), using every available core.
param_grid = {
    "n_estimators": [10, 50, 100],
    "max_features": ["log2", "sqrt"],
    "max_depth": [5, 15, 25],
    "bootstrap": [True, False],
}
rfr = RandomForestRegressor(random_state=8)
grid_rfr = GridSearchCV(
    estimator=rfr,
    param_grid=param_grid,
    cv=5,
    verbose=1,
    n_jobs=-1,
)

# Later cells read best_rfr.best_estimator_, so best_rfr is the fitted
# search object rather than a bare forest.
best_rfr = grid_rfr.fit(X_train, y_train)

In [None]:
# Display the fitted grid-search object.
best_rfr

In [None]:
# Feature importances of the best estimator found by the grid search,
# shown as a horizontal bar chart sorted from most to least important.
importances = best_rfr.best_estimator_.feature_importances_
feature_names = X_train.columns

forest_importances = pd.DataFrame(
    {"FI": importances}, index=feature_names
).sort_values(by="FI", ascending=False)

fig, ax = plt.subplots()
sns.barplot(
    data=forest_importances,
    x="FI",
    y=forest_importances.index,
    palette="Blues_d",
    ax=ax,
)
ax.set_title(
    "Feature importances of RandomForestRegressor",
    fontsize=20,
    fontweight='bold',
)
ax.set(xlabel="Mean decrease in impurity", ylabel="Features")
fig.tight_layout()

Near, Group, Accessory가 중요하다

In [None]:
# Predicted (x axis) versus true (y axis) target values on the validation split.
rfr_pred = best_rfr.predict(X_valid)

fig, ax = plt.subplots(figsize=(12, 8))
ax.scatter(rfr_pred, y_valid)
ax.set_xlabel("Predicted values (rfr_pred)")
ax.set_ylabel("Pawpularity real values (y_valid)")
ax.set_title(
    "Predicted Pawpularity VS True values with RandomForest",
    fontsize=20,
    fontweight='bold',
)
plt.show()

랜덤포레스트는 0.35/0.41 지역에 집중된 예측이 매우 불확실 -> 다른 방법을 시도하자

Define the predictor variables and the variable to be predicted for the next model (LightGBM).

In [None]:
# Predictors: every column except the id, the target and the dropped "Eyes".
# Target: the raw Pawpularity score (not divided by 100 here).
y_train = train["Pawpularity"]
x_train = train.drop(columns=["Id", "Pawpularity", "Eyes"])

In [None]:
# Hold out 20% of the rows so the model is scored on data it was not fitted on.
# The split is taken straight from `train` with a fixed seed: splitting
# x_train/y_train into themselves shrank the training set on every re-run of
# this cell, and the unseeded shuffle made the reported metrics irreproducible.
x_train, x_test, y_train, y_test = train_test_split(
    train.drop(columns=["Id", "Pawpularity", "Eyes"]),
    train["Pawpularity"],
    test_size=0.2,
    random_state=42,
)

In [None]:
# The target is a numeric score rather than a class label, so a regressor is used.
# NOTE(review): the original comment gave the target range as 0 to infinity,
# but elsewhere the notebook divides Pawpularity by 100 to get a 0-1 value,
# which suggests the score is bounded - confirm.
lgbr = lgb.LGBMRegressor(
    max_depth=10,
    learning_rate=0.005,
    n_estimators=200,
    reg_alpha=1,
    reg_lambda=0.1,
)
lgbr.fit(x_train, y_train)

# Predictions on the held-out split, scored in the next cell.
y_pred = lgbr.predict(x_test)

In [None]:
def rmse(y, y_pred):
    """Return the root-mean-squared error between ``y`` and ``y_pred``.

    Computed as the square root of ``mean_squared_error(y, y_pred)``.
    """
    mse = mean_squared_error(y, y_pred)
    return np.sqrt(mse)
# Held-out metrics, printed one per line in this order: RMSE, MSE, R^2.
holdout_metrics = (
    rmse(y_test, y_pred),
    mean_squared_error(y_test, y_pred),
    r2_score(y_test, y_pred),
)
for metric_value in holdout_metrics:
    print(metric_value)

In [None]:
# Start the submission frame with just the test ids.
sub = test[['Id']].copy()

In [None]:
# Look at the test columns again before selecting the model inputs.
test.head()

In [None]:
# Build the model input without overwriting `test`: reassigning `test` made
# this cell fail with a KeyError when it was run a second time.
# 'Id' is an identifier rather than a feature, and 'Eyes' was excluded from
# the training columns, so both are left out here as well.
test_features = test.drop(columns=['Id', 'Eyes'])
Pred = lgbr.predict(test_features)

In [None]:
# Attach the predictions and write the submission file without the index column.
sub = sub.assign(Pawpularity=Pred)
sub.to_csv('submission.csv', index=False)

In [None]:
# Preview the first rows of the submission frame.
sub.head()