# 1. Import libraries and data

In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import PowerTransformer
from sklearn.model_selection import train_test_split, cross_val_score, KFold
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LinearRegression, ElasticNet
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor, GradientBoostingRegressor
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor
from xgboost import XGBRegressor
import optuna

In [None]:
# Load the train and test tables from CSV (paths are relative to the current
# working directory).
train = pd.read_csv("../input/petfinder-pawpularity-score/train.csv")
test = pd.read_csv("../input/petfinder-pawpularity-score/test.csv")

In [None]:
# Show the (rows, columns) shape of both frames as the cell output.
train.shape, test.shape

# EDA

In [None]:
# Preview the first rows of the training frame.
train.head()

In [None]:
# Count missing values per column in the training frame.
train.isnull().sum()

In [None]:
# Preview the first rows of the test frame.
test.head()

# Distribution of the target variable

In [None]:
# Histogram of the raw target column to inspect its distribution.
train['Pawpularity'].hist()

In [None]:
from sklearn.preprocessing import PowerTransformer

# Fit a Box-Cox power transform on the target column alone and plot the
# transformed values, for comparison with the raw histogram.
# NOTE(review): Box-Cox only accepts strictly positive input -- confirm the
# target never contains 0.
power = PowerTransformer(method='box-cox')
# fit_transform is given a one-column frame; flatten() turns the result into a
# 1-D array so it can be wrapped in a Series.
x = power.fit_transform(train[['Pawpularity']]).flatten()
pd.Series(x).hist()

# Feature Engineering

In [None]:
# 1. A picture showing both the eyes and the face is assumed to look cuter,
#    so add the product of the two columns as an interaction feature.
#    The same feature is added to both frames so their columns stay aligned.
for frame in (train, test):
    frame['eye_face'] = frame['Eyes'] * frame['Face']

In [None]:
# 2. A picture in which a group of pets appears to be nearby is assumed to
#    look cuter, so add the product of the two columns as an interaction
#    feature. The same feature is added to both frames.
for frame in (train, test):
    frame['near_group'] = frame['Near'] * frame['Group']

# Data Preparation and Model Selection

In [None]:
# Keep the test identifiers, then separate the target and the identifier
# from the feature columns.
test_id = test['Id']
y_train = train['Pawpularity']
X_train = train.drop(columns=['Id', 'Pawpularity'])
test = test.drop(columns=['Id'])

# Power-transformed copy of the target. No `method` is given here, so the
# transformer's default is used rather than the 'box-cox' of the EDA cell.
# NOTE(review): y_trans is not referenced by the visible cells below -- the
# models are fitted on the untransformed y_train.
power = PowerTransformer()
y_trans = pd.Series(power.fit_transform(train[['Pawpularity']]).flatten())

In [None]:
# Display the feature matrix as the cell output.
X_train

In [None]:
# Names of all feature columns, collected as a plain list.
# NOTE(review): not referenced by the visible cells below.
cat_features = X_train.columns.tolist()

In [None]:
# Baseline candidates, all created with their library-default hyperparameters.
# Linear models
lr, enet = LinearRegression(), ElasticNet()
# scikit-learn ensembles
rf, ada, gbr = RandomForestRegressor(), AdaBoostRegressor(), GradientBoostingRegressor()
# Third-party gradient-boosting libraries
lgbm, xgb, cat = LGBMRegressor(), XGBRegressor(), CatBoostRegressor()

In [None]:
# Compare every baseline model under one shared, shuffled and seeded 5-fold
# split and print its cross-validated RMSE.
models = [lr, enet, rf, ada, gbr, lgbm, xgb, cat]
fold = KFold(n_splits=5, shuffle=True, random_state=42)

for model in models:
    name = model.__class__.__name__
    # Pass the KFold object itself: cv=5 would ignore `fold` and split the
    # rows in file order without shuffling, unlike the final evaluation below.
    scores = cross_val_score(model, X=X_train, y=y_train, cv=fold, n_jobs=-1, scoring='neg_mean_squared_error')
    # The scorer returns negated MSE per fold; flip the sign before the root.
    mse = (-1) * np.mean(scores)

    print('Model %s - RMSE: %.4f'%(name, np.sqrt(mse)))

# Optimization

In [None]:
# LGBM, Ada, GBR
## ElasticNet Optimization
from optuna.samplers import TPESampler

def objective(trial):
    """Optuna objective: 5-fold cross-validated RMSE of an ElasticNet.

    Args:
        trial: Optuna trial used to sample ``alpha`` (log scale, 0.5-2.0),
            ``l1_ratio`` (0.0-1.0) and ``max_iter`` (500-2000).

    Returns:
        Square root of the mean per-fold MSE on ``X_train`` / ``y_train``;
        lower is better.
    """
    # suggest_float replaces the deprecated suggest_loguniform /
    # suggest_uniform; the sampled ranges and scales are unchanged.
    param = {
        'alpha': trial.suggest_float('alpha', 0.5, 2.0, log=True),
        'l1_ratio': trial.suggest_float('l1_ratio', 0.0, 1.0),
        'max_iter': trial.suggest_int('max_iter', 500, 2000),
        'random_state': 42
    }

    model = ElasticNet(**param)
    scores = cross_val_score(model, X=X_train, y=y_train, cv=5, n_jobs=-1, scoring='neg_mean_squared_error')
    # The scorer returns negated MSE per fold; flip the sign before the root.
    mse = (-1) * np.mean(scores)
    rmse = np.sqrt(mse)

    return rmse

# Seeded TPE search over the ElasticNet objective, minimising the CV RMSE.
enet_study = optuna.create_study(
    sampler=TPESampler(seed=42),
    direction='minimize',
)
enet_study.optimize(objective, n_trials=200)

# Keep the best trial and the hyperparameters it sampled for later cells.
enet_best = enet_study.best_trial
enet_best_params = enet_best.params
print('score: {0}, params: {1}'.format(enet_best.value, enet_best_params))

In [None]:
## Gradient Boosting
def objective(trial):
    """Optuna objective: 5-fold cross-validated RMSE of a GradientBoostingRegressor.

    Args:
        trial: Optuna trial used to sample the tree-size limits
            (``max_leaf_nodes``, ``max_depth``), ``learning_rate`` (log scale,
            1e-07-0.5), ``n_estimators`` and the minimum-samples constraints.

    Returns:
        Square root of the mean per-fold MSE on ``X_train`` / ``y_train``;
        lower is better.
    """
    # suggest_float(log=True) replaces the deprecated suggest_loguniform;
    # the sampled range and scale are unchanged.
    param = {
        'max_leaf_nodes': trial.suggest_int('max_leaf_nodes', 3, 13),
        'max_depth': trial.suggest_int('max_depth', 3, 13),
        'learning_rate': trial.suggest_float('learning_rate', 1e-07, 0.5, log=True),
        'n_estimators': trial.suggest_int('n_estimators', 100, 4000),
        'min_samples_split': trial.suggest_int('min_samples_split', 2, 13),
        'min_samples_leaf': trial.suggest_int('min_samples_leaf', 2, 13),
        'random_state': 42
    }

    model = GradientBoostingRegressor(**param)
    scores = cross_val_score(model, X=X_train, y=y_train, cv=5, n_jobs=-1, scoring='neg_mean_squared_error')
    # The scorer returns negated MSE per fold; flip the sign before the root.
    mse = (-1) * np.mean(scores)
    rmse = np.sqrt(mse)

    return rmse

# Seeded TPE search over the gradient-boosting objective, minimising CV RMSE.
gbc_study = optuna.create_study(
    sampler=TPESampler(seed=42),
    direction='minimize',
)
gbc_study.optimize(objective, n_trials=200)

# Keep the best trial and the hyperparameters it sampled for later cells.
gbc_best = gbc_study.best_trial
gbc_best_params = gbc_best.params
print('score: {0}, params: {1}'.format(gbc_best.value, gbc_best_params))

In [None]:
## LGBM Boosting
def objective(trial):
    """Optuna objective: 5-fold cross-validated RMSE of an LGBMRegressor.

    Args:
        trial: Optuna trial used to sample ``num_leaves``, ``max_depth``,
            ``learning_rate`` (log scale, 1e-07-0.5), ``n_estimators`` and
            ``min_child_samples``.

    Returns:
        Square root of the mean per-fold MSE on ``X_train`` / ``y_train``;
        lower is better.
    """
    # suggest_float(log=True) replaces the deprecated suggest_loguniform;
    # the sampled range and scale are unchanged.
    param = {
        'objective': 'regression',
        'n_jobs': -1,
        'num_leaves': trial.suggest_int('num_leaves', 15, 45),
        'max_depth': trial.suggest_int('max_depth', 3, 15),
        'learning_rate': trial.suggest_float('learning_rate', 1e-07, 0.5, log=True),
        'n_estimators': trial.suggest_int('n_estimators', 300, 4000),
        'min_child_samples': trial.suggest_int('min_child_samples', 2, 20),
        'random_state': 42
    }

    model = LGBMRegressor(**param)
    scores = cross_val_score(model, X=X_train, y=y_train, cv=5, n_jobs=-1, scoring='neg_mean_squared_error')
    # The scorer returns negated MSE per fold; flip the sign before the root.
    mse = (-1) * np.mean(scores)
    rmse = np.sqrt(mse)

    return rmse

# Seeded TPE search over the LightGBM objective, minimising the CV RMSE.
lgbm_study = optuna.create_study(
    sampler=TPESampler(seed=42),
    direction='minimize',
)
lgbm_study.optimize(objective, n_trials=200)

# Keep the best trial and the hyperparameters it sampled for later cells.
lgbm_best = lgbm_study.best_trial
lgbm_best_params = lgbm_best.params
print('score: {0}, params: {1}'.format(lgbm_best.value, lgbm_best_params))

In [None]:
# Build the final estimators from the tuned hyperparameters.
# A trial's `params` only contains the values drawn through trial.suggest_*,
# so the constants each objective fixed by hand (random_state, and for LGBM
# also objective / n_jobs) are passed again here. Without them the final
# models would be unseeded and differ from the configuration that was tuned.
enet_final = ElasticNet(**enet_best_params, random_state=42)
gbr_final = GradientBoostingRegressor(**gbc_best_params, random_state=42)
lgbm_final = LGBMRegressor(
    **lgbm_best_params,
    objective='regression',
    n_jobs=-1,
    random_state=42,
)

In [None]:
# ElasticNet
# Mean per-fold validation RMSE of the tuned ElasticNet on the shared KFold split.
scores = []
name = enet_final.__class__.__name__
for train_idx, val_idx in fold.split(X_train):
    # KFold.split yields positional row indices, so select with .iloc;
    # label-based .loc only matches when the frame has a default RangeIndex.
    train_x, val_x = X_train.iloc[train_idx], X_train.iloc[val_idx]
    train_y, val_y = y_train.iloc[train_idx], y_train.iloc[val_idx]

    # The same estimator object is refitted on every fold, so after the loop
    # it holds the fit from the last fold only.
    enet_final.fit(train_x, train_y)

    val_pred = enet_final.predict(val_x)
    rmse = np.sqrt(mean_squared_error(val_y, val_pred))
    scores.append(rmse)

print('%s - RMSE: %.4f' % (name, np.mean(scores)))

In [None]:
# Gradient Boosting
# Mean per-fold validation RMSE of the tuned GradientBoostingRegressor on the
# shared KFold split.
scores = []
name = gbr_final.__class__.__name__
for train_idx, val_idx in fold.split(X_train):
    # KFold.split yields positional row indices, so select with .iloc;
    # label-based .loc only matches when the frame has a default RangeIndex.
    train_x, val_x = X_train.iloc[train_idx], X_train.iloc[val_idx]
    train_y, val_y = y_train.iloc[train_idx], y_train.iloc[val_idx]

    # The same estimator object is refitted on every fold, so after the loop
    # it holds the fit from the last fold only.
    gbr_final.fit(train_x, train_y)

    val_pred = gbr_final.predict(val_x)
    rmse = np.sqrt(mean_squared_error(val_y, val_pred))
    scores.append(rmse)

print('%s - RMSE: %.4f' % (name, np.mean(scores)))

In [None]:
# LGBM
# Mean per-fold validation RMSE of the tuned LGBMRegressor on the shared KFold split.
scores = []
name = lgbm_final.__class__.__name__
for train_idx, val_idx in fold.split(X_train):
    # KFold.split yields positional row indices, so select with .iloc;
    # label-based .loc only matches when the frame has a default RangeIndex.
    train_x, val_x = X_train.iloc[train_idx], X_train.iloc[val_idx]
    train_y, val_y = y_train.iloc[train_idx], y_train.iloc[val_idx]

    # The same estimator object is refitted on every fold, so after the loop
    # it holds the fit from the last fold only.
    lgbm_final.fit(train_x, train_y)

    val_pred = lgbm_final.predict(val_x)
    rmse = np.sqrt(mean_squared_error(val_y, val_pred))
    scores.append(rmse)

print('%s - RMSE: %.4f' % (name, np.mean(scores)))

In [None]:
# Refit every tuned model on the complete training set before predicting.
# After the cross-validation cells each estimator is still fitted on the
# training rows of the last fold only, so predicting directly would discard
# the rows held out in that fold.
enet_final.fit(X_train, y_train)
gbr_final.fit(X_train, y_train)
lgbm_final.fit(X_train, y_train)

# Predict the test set with each model (`test` already has `Id` dropped).
enet_pred = enet_final.predict(test)
gbr_pred = gbr_final.predict(test)
lgbm_pred = lgbm_final.predict(test)

# Display the LightGBM predictions as the cell output.
lgbm_pred

In [None]:
# gbr_pred_df = np.reshape(gbr_pred, (gbr_pred.shape[0], 1))
# new_gbr_pred = power.inverse_transform(gbr_pred_df).flatten()

# lgbm_pred_df = np.reshape(lgbm_pred, (lgbm_pred.shape[0], 1))
# new_lgbm_pred = power.inverse_transform(lgbm_pred_df).flatten()

In [None]:
# Write the LightGBM predictions into the sample submission file.
# NOTE(review): assumes sample_submission.csv lists rows in the same order as
# test.csv -- confirm, since `test_id` is not used to align them.
sub = pd.read_csv("../input/petfinder-pawpularity-score/sample_submission.csv")

# Clip to the target range observed in training before the uint8 cast: a
# prediction below 0 or above 255 would otherwise wrap around to an unrelated
# value instead of raising.
# NOTE(review): assumes the training target itself lies within 0-255.
sub['Pawpularity'] = np.round(
    np.clip(lgbm_pred, y_train.min(), y_train.max()), 0
).astype(np.uint8)
sub.to_csv("submission.csv", index=False)
sub