In [None]:
import os
import random

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import cv2
from PIL import Image

from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import KFold, StratifiedKFold, GroupKFold
from sklearn.metrics import mean_squared_error
from sklearn.cluster import KMeans
from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import LinearRegression
import lightgbm as lgb

# CONFIG CLASS

In [None]:
class CFG:
    """Notebook-wide configuration constants."""

    # Number of cross-validation folds.
    N_FOLDS = 5
    # Seed used for the global RNGs and for seeded estimators/splitters.
    SEED = 1234

In [None]:
def set_seed(seed):
    """Seed Python's `random` module and NumPy's global RNG with `seed`."""
    for seeder in (random.seed, np.random.seed):
        seeder(seed)
    
# Seed the global RNGs once, up front, with the configured seed.
set_seed(CFG.SEED)

In [None]:
# Load the training table from the competition input directory.
train= pd.read_csv('../input/petfinder-pawpularity-score/train.csv')

def get_train_file_path(image_id):
    """Return the path of the training JPEG whose file stem is `image_id`."""
    path = f'../input/petfinder-pawpularity-score/train/{image_id}.jpg'
    return path

# Derive each row's image path from its 'Id' value.
train['image_path']= train['Id'].apply(get_train_file_path)

# EDA AND PREPROCESSING

In [None]:
# Preview the first rows of the training table.
train.head()

In [None]:
# Histogram of the target with a KDE curve overlaid.
plt.figure(figsize=(10, 5))
sns.histplot(x=train['Pawpularity'], kde=True)

In [None]:
# Box plot of the target to expose its spread and outliers.
sns.boxplot(data=train, x='Pawpularity')

In [None]:
# Correlation of every numeric column with the target, sorted ascending.
# Restrict to numeric columns first: 'Id' and 'image_path' hold strings, and
# pandas >= 2.0 no longer drops non-numeric columns silently in DataFrame.corr()
# (it raises instead). select_dtypes works on old and new pandas alike.
corr = train.select_dtypes(include=[np.number]).corr()['Pawpularity'].sort_values()
corr

* The target variable (`Pawpularity`) does not follow a Gaussian distribution.
* Observations with a Pawpularity score of 100 appear to be noisy.
* The metadata features show essentially no linear correlation with the target.

In [None]:
def preprocess(df):
    """Return a copy of `df` with 'width' and 'height' columns for each image.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain an 'image_path' column with paths readable by PIL.

    Returns
    -------
    pandas.DataFrame
        A new frame with the original columns followed by 'width' and
        'height' (in pixels, as reported by ``Image.size``). The input frame
        is left untouched.
    """
    def _image_size(path):
        # Use a context manager so the file handle is released right away;
        # a bare Image.open() keeps the file open until garbage collection.
        with Image.open(path) as img:
            return img.size

    sizes = [_image_size(path) for path in df['image_path']]

    out = df.copy()
    out['width'] = [size[0] for size in sizes]
    out['height'] = [size[1] for size in sizes]
    return out

# Add the image 'width' and 'height' columns to the training table.
train= preprocess(train)

# CREATING THE FOLDS

Since the target is continuous and not normally distributed, we bin it into groups and stratify on those bins when creating the folds.

In [None]:
# Bin the continuous target into equal-width groups so it can be stratified on.
n_grps = 10
train['grp'] = pd.cut(train['Pawpularity'], bins=n_grps, labels=False)

skf = StratifiedKFold(n_splits=CFG.N_FOLDS, shuffle=True, random_state=CFG.SEED)

# Create the column up front as an integer column; assigning into a missing
# column would initialise it with NaN and silently make the fold ids floats.
train['kfold'] = -1
for fold, (train_idx, val_idx) in enumerate(skf.split(train, train['grp'])):
    # split() yields positional indices, so translate them to index labels
    # instead of assuming the frame has a default RangeIndex.
    train.loc[train.index[val_idx], 'kfold'] = fold

In [None]:
# Target distribution over all rows that are not assigned to fold 0.
sns.histplot(train.loc[train['kfold'] != 0, 'Pawpularity'])

In [None]:
# Preview the table again, now including the 'grp' and 'kfold' columns.
train.head()

In [None]:
# Persist the fold-annotated table; the modelling section reads it back.
train.to_csv('train_folds.csv', index= False)

# PREDS USING META DATA

In [None]:
# Reload the fold-annotated training table and read the test table.
train_dataset= pd.read_csv('./train_folds.csv')
test= pd.read_csv('../input/petfinder-pawpularity-score/test.csv')
# Keep the ids aside: 'Id' is dropped from `test` before modelling but is
# needed again to build the submission.
test_ids= test['Id']

def get_test_file_path(image_id):
    """Return the path of the test JPEG whose file stem is `image_id`."""
    path = f'../input/petfinder-pawpularity-score/test/{image_id}.jpg'
    return path

# Attach image paths, then add the 'width' and 'height' columns via preprocess().
test['image_path']= test['Id'].apply(get_test_file_path)
test= preprocess(test)

In [None]:
# Fold id of every training row, the regression target, and the feature matrix.
idx = train_dataset['kfold']
y = train_dataset['Pawpularity']
X = train_dataset.drop(columns=['Id', 'Pawpularity', 'image_path', 'grp', 'kfold'])
# Keep only the feature columns in the test table as well.
test = test.drop(columns=['Id', 'image_path'])

In [None]:
# Cluster the rows on the unscaled features and append the cluster id as a feature.
kmeans = KMeans(n_clusters=11, random_state=CFG.SEED)
kmeans.fit(X)
X['cluster'] = kmeans.predict(X)
test['cluster'] = kmeans.predict(test)

# Capture the column names (including 'cluster') before the scaler replaces X
# with its transformed output.
cols = X.columns

# Min-max scale both tables using statistics fitted on the training features only.
scaler = MinMaxScaler()
X = scaler.fit_transform(X)
test = scaler.transform(test)

**TRAIN BASE MODELS**

In [None]:
# Out-of-fold (OOF) predictions for the training rows and fold-averaged
# predictions for the test rows, one pair per base model.
oof_preds_lgb = np.zeros(X.shape[0])
test_preds_lgb = np.zeros(test.shape[0])

oof_preds_dtr = np.zeros(X.shape[0])
test_preds_dtr = np.zeros(test.shape[0])

for i in range(CFG.N_FOLDS):
    # Rows assigned to fold i are held out; every other row is used for fitting.
    X_train, y_train = X[idx != i], y[idx != i]
    X_val, y_val = X[idx == i], y[idx == i]

    lgbr = lgb.LGBMRegressor(n_estimators=100, objective='rmse')
    # The `verbose` argument of fit() was removed in LightGBM 4.0; the
    # log_evaluation callback with period=0 silences per-iteration logging
    # on both older (>= 3.3) and current releases.
    lgbr.fit(X_train,
             y_train,
             eval_set=[(X_val, y_val)],
             eval_metric='rmse',
             callbacks=[lgb.log_evaluation(period=0)])

    oof_preds_lgb[idx == i] = lgbr.predict(X_val)
    # Average over the configured number of folds rather than a literal 5.
    test_preds_lgb += lgbr.predict(test) / CFG.N_FOLDS

    # Fix the tree's RNG so tie-breaking between equally good splits does not
    # depend on the state of NumPy's global generator.
    dtr = DecisionTreeRegressor(max_depth=4, random_state=CFG.SEED)
    dtr.fit(X_train, y_train)
    oof_preds_dtr[idx == i] = dtr.predict(X_val)
    test_preds_dtr += dtr.predict(test) / CFG.N_FOLDS

# RMSE computed as sqrt(MSE): the `squared=False` shortcut is deprecated in
# recent scikit-learn releases and later removed.
loss_lgb = np.sqrt(mean_squared_error(y, oof_preds_lgb))
print(f'RMSE loss lgb: {loss_lgb}')
loss_dtr = np.sqrt(mean_squared_error(y, oof_preds_dtr))
print(f'RMSE loss dtr: {loss_dtr}')

In [None]:
# Importances come from `lgbr` as left by the training loop, i.e. the model
# fitted in its final iteration only.
feat_imp_lgbr = pd.DataFrame(
    {
        'feature': cols,
        'importance': lgbr.feature_importances_,
    }
)
feat_imp_lgbr.sort_values(by='importance', ascending=False)

In [None]:
# Importances come from `dtr` as left by the training loop, i.e. the tree
# fitted in its final iteration only.
feat_imp_dtr = pd.DataFrame(
    {
        'feature': cols,
        'importance': dtr.feature_importances_,
    }
)
feat_imp_dtr.sort_values(by='importance', ascending=False)

In [None]:
# Level-2 inputs: one column per base model. Training rows use the
# out-of-fold predictions; test rows use the fold-averaged predictions.
meta_df_train = pd.DataFrame(
    {
        'LGBR': oof_preds_lgb,
        'DTR': oof_preds_dtr,
    }
)
meta_df_test = pd.DataFrame(
    {
        'LGBR': test_preds_lgb,
        'DTR': test_preds_dtr,
    }
)

**TRAIN LINEAR REGRESSOR ON THE PREDS MADE BY THE BASE MODELS**

In [None]:
# Cross-validated linear blend of the base-model predictions.
# The splitter is left unshuffled, as before, so fold membership follows row order.
kfold = KFold(n_splits=CFG.N_FOLDS)
oof_preds = np.zeros(X.shape[0])
test_preds = np.zeros(test.shape[0])

for trn_idx, val_idx in kfold.split(meta_df_train):
    X_train, X_val = meta_df_train.iloc[trn_idx], meta_df_train.iloc[val_idx]
    # split() yields positions, so index the target positionally as well
    # rather than relying on its index labels matching row positions.
    y_train, y_val = y.iloc[trn_idx], y.iloc[val_idx]

    model = LinearRegression()
    model.fit(X_train, y_train)

    oof_preds[val_idx] = model.predict(X_val)
    # Average over the actual number of splits instead of a literal 5.
    test_preds += model.predict(meta_df_test) / CFG.N_FOLDS

# RMSE computed as sqrt(MSE): the `squared=False` shortcut is deprecated in
# recent scikit-learn releases and later removed.
loss = np.sqrt(mean_squared_error(y, oof_preds))
print(f'RMSE loss: {loss}')

In [None]:
# Pair the saved test ids with the blended test predictions.
sub= pd.DataFrame({'Id': test_ids, 'Pawpularity': test_preds})
sub

In [None]:
# Write the submission file without the DataFrame index.
sub.to_csv('submission.csv', index= False)