In [None]:
import os
import numpy as np 
import pandas as pd
from matplotlib import pyplot as plt
import seaborn as sns
from PIL import Image
from sklearn.model_selection import RepeatedKFold, GridSearchCV
from sklearn.linear_model import ElasticNet
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

In [None]:
# --- Cross-validation / reproducibility settings ---
RANDOM_STATE = 7  # seed passed to the CV splitter and to the ElasticNet estimator
N_SPLITS = 5      # folds per repetition of RepeatedKFold
N_REPEATS = 2     # number of times the K-fold split is repeated

# --- Feature groups ---
# Photo attribute columns taken from the CSV and used as model features.
ATTRIBUTE_COLS = [
    'Subject Focus', 'Eyes', 'Face', 'Near',
    'Action', 'Accessory', 'Group', 'Collage',
    'Human', 'Occlusion', 'Info', 'Blur',
]
# Image-size columns derived from the image files (these are the ones standardised later).
DIM_COLS = ['width', 'height', 'aspect_ratio']

# --- Data location ---
BASE_PATH = '../input/petfinder-pawpularity-score'

@np.vectorize
def get_train_filepath(data_id):
    """Return the path of the training image whose Id is `data_id`.

    The file is expected at ``BASE_PATH/train/<data_id>.jpg``. The function is
    wrapped in ``np.vectorize`` so it can be applied to a whole column of Ids.
    """
    filename = data_id + '.jpg'
    return os.path.join(BASE_PATH, 'train', filename)

@np.vectorize
def get_test_filepath(data_id):
    """Return the path of the test image whose Id is `data_id`.

    The file is expected at ``BASE_PATH/test/<data_id>.jpg``. The function is
    wrapped in ``np.vectorize`` so it can be applied to a whole column of Ids.
    """
    filename = data_id + '.jpg'
    return os.path.join(BASE_PATH, 'test', filename)

@np.vectorize
def get_img_dim(filepath):
    """Return the ``(width, height)`` in pixels of the image at `filepath`.

    Only the image header is needed for the size, so the pixel data is never
    decoded. Because the wrapped function returns a 2-tuple, calling the
    vectorized version on an array of paths yields two arrays: widths and
    heights.
    """
    # Image.open leaves the underlying file open until the image is closed;
    # the context manager releases the handle right away instead of leaking
    # one descriptor per image.
    with Image.open(filepath) as img:
        return img.size

In [None]:
# Build the CSV path from BASE_PATH so the metadata file and the image folders
# are always resolved against the same dataset root.
train_df = pd.read_csv(os.path.join(BASE_PATH, 'train.csv'))
train_df.head()

In [None]:
# Resolve every Id to its image file on disk.
train_df["filepath"] = get_train_filepath(train_df['Id'])
# get_img_dim yields (widths, heights); stack them side by side so each row
# holds the width and height of one image.
train_df[['width', 'height']] = np.column_stack(get_img_dim(train_df["filepath"]))
train_df['aspect_ratio'] = train_df['width'].div(train_df['height'])
train_df.head()

In [None]:
# Column dtypes and non-null counts of the training frame (including the derived image-size columns).
train_df.info()

In [None]:
# Number of missing values per column in the training frame.
train_df.isna().sum()

In [None]:
# Summary statistics of the numeric training columns, rounded to 3 decimals for display.
train_df.describe().round(decimals=3)

In [None]:
# Count of each value taken by every attribute column, one bar group per attribute.
fig, ax = plt.subplots(figsize=(16, 8))
sns.countplot(
    x="variable",
    hue="value",
    data=pd.melt(train_df[ATTRIBUTE_COLS]),  # long form: one (variable, value) row per cell
    ax=ax,
)
ax.set_title("Training Data - Attributes", fontsize=20)
ax.set_xlabel("Attribute", fontsize=16)
ax.set_ylabel("Count", fontsize=16)
plt.show()

In [None]:
# Distribution of the target: density histogram with a KDE curve drawn on top.
fig, ax = plt.subplots(figsize=(16, 8))
sns.histplot(
    train_df['Pawpularity'],
    bins=100,
    binrange=(0, 100),
    stat='density',
    alpha=0.4,
    ax=ax,
)
# Clip the KDE to the same 0-100 range used for the histogram bins.
sns.kdeplot(train_df['Pawpularity'], color='red', clip=(0, 100), ax=ax)
ax.set_title("Training Data - Pawpularity", fontsize=20)
ax.set_xlabel("Pawpularity", fontsize=16)
ax.set_ylabel("Density", fontsize=16)
plt.show()

In [None]:
# Density of image widths and heights on a shared pixel axis.
fig, ax = plt.subplots(figsize=(16, 8))
# Draw each curve with its own explicit label. Passing only a list of labels to
# legend() pairs them with the artists by the order in which they were added to
# the axes, which is not guaranteed to match the column order and can silently
# swap "Width" and "Height". Separate calls also keep each density normalised
# independently, as common_norm=False did.
for column, label in (("width", "Width"), ("height", "Height")):
    sns.kdeplot(x=train_df[column], label=label, ax=ax)
ax.set_title("Training Data - Dimensions", fontsize=20)
ax.set_xlabel("Pixels", fontsize=16)
ax.set_ylabel("Density", fontsize=16)
ax.legend(loc="upper left", fontsize=16)
plt.show()

In [None]:
# Density of the width / height ratio of the training images.
fig, ax = plt.subplots(figsize=(12, 8))
sns.kdeplot(data=train_df["aspect_ratio"], common_norm=False, ax=ax)
ax.set_title("Training Data - Aspect Ratio", fontsize=20)
ax.set_xlabel("Aspect Ratio", fontsize=16)
ax.set_ylabel("Density", fontsize=16)
# One tick per whole aspect-ratio unit across the observed range. Series.min()
# / .max() are vectorised and skip NaN, unlike the Python builtins, which
# iterate element by element and give order-dependent results when NaN is present.
ratio_min = int(train_df["aspect_ratio"].min())
ratio_max = int(train_df["aspect_ratio"].max())
ax.set_xticks(np.arange(ratio_min, ratio_max + 1, 1.0))
plt.show()

In [None]:
# Build the CSV path from BASE_PATH so the metadata file and the image folders
# are always resolved against the same dataset root.
test_df = pd.read_csv(os.path.join(BASE_PATH, 'test.csv'))
test_df.head()

In [None]:
# Resolve every Id to its image file on disk.
test_df["filepath"] = get_test_filepath(test_df['Id'])
# get_img_dim yields (widths, heights); stack them side by side so each row
# holds the width and height of one image.
test_df[['width', 'height']] = np.column_stack(get_img_dim(test_df["filepath"]))
test_df['aspect_ratio'] = test_df['width'].div(test_df['height'])
test_df.head()

In [None]:
# Column dtypes and non-null counts of the test frame (including the derived image-size columns).
test_df.info()

In [None]:
# Number of missing values per column in the test frame.
test_df.isna().sum()

In [None]:
# Summary statistics of the numeric test columns, rounded to 3 decimals for display.
test_df.describe().round(decimals=3)

In [None]:
# Standardise only the image-size features; every other input column is
# forwarded unchanged by the 'passthrough' remainder.
scaler = ColumnTransformer(
    transformers=[('std_scaler', StandardScaler(), DIM_COLS)],
    remainder='passthrough',
)

# Preprocessing followed by an ElasticNet regressor; the step names are the
# prefixes used for the hyper-parameter grid ("elasticnet__...").
pipeline = Pipeline(
    steps=[
        ('scaler', scaler),
        ('elasticnet', ElasticNet(max_iter=10_000, random_state=RANDOM_STATE)),
    ]
)

pipeline

In [None]:
# Repeated K-fold splitter, seeded so the folds are the same on every run.
rkf = RepeatedKFold(
    n_splits=N_SPLITS,
    n_repeats=N_REPEATS,
    random_state=RANDOM_STATE,
)

# Hyper-parameter grid: regularisation strength on a log scale and the L1/L2
# mix swept from 0 to 1 in steps of 0.1.
params = dict(
    elasticnet__alpha=[0.0001, 0.001, 0.01, 0.1, 1, 10],
    elasticnet__l1_ratio=[0.1 * step for step in range(11)],
)

# No `scoring` is given, so candidates are ranked with the estimator's own
# default score method.
model = GridSearchCV(
    estimator=pipeline,
    param_grid=params,
    cv=rkf,
    n_jobs=-1,
)

model

In [None]:
# Fit on the attribute + image-size features. The target is divided by 100
# here, and predictions are multiplied back by 100 when scoring the test set.
model.fit(
    train_df[ATTRIBUTE_COLS + DIM_COLS],
    train_df['Pawpularity'].div(100),
)

# Cross-validated score and hyper-parameters of the winning grid point.
print(f"Best Score: {model.best_score_}")
print(f"Best Params: {model.best_params_}")

In [None]:
# Fitted ElasticNet step of the best pipeline found by the grid search.
elasticnet = model.best_estimator_.named_steps['elasticnet']
print(f"Model Intercept: {elasticnet.intercept_}")

# The ColumnTransformer outputs the columns it transforms (the scaled DIM_COLS)
# first and appends the passthrough columns (ATTRIBUTE_COLS) on the right, so
# coef_ is ordered DIM_COLS followed by ATTRIBUTE_COLS -- not the order of the
# input frame. Pairing with ATTRIBUTE_COLS + DIM_COLS would mislabel every
# coefficient.
coefs = dict(zip(DIM_COLS + ATTRIBUTE_COLS, elasticnet.coef_))
print("\nModel Coefficients: ")
for k, v in coefs.items():
    print(f"\t{k}: {v}")

In [None]:
# Predict on the test features, undo the /100 target scaling, round to 3
# decimals, then clamp into the 1-100 range.
test_df['Pawpularity'] = (
    (model.predict(test_df[ATTRIBUTE_COLS + DIM_COLS]) * 100)
    .round(3)
    .clip(1, 100)
)
test_df.head()

In [None]:
# Write only the Id and predicted score columns, without the row index.
test_df.to_csv('submission.csv', columns=['Id', 'Pawpularity'], index=False)