## Library

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

## Load Data

In [None]:
# Competition CSVs; the paths point at the Kaggle input directory.
TRAIN_CSV = "/kaggle/input/petfinder-pawpularity-score/train.csv"
TEST_CSV = "/kaggle/input/petfinder-pawpularity-score/test.csv"

df_train = pd.read_csv(TRAIN_CSV)
df_test = pd.read_csv(TEST_CSV)

The training set has no missing values:

In [None]:
# Number of missing values in each column of the training frame.
df_train.isna().sum()

In [None]:
# pandas' default summary statistics for the training frame.
df_train.describe()

In [None]:
# Histogram (with a KDE overlay) of the target, annotated with its mean and median.
# The palette is set before the axes are created so they pick it up.
sns.set_palette("pastel")
fig, ax = plt.subplots(figsize = (10, 5))
sns.histplot(data = df_train, x = "Pawpularity", kde = True, ax = ax)
ax.set_title('Distribution of Pawpularity Scores', fontsize = 20)

# Dashed vertical reference lines: (x position, colour, legend label).
pawpularity = df_train["Pawpularity"]
reference_lines = [
    (pawpularity.mean(), "green", "Mean Pawpularity"),
    (pawpularity.median(), "black", "Median Pawpularity"),
]
for position, colour, caption in reference_lines:
    ax.axvline(position, c = colour, ls = "--", label = caption)

ax.legend()
plt.show()

In [None]:
# Pairwise correlations (pandas' default method) between the columns of the
# training frame, drawn as an annotated heatmap.
#
# Only numeric/bool columns are passed to `corr()`: pandas 2.0+ raises on
# non-numeric columns instead of silently skipping them as older versions did.
# NOTE(review): `Id` is presumably such a column (it is dropped before
# modelling further down) -- confirm against the CSV.
numeric_train = df_train.select_dtypes(include = ["number", "bool"])

plt.figure(figsize = (15, 10))
sns.heatmap(numeric_train.corr(), annot = True)
plt.title("Correlation matrix of the training columns")
plt.show()  # also keeps the Axes repr out of the cell output

Positive correlations:
- Eyes with Face (0.58): If you can see a decently clear face, you can probably see the eyes as well.
- Occlusion with Human (0.63): A part of the human's body is probably the undesired object blocking part of the pet.

Negative correlations:
- Blur with Eyes (-0.51): When Blur is true, Eyes is automatically set to zero, as stated in the competition's data description.
- Group with Near (-0.32): If there is more than one pet in the photo, you need to zoom out to fit them all in the picture.

In [None]:
# Rank-based correlation of each column with the target, printed from the most
# positive to the most negative, once per method.
#
# Only numeric/bool columns are passed to `corr()`: pandas 2.0+ raises on
# non-numeric columns instead of silently skipping them as older versions did.
numeric_train = df_train.select_dtypes(include = ["number", "bool"])

for method in ("spearman", "kendall"):
    print(numeric_train.corr(method)["Pawpularity"].sort_values(ascending = False))

In [None]:
# Split the training frame into the target (y) and the feature matrix (X).
# Both are built without touching df_train itself; the "Id" column is left out
# of the features together with the target.
y = df_train["Pawpularity"].copy()
X = df_train.drop(columns = ["Pawpularity", "Id"])

## Baseline score

In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_score

# Baseline: a random forest with default hyper-parameters on the raw features,
# scored with cross_val_score (default CV setting) using the negated-RMSE scorer.
# random_state is fixed -- like the other seeded calls in this notebook -- so
# the baseline number is identical on every Restart & Run All.
model = RandomForestRegressor(random_state = 0)
score = cross_val_score(
    model, X, y, scoring = "neg_root_mean_squared_error"
)

# The scorer yields negated RMSE values; flip the sign to report a positive RMSE.
baseline_score = - score.mean()
baseline_score

In [None]:
# First submission with baseline score, just to see the performance on test set
X_test = df_test.drop("Id", axis = 1)

# Seeded so that re-running the notebook regenerates the same predictions.
model_1 = RandomForestRegressor(random_state = 0)
model_1.fit(X, y)

# Predictions are rounded to two decimals before being written out.
y_pred = model_1.predict(X_test).round(2)

# NOTE: a later cell writes its predictions to this same file name.
df_submit = pd.DataFrame(data = {"Id": df_test["Id"], "Pawpularity": y_pred})
df_submit.to_csv("submission.csv", index = False)

In [None]:
def score_dataset(X, y, model):
    """Return the mean cross-validated RMSE of `model` on (X, y).

    The model is first fitted on the full data, then scored with
    `cross_val_score` (default CV setting) using the
    "neg_root_mean_squared_error" scorer. The per-fold scores are averaged and
    sign-flipped, so the returned value is a positive RMSE (lower is better).
    """
    model.fit(X, y)
    neg_rmse_per_fold = cross_val_score(
        model, X, y, scoring = "neg_root_mean_squared_error"
    )
    return -neg_rmse_per_fold.mean()

# Feature Engineering

## Mutual Information

In [None]:
from sklearn.feature_selection import mutual_info_regression

def make_mi_scores(X, y):
    """Compute the mutual information between each column of X and the target y.

    Columns with an integer dtype are flagged as discrete for
    `mutual_info_regression`, which is called with random_state=0.

    Returns a Series named "MI Scores", indexed by the column names of X and
    sorted from the highest score to the lowest.
    """
    features = X.copy()
    is_discrete = [pd.api.types.is_integer_dtype(dtype) for dtype in features.dtypes]
    raw_scores = mutual_info_regression(
        features, y, discrete_features = is_discrete, random_state = 0
    )
    return (
        pd.Series(raw_scores, name = "MI Scores", index = features.columns)
        .sort_values(ascending = False)
    )

def plot_mi_scores(scores):
    """Draw a horizontal bar chart of mutual-information scores.

    `scores` is a Series of scores indexed by feature name. It is sorted
    ascending before plotting and each bar is labelled with its feature name.
    Draws on the current matplotlib axes and returns nothing.

    Requires numpy to be imported as `np` (done in the notebook's import cell).
    """
    scores = scores.sort_values(ascending = True)
    positions = np.arange(len(scores))
    plt.barh(positions, scores)
    plt.yticks(positions, list(scores.index))
    plt.title("Mutual Information Scores")

In [None]:
# Mutual-information score of every column currently in X against the target,
# displayed as the cell output.
mi_scores = make_mi_scores(X, y)
mi_scores

In [None]:
def drop_uninformative(df, mi_scores):
    """Return the columns of `df` whose mutual-information score is above zero.

    `mi_scores` is a Series of scores indexed by column name; it is turned into
    a boolean mask and used as the column selector on `df`.
    """
    informative = mi_scores.gt(0.0)
    return df.loc[:, informative]

# Keep only the features with a positive mutual-information score.
# NOTE: X is rebound here, so every later cell works on the reduced feature set.
X = drop_uninformative(X, mi_scores)

# Cross-validated RMSE of a 500-tree random forest on the reduced features.
# random_state is fixed -- like the other seeded calls in this notebook -- so
# the reported RMSE is reproducible.
rmse = score_dataset(X, y, RandomForestRegressor(n_estimators = 500, random_state = 0))
rmse

In [None]:
# Second submission with RF regressor, use just mutual info
# The test features get the same mutual-information column filter as X.
X_test = drop_uninformative(df_test.drop("Id", axis = 1), mi_scores)

# Seeded so that re-running the notebook regenerates the same predictions.
model_2 = RandomForestRegressor(n_estimators = 500, random_state = 0)
model_2.fit(X, y)

# Predictions are rounded to two decimals before being written out.
y_pred = model_2.predict(X_test).round(2)

# NOTE: this overwrites any earlier "submission.csv" in the working directory.
df_submit = pd.DataFrame(data = {"Id": df_test["Id"], "Pawpularity": y_pred})
df_submit.to_csv("submission.csv", index = False)

## PCA

## Cluster


In [None]:
from sklearn.ensemble import GradientBoostingRegressor

# Gradient boosting with default hyper-parameters, scored with score_dataset.
# random_state is fixed -- like the other seeded calls in this notebook -- so
# the score is reproducible.
model = GradientBoostingRegressor(random_state = 0)
score_dataset(X, y, model)

In [None]:
from xgboost import XGBRegressor

# XGBoost regressor with default hyper-parameters, scored with score_dataset.
model = XGBRegressor()
score_dataset(X, y, model)

In [None]:
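# Earlier experiment, left commented out:
# score_dataset(X, y, XGBRegressor(n_estimators = 5000, learning_rate = 0.01, n_jobs = 4))
# Result recorded for that run: 20.60 (presumably the cross-validated RMSE)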

In [None]:
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows. The same hold-out set both triggers early stopping
# and is used for the reported score, so the RMSE below is an optimistic estimate.
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size = 0.2, random_state = 0)

model = XGBRegressor(n_estimators = 1000, learning_rate = 0.5, n_jobs = 4)
# NOTE(review): early_stopping_rounds is passed to fit() here; newer XGBoost
# releases expect it on the constructor instead -- confirm the installed version.
model.fit(X_train, y_train, early_stopping_rounds = 5, eval_set = [(X_valid, y_valid)], verbose = False)
predict = model.predict(X_valid)

# Hold-out RMSE. Arguments are in the documented (y_true, y_pred) order, and
# the root is taken explicitly rather than via `squared = False`, which newer
# scikit-learn releases deprecate.
mean_squared_error(y_valid, predict) ** 0.5

It seems the random forest regressor does quite a good job.