In [None]:

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)


import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

In [None]:
# Importing Libraries
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import GradientBoostingRegressor, StackingRegressor, RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor

In [None]:
# Read the competition's train and test metadata tables in one go.
train_df, test_df = map(
    pd.read_csv,
    (
        '../input/petfinder-pawpularity-score/train.csv',
        '../input/petfinder-pawpularity-score/test.csv',
    ),
)

In [None]:
# Preview the first rows of the training table.
train_df.head()

In [None]:
# Column dtypes and non-null counts for the training table.
train_df.info()

In [None]:
# Preview the first rows of the test table.
test_df.head()

In [None]:
# Column dtypes and non-null counts for the test table.
test_df.info()

## EDA and DATA VISUALIZATION

In [None]:
# Snapshot of the training frame as it is before the 'Id' column is removed;
# later plotting cells read from this copy.
train_df1 = train_df.copy()
# errors='ignore' keeps the cell idempotent: re-running it after 'Id' has
# already been dropped would otherwise raise a KeyError.
train_df = train_df.drop(columns='Id', errors='ignore')

In [None]:
# Summary statistics of the numeric training columns.
train_df.describe()

In [None]:
# Summary statistics of the numeric test columns.
test_df.describe()

In [None]:
# Distribution of the target (Pawpularity) as an interactive histogram.
fig = px.histogram(data_frame=train_df, x='Pawpularity')
fig.show()

From the figure above, we can see that most pets' Pawpularity scores lie between 20 and 40.
Let's take a closer look using a box plot.

In [None]:

# Box plot of the target to show its median, quartiles and outliers.
fig = px.box(data_frame=train_df, y="Pawpularity")
fig.show()

In [None]:
# Pairwise correlations of the training columns, drawn on a large canvas.
train_corr = train_df.corr()
plt.figure(figsize=(15, 10))
sns.heatmap(data=train_corr)

From the above figure, we can see that:
1. Face and Eyes have some positive correlation
2. Info and Collage also have some positive correlation
3. Occlusion and Human also have some positive correlation

Now, let's investigate these features and why they have these correlations.

Face and Eyes of the pets

In [None]:
# How often each value of the Eyes flag occurs.
train_df['Eyes'].value_counts()

In [None]:
# How often each value of the Face flag occurs.
train_df['Face'].value_counts()

In [None]:
# Face vs. Eyes, coloured by the Pawpularity score.
sns.scatterplot(
    data=train_df1,
    x='Face',
    y='Eyes',
    hue='Pawpularity',
)

Okay, this might not be intuitive, but when both Eyes and Face are clearly seen in the photo, the Pawpularity score comes down to 40. When the Eyes are facing the front, the score increases to 60.

Occlusion and Info

In [None]:
# Occlusion vs. Info, coloured by the Pawpularity score.
sns.scatterplot(
    data=train_df1,
    x='Occlusion',
    y='Info',
    hue='Pawpularity',
)

More on relationships

In [None]:
# Grid of pairwise relationships between all training columns.
sns.pairplot(data=train_df)

# Let's find the correlations in test dataset

In [None]:
# Remove the Id column from the test table.
# errors='ignore' keeps the cell idempotent: re-running it after 'Id' has
# already been dropped would otherwise raise a KeyError.
test_df = test_df.drop(columns='Id', errors='ignore')

In [None]:
# Pairwise correlations of the test columns, drawn on a large canvas.
test_corr = test_df.corr()
plt.figure(figsize=(15, 10))
sns.heatmap(data=test_corr)

This is interesting: we see many more correlations between the features in the test dataset.

# Modelling and Evaluation

As the data is already processed, we can proceed with modelling. I'm not doing any scaling because I will be using tree-based and gradient-boosting methods, which are insensitive to feature scale.

Splitting the dataset

In [None]:
# Separate the target from the features, then hold out 30% of the rows for
# evaluation with a fixed seed.
y = train_df['Pawpularity']
X = train_df.drop(columns='Pawpularity')
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

# Gradient Boosting Regressor

In [None]:
# Baseline: scikit-learn gradient boosting with default hyper-parameters.
# random_state is pinned (the same seed the train/test split uses) so the
# hold-out score is reproducible after a kernel restart.
gbr = GradientBoostingRegressor(random_state=42)
gbr.fit(X_train, y_train)
y_pred = gbr.predict(X_test)

In [None]:
# Despite its name, `mse` holds the ROOT mean squared error; the name is kept
# because the summary cell reads it. The root is taken explicitly because the
# `squared=False` keyword was deprecated in scikit-learn 1.4 and removed in 1.6.
mse = np.sqrt(mean_squared_error(y_test, y_pred))
print(mse)

# XGBOOST

In [None]:
# XGBoost with default hyper-parameters, scored on the hold-out split.
xgb = XGBRegressor()
xgb.fit(X_train, y_train)
y_pred = xgb.predict(X_test)
# `mse1` is an RMSE. The root is taken explicitly because `squared=False`
# was deprecated in scikit-learn 1.4 and removed in 1.6.
mse1 = np.sqrt(mean_squared_error(y_test, y_pred))
print(mse1)

# Light Gradient Boosting Regressor

In [None]:
# LightGBM with default hyper-parameters, scored on the hold-out split.
lgbm = LGBMRegressor()
lgbm.fit(X_train, y_train)
y_pred = lgbm.predict(X_test)
# `mse2` is an RMSE. The root is taken explicitly because `squared=False`
# was deprecated in scikit-learn 1.4 and removed in 1.6.
mse2 = np.sqrt(mean_squared_error(y_test, y_pred))
print(mse2)

# CatBoost Regressor

In [None]:
# CatBoost with default hyper-parameters, scored on the hold-out split.
# verbose=0 silences the per-iteration training log, which would otherwise
# flood the cell output.
catboost = CatBoostRegressor(verbose=0)
catboost.fit(X_train, y_train)
y_pred = catboost.predict(X_test)
# `mse3` is an RMSE. The root is taken explicitly because `squared=False`
# was deprecated in scikit-learn 1.4 and removed in 1.6.
mse3 = np.sqrt(mean_squared_error(y_test, y_pred))

In [None]:
# Hold-out score of the CatBoost model computed in the previous cell.
print(mse3)

Okay, I'm not satisfied by the results. So let's try something else. Decision Trees.

DECISION TREES

In [None]:
# Single decision tree with default hyper-parameters, scored on the hold-out
# split. random_state is pinned so the result is reproducible between runs.
tree_reg = DecisionTreeRegressor(random_state=42)
tree_reg.fit(X_train, y_train)
y_pred = tree_reg.predict(X_test)
# `mse4` is an RMSE. The root is taken explicitly because `squared=False`
# was deprecated in scikit-learn 1.4 and removed in 1.6.
mse4 = np.sqrt(mean_squared_error(y_test, y_pred))
print(mse4)

Summary of the scores

In [None]:
# One record holding the hold-out score of every model fitted above.
results = [
    {
        "Gradient Boosting": mse,
        'XGBoost': mse1,
        'LGBMRegression': mse2,
        'CatBoostRegressor': mse3,
        'Decision Tree': mse4,
    }
]

In [None]:
# Turn the score record into a one-row table (one column per model) and display it.
results_df = pd.DataFrame(results)
results_df

As we have seen, the best models are:
1. Gradient Boosting Regressor
2. LGBM Regression
3. CatBoost Regression

Using these three models, we will build a stacked regressor with scikit-learn's `StackingRegressor`: LGBM and CatBoost are the base estimators, and Gradient Boosting is the final estimator.

https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.StackingRegressor.html

In [None]:
# Base learners for the stacked model, as (name, estimator) pairs.
# verbose=0 on CatBoost: stacking refits each base estimator several times
# (a full fit plus cross-validation folds), so the default per-iteration
# log would flood the notebook output.
estimators = [
    ('LGBM', LGBMRegressor()),
    ('catboost', CatBoostRegressor(verbose=0)),
]

In [None]:
# Stack the base learners; a seeded gradient-boosting model combines their
# predictions.
meta_model = GradientBoostingRegressor(random_state=42)
reg = StackingRegressor(estimators=estimators, final_estimator=meta_model)

In [None]:
# Fit the stacked model on the training split, then display its score on the
# hold-out split.
reg.fit(X_train, y_train)
reg.score(X_test, y_test)

In [None]:
# Hold-out predictions from the stacked model.
y_pred = reg.predict(X_test)

In [None]:
# `MSE` holds the ROOT mean squared error of the stacked model. The root is
# taken explicitly because the `squared=False` keyword was deprecated in
# scikit-learn 1.4 and removed in 1.6.
MSE = np.sqrt(mean_squared_error(y_test, y_pred))
print(MSE)

Slightly better. Gradient Boosting Regressor had RMSE of 21.1146.

Let's predict on test dataset.

In [None]:
# Predict on the competition test table.
# NOTE(review): assumes test_df has exactly the feature columns of X (i.e. 'Id' already dropped) — confirm.
Y_pred = reg.predict(test_df)

In [None]:
# Display the test-set predictions.
Y_pred

Submission file

In [None]:
# Load the sample submission to use as the template for the output file.
submission_file = pd.read_csv('../input/petfinder-pawpularity-score/sample_submission.csv')

In [None]:
# Overwrite the template's scores with the model predictions.
# NOTE(review): assumes the sample submission rows are in the same order as test.csv — confirm.
submission_file['Pawpularity'] = Y_pred

In [None]:
# Display the filled-in submission table.
submission_file

In [None]:
# Write the submission without the index column, then show its first rows.
submission_file.to_csv("submission.csv", index=False)
submission_file.head()