# Introduction

## Data Preview

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

%matplotlib inline

In [None]:
# Read the training CSV into `df` and preview its first rows as the cell output.
train_csv_path = "../input/petfinder-pawpularity-score/train.csv"
df = pd.read_csv(train_csv_path)
df.head()

In [None]:
# Print the column dtypes and non-null counts of the training table.
df.info()

In [None]:
# Read the test CSV (the rows to be predicted for submission) and preview it.
test_csv_path = "../input/petfinder-pawpularity-score/test.csv"
submission_df = pd.read_csv(test_csv_path)
submission_df.head()

# Data Exploration

## Create image width/height/size features

In [None]:
from PIL import Image
import os

# Record the pixel dimensions of every training image, in the same row order
# as `df`, so the result can later be joined to the metadata column-wise.
TRAIN_IMAGE_DIR = '../input/petfinder-pawpularity-score/train'

size_data = []
for idt in df['Id']:
    p = os.path.join(TRAIN_IMAGE_DIR, idt + '.jpg')
    # Image.open is lazy and keeps the underlying file open; the context
    # manager releases the handle as soon as the size has been read, instead
    # of leaving one open file per training image until garbage collection.
    with Image.open(p) as image:
        width, height = image.size
    # `size` is the pixel count (width * height), not the file size on disk.
    size_data.append([width, height, width*height])

size_df = pd.DataFrame(size_data, columns=['width', 'height', 'size'])

In [None]:
# Append the image-size columns to the metadata table. The concat aligns on the
# index; this assumes both frames still carry their default 0..n-1 index so that
# rows pair up one-to-one.
result_df = pd.concat(objs=[df, size_df], axis="columns")

## Data distribution

In [None]:
# Box plot of the target column to show its spread and outliers.
fig, ax = plt.subplots()
ax.boxplot(df["Pawpularity"])
ax.set(title='Label Plot')
plt.show()

In [None]:
import seaborn as sns
# For every metadata column (everything between the first and the last column),
# draw the distribution of the last column side by side for rows where the
# flag equals 0 (left panel) and 1 (right panel).
for col in df.columns[1:-1]:
    fig, axs = plt.subplots(figsize=(12, 4), ncols=2)
    for flag_value, panel in enumerate(axs):
        flag_mask = df[col].to_numpy() == flag_value
        sns.histplot(df.iloc[flag_mask, -1], ax=panel).set_title(f"{col}={flag_value}")

## Metadata correlation

In [None]:
from matplotlib.pyplot import figure
import seaborn as sns

# Heatmap of the pairwise correlations between the metadata flags, the
# image-size features and the target. Column 0 (the string Id) is excluded
# because it is not numeric.
fig, ax = plt.subplots(figsize=(12, 10))
corr_data = result_df.iloc[:, 1:].corr()
# Draw on the axes created above explicitly rather than relying on the
# implicit "current axes" of the pyplot state machine.
sns.heatmap(corr_data, ax=ax)
ax.set_title('Metadata correlation')
plt.show()

## Performance of a basic model

In [None]:
# Fraction of rows (taken from the top of the table) used for training; the
# remaining tail is held out for evaluation. The split is positional, with no
# shuffling, so it is deterministic across runs.
TRAIN_FRACTION = 0.95
# Scores strictly above this value form the positive class of the boolean target.
POPULAR_THRESHOLD = 60

# Plain positional slicing; np.split is meant for arrays and emits deprecation
# warnings when handed a DataFrame on recent pandas versions.
split_at = int(TRAIN_FRACTION * len(df))
train_data, test_data = df.iloc[:split_at], df.iloc[split_at:]

# Features are every column between the first (Id) and the last (target).
X_train = train_data.iloc[:, 1:-1]
y_train = train_data.iloc[:, -1]
X_test = test_data.iloc[:, 1:-1]
y_test = test_data.iloc[:, -1]

# Both splits must use the same comparison. The test labels were previously
# built with `<`, which inverted the positive class and made the test F1
# incomparable with the training F1.
y_train_bool = y_train > POPULAR_THRESHOLD
y_test_bool = y_test > POPULAR_THRESHOLD

In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier

def create_model():
    """Return a new, unfitted random-forest regressor.

    The forest uses 20 trees, a maximum depth of 8 and a fixed random seed so
    that repeated runs give the same model.
    """
    forest_params = {"n_estimators": 20, "max_depth": 8, "random_state": 1}
    return RandomForestRegressor(**forest_params)

def create_classification_model():
    """Return a new, unfitted random-forest classifier.

    The forest uses 20 trees, a maximum depth of 8 and a fixed random seed so
    that repeated runs give the same model.
    """
    # Alternative estimators left disabled in this notebook:
    #   MLPClassifier(hidden_layer_sizes=(8, 8, 8), max_iter=50000, random_state=1)
    #   GradientBoostingClassifier(n_estimators=300, learning_rate=0.5, max_depth=1, max_features=8, random_state=1)
    # NOTE(review): GradientBoostingClassifier is not imported in the visible cells.
    forest_params = {"n_estimators": 20, "max_depth": 8, "random_state": 1}
    return RandomForestClassifier(**forest_params)

In [None]:
# Fit the regressor on the raw numeric target.
regressor = create_model()
regressor.fit(X_train, y_train)

# Fit the classifier on the thresholded (boolean) target. Being the last
# expression of the cell, the fitted estimator is also shown as the cell output.
classifier = create_classification_model()
classifier.fit(X_train, y_train_bool)

In [None]:
from sklearn.metrics import mean_squared_error, r2_score, f1_score

# Regressor: mean squared error and R^2 on the training and held-out rows.
train_predictions = regressor.predict(X_train)
test_predictions = regressor.predict(X_test)
train_mse = mean_squared_error(y_train, train_predictions)
test_mse = mean_squared_error(y_test, test_predictions)
train_r2 = r2_score(y_train, train_predictions)
test_r2 = r2_score(y_test, test_predictions)

# Classifier: F1 score against the boolean targets of each split.
train_predictions_bool = classifier.predict(X_train)
test_predictions_bool = classifier.predict(X_test)
train_f1 = f1_score(y_train_bool, train_predictions_bool)
test_f1 = f1_score(y_test_bool, test_predictions_bool)

# One summary line per split.
print("train_mse: ", train_mse, ", train_r2: ", train_r2, ", train_f1: ", train_f1)
print("test_mse: ", test_mse, ", test_r2: ", test_r2, ", test_f1: ", test_f1)

In [None]:
# Impurity-based feature importances of the fitted regressor, with the
# standard deviation across the individual trees drawn as error bars.
importances = regressor.feature_importances_
per_tree_importances = [tree.feature_importances_ for tree in regressor.estimators_]
std = np.std(per_tree_importances, axis=0)

# Label each importance with the name of the feature column it belongs to.
forest_importances = pd.Series(importances, index=X_train.columns)

fig, ax = plt.subplots()
forest_importances.plot.bar(yerr=std, ax=ax)
ax.set(title="Feature importances using MDI", ylabel="Mean decrease in impurity")
fig.tight_layout()