In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
import seaborn as sns
import time
import math
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn import tree
from sklearn.tree import DecisionTreeClassifier
from sklearn.experimental import enable_hist_gradient_boosting
from sklearn.metrics import mean_squared_error

# Seed passed as `random_state` to the train_test_split calls below so the splits are repeatable.
random_state = 6

In [None]:
# Folder that holds the competition CSV files.
path = '../input/petfinder-pawpularity-score/'

# Read the training and test tables from that folder in one pass.
train_df, test_df = (
    pd.read_csv(path + csv_name) for csv_name in ('train.csv', 'test.csv')
)

In [None]:
# Display the training table that was just loaded.
train_df

In [None]:
# Histogram of every Pawpularity score in the training table (20 bins).
paw_list = train_df["Pawpularity"].tolist()
plt.hist(paw_list, bins=20)

# Rows scoring 95 or more, and their scores as a plain list for the next cell.
top_paws = train_df.loc[train_df["Pawpularity"].ge(95)]
top_paws_list = top_paws["Pawpularity"].tolist()

In [None]:
# Histogram of the scores that are 95 or higher (top_paws_list is built in the previous cell).
plt.hist(top_paws_list)

In [None]:
# Rows whose Pawpularity is exactly 100; the frame is shown as the cell output.
top_paws_high = train_df.loc[train_df["Pawpularity"].eq(100)]
top_paws_high

In [None]:
# Features are every column except the identifier and the target; the target is Pawpularity.
X = train_df.drop(columns=['Id', 'Pawpularity'])
y = train_df['Pawpularity']

# Hold out 20% of the rows for evaluation, seeded so the split is repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=random_state,
)

In [None]:
# Fit a shallow decision tree on the training split, time the fit, and report RMSE on the hold-out split.
# NOTE(review): Pawpularity is used as a class label here (DecisionTreeClassifier) while the metric
# below is RMSE computed on the predicted labels - confirm that a classifier is intended.
# The notebook-wide seed is passed to the tree as well: scikit-learn permutes the features at every
# split, so without a fixed seed tied splits can resolve differently from run to run.
tree_clf = DecisionTreeClassifier(max_depth = 3, min_samples_split = 10, random_state=random_state)

# perf_counter is a monotonic clock meant for measuring short durations.
start = time.perf_counter()
tree_clf.fit(x_train, y_train)
stop = time.perf_counter()

tree_clf_pred = tree_clf.predict(x_test)
print(f'Training time: {round((stop - start),3)} seconds')
tree_clf_RMSE = math.sqrt(mean_squared_error(y_test, tree_clf_pred))
print(f'tree_clf_RMSE: {round(tree_clf_RMSE,3)}')

In [None]:
def ActualvPredictionsGraph(y_test,y_pred, title):
    """Scatter-plot actual and predicted values against their position.

    Both inputs are drawn on one 12x3 figure, each against the x coordinates
    0..len-1: actual values in orange, predictions in green. No axis limits
    are set, so matplotlib autoscales to the data.

    Parameters
    ----------
    y_test : sized sequence of numbers
        Actual target values.
    y_pred : sized sequence of numbers
        Predicted values. They are plotted against their own positions, so
        they are presumably expected to be in the same order as ``y_test``.
    title : str
        Title shown above the plot.

    Returns
    -------
    None
        The figure is rendered with ``plt.show()``.
    """
    plt.figure(figsize=(12,3))
    plt.scatter(range(len(y_test)), y_test, color='orange')
    plt.scatter(range(len(y_pred)), y_pred, color='green')
    plt.xlabel('Index ')
    plt.ylabel('Pawpularity ')
    plt.title(title,fontdict = {'fontsize' : 15})
    # Explicit patches keep the legend order (prediction, then actual) independent of draw order.
    plt.legend(handles = [mpatches.Patch(color='green', label='prediction'),mpatches.Patch(color='orange', label='actual')])
    plt.show()

# Actual vs. predicted scatter plots: the first 50 hold-out rows, then the whole hold-out set.
ActualvPredictionsGraph(y_test[:50], tree_clf_pred[:50], "First 50 Actual v. Predicted")
ActualvPredictionsGraph(y_test, tree_clf_pred, "All Actual v. Predicted")

# Overlay the distribution of the predictions (green) on that of the actual scores (orange).
plt.figure(figsize=(12, 4))
sns.histplot(
    tree_clf_pred,
    color='green',
    alpha=0.3,
    stat='probability',
    kde=True,
)
sns.histplot(
    y_test,
    color='orange',
    alpha=0.3,
    stat='probability',
    kde=True,
)
plt.legend(labels=['prediction', 'actual'])
plt.title('Actual v Predict Distribution')
plt.show()

In [None]:
def predictPaw(model, in_df):
    """Return whatever ``model.predict`` produces for ``in_df``.

    Parameters
    ----------
    model : object
        Any object exposing a ``predict`` method.
    in_df : object
        Input handed to ``model.predict`` unchanged.
    """
    predictions = model.predict(in_df)
    return predictions

def createInput(focus, eyes, face, near, action, acc, group, collage, human, occ, info, blur):
    """Build a one-row DataFrame from twelve individual feature values.

    Each argument becomes one column, in the order the arguments are given:
    "Subject Focus", "Eyes", "Face", "Near", "Action", "Accessory", "Group",
    "Collage", "Human", "Occlusion", "Info", "Blur".

    The calls in this notebook pass 0/1 values; the function itself does not
    validate them.

    Returns
    -------
    pandas.DataFrame
        A frame with exactly one row and the twelve columns listed above.
    """
    column_names = (
        "Subject Focus", "Eyes", "Face", "Near", "Action", "Accessory",
        "Group", "Collage", "Human", "Occlusion", "Info", "Blur",
    )
    values = (focus, eyes, face, near, action, acc, group, collage, human, occ, info, blur)
    # Wrap every value in a one-element list so the frame has a single row.
    single_row = {name: [value] for name, value in zip(column_names, values)}
    return pd.DataFrame(data=single_row)

def findRange(arr):
    """Return the spread (largest value minus smallest value) of ``arr``.

    The input is copied into a list once, so one-shot iterables such as
    generators work too; calling ``max`` and ``min`` directly on a generator
    would exhaust it on the first call and fail on the second.

    Parameters
    ----------
    arr : iterable of mutually comparable values that support subtraction

    Returns
    -------
    The difference ``max - min``.

    Raises
    ------
    ValueError
        If ``arr`` contains no values.
    """
    values = list(arr)
    if not values:
        raise ValueError("findRange() arg is an empty sequence")
    return max(values) - min(values)

In [None]:
# Prediction for a single row in which every feature value is 0.
predictPaw(
    tree_clf,
    createInput(
        focus=0, eyes=0, face=0, near=0, action=0, acc=0,
        group=0, collage=0, human=0, occ=0, info=0, blur=0,
    ),
)

In [None]:
# Prediction for a single row with face, action, group and human set to 1 and everything else 0.
predictPaw(
    tree_clf,
    createInput(
        focus=0, eyes=0, face=1, near=0, action=1, acc=0,
        group=1, collage=0, human=1, occ=0, info=0, blur=0,
    ),
)

In [None]:
# Training rows with a Pawpularity below 100; the filtered frame is shown as the cell output.
new_train_df = train_df.loc[train_df['Pawpularity'].lt(100)]
new_train_df

In [None]:
# Histogram of the scores that remain after the score-100 rows were removed (20 bins).
paw_list = new_train_df["Pawpularity"].tolist()
plt.hist(paw_list, bins=20)

In [None]:
# Split the complete training table with the same seed as the first experiment, so the hold-out
# rows are identical and the two RMSE values can be compared directly. Splitting the already
# filtered frame would also remove the score-100 rows from the hold-out set, scoring the second
# model on an easier set than the first.
y = train_df['Pawpularity']
X = train_df.drop(['Id','Pawpularity'],axis=1)
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state=random_state)

# Remove the score-100 rows from the training portion only. new_train_df keeps train_df's
# index labels, so membership in its index identifies the rows that survived the filter.
keep_for_training = x_train.index.isin(new_train_df.index)
x_train = x_train[keep_for_training]
y_train = y_train[keep_for_training]

In [None]:
# Refit the same shallow decision tree on the current training split, time the fit, and report
# RMSE on the current hold-out split. This replaces the earlier `tree_clf` and its predictions.
# NOTE(review): Pawpularity is used as a class label here (DecisionTreeClassifier) while the metric
# below is RMSE computed on the predicted labels - confirm that a classifier is intended.
# The notebook-wide seed is passed to the tree as well: scikit-learn permutes the features at every
# split, so without a fixed seed tied splits can resolve differently from run to run.
tree_clf = DecisionTreeClassifier(max_depth = 3, min_samples_split = 10, random_state=random_state)

# perf_counter is a monotonic clock meant for measuring short durations.
start = time.perf_counter()
tree_clf.fit(x_train, y_train)
stop = time.perf_counter()

tree_clf_pred = tree_clf.predict(x_test)
print(f'Training time: {round((stop - start),3)} seconds')
tree_clf_RMSE = math.sqrt(mean_squared_error(y_test, tree_clf_pred))
print(f'tree_clf_RMSE: {round(tree_clf_RMSE,3)}')

In [None]:
# Actual vs. predicted scatter plots for the refitted tree: first 50 hold-out rows, then all of them.
ActualvPredictionsGraph(y_test[:50], tree_clf_pred[:50], "First 50 Actual v. Predicted")
ActualvPredictionsGraph(y_test, tree_clf_pred, "All Actual v. Predicted")

# Overlay the distribution of the predictions (green) on that of the actual scores (orange).
plt.figure(figsize=(12, 4))
sns.histplot(
    tree_clf_pred,
    color='green',
    alpha=0.3,
    stat='probability',
    kde=True,
)
sns.histplot(
    y_test,
    color='orange',
    alpha=0.3,
    stat='probability',
    kde=True,
)
plt.legend(labels=['prediction', 'actual'])
plt.title('Actual v Predict Distribution')
plt.show()

In [None]:
# Prediction of the refitted tree for a single row in which every feature value is 0.
predictPaw(
    tree_clf,
    createInput(
        focus=0, eyes=0, face=0, near=0, action=0, acc=0,
        group=0, collage=0, human=0, occ=0, info=0, blur=0,
    ),
)

In [None]:
# Prediction of the refitted tree for a row with face, action, group and human set to 1, the rest 0.
predictPaw(
    tree_clf,
    createInput(
        focus=0, eyes=0, face=1, near=0, action=1, acc=0,
        group=1, collage=0, human=1, occ=0, info=0, blur=0,
    ),
)

In [None]:
# Re-read the competition test set.
test_df = pd.read_csv('../input/petfinder-pawpularity-score/test.csv')

# 'Id' is not a model feature, so it is dropped before predicting
# (per the original note, the test file has no Pawpularity column to remove).
x_test_submission = test_df.drop(columns=['Id'])

# Predict with the most recently fitted model (tree_clf) and store the result on test_df.
test_df['Pawpularity'] = tree_clf.predict(x_test_submission)

# The submission keeps only the identifier and the predicted score.
submission_df = test_df.loc[:, ['Id', 'Pawpularity']]
submission_df.to_csv("submission.csv", index=False)
submission_df.head()