# Import Modules

In [None]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt 
import seaborn as sns 
from sklearn.model_selection import train_test_split 
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
from xgboost import XGBRegressor 
import os 
import shap 
# Apply seaborn's default plot theme to the matplotlib figures drawn below
sns.set()

# Import Data

In [None]:
# Load the training set. read_csv already returns a DataFrame, so it is used directly.
data = pd.read_csv("/kaggle/input/petfinder-pawpularity-score/train.csv")

# Name the regression target and the columns kept out of the feature matrix
target = "Pawpularity"
drop_cols = ["Id", target]  # "Id" is an identifier, not a predictive column

# Split into features (X) and target (Y)
Y = data[target]
X = data.drop(columns=drop_cols)

# Perform a cross-validation to verify metrics

## Instantiate Metrics Dictionaries

In [None]:
# Map each metric's display name to the sklearn function that computes it.
# Every function is later called as fn(y_true, y_pred).
scores = {
    "R2": r2_score,
    "mae": mean_absolute_error,
    "mse": mean_squared_error,
}

# One empty list per metric; the cross-validation loop appends one value per round
metrics = {score: [] for score in scores}

## Cross Validation - 5 repeated random 80/20 splits

In [None]:
N = 5  # number of independent train/validation rounds
plotting = False  # set True to scatter predicted vs. actual values for each round
for k in range(N):

    # Random 80/20 split. Seeding with the round index keeps the reported
    # metrics reproducible while still giving every round a different split.
    trainX, validateX, trainY, validateY = train_test_split(
        X, Y, test_size=0.2, random_state=k
    )

    # Fit a fresh model with default hyperparameters on this round's training rows
    model = XGBRegressor()
    model.fit(trainX, trainY)

    # Predict the held-out rows
    predictions = model.predict(validateX)

    # Record every metric for this round
    for score, score_fn in scores.items():
        metrics[score].append(score_fn(validateY, predictions))

    if plotting:
        plt.scatter(predictions, validateY)
        plt.xlabel("Predicted")
        plt.ylabel("Actual")
        plt.show()

## Average cross-validation scores

In [None]:
# Average each metric over all cross-validation rounds, print it, and keep it for later use
average_scores = {}
for score in scores:
    avg = np.mean(metrics[score])
    print(f"{score}: {avg}")
    average_scores[score] = avg  # avg is already the mean; no second np.mean needed
    

# Now that scores are verified, train overall model

In [None]:
# Final model: default-parameter XGBoost regressor fitted on the full training set
# (no rows held out). fit() returns the estimator itself, so the call can be chained.
model = XGBRegressor().fit(X, Y)

# See which features are most important using Shapley values

This answers the question: "On average, how much does each feature influence the predicted 'Pawpularity'?"

In [None]:
# Explain the fitted model: one SHAP value per (row, feature) of the training features
explainer = shap.TreeExplainer(model)
shap_values = explainer.shap_values(X)
shap_values = pd.DataFrame(shap_values, columns=X.columns)

# Global importance is the mean *absolute* SHAP value per feature. Averaging the
# signed values lets positive and negative contributions cancel each other out,
# which understates how strongly a feature moves the prediction.
mean_vals = shap_values.abs().mean().to_dict()

plt.barh(list(mean_vals.keys()), list(mean_vals.values()))
plt.xlabel("mean(|SHAP value|)")
plt.tight_layout()
plt.show()

# Predict test data

In [None]:
# Load the test set. read_csv already returns a DataFrame, so it is used directly.
test_data = pd.read_csv("/kaggle/input/petfinder-pawpularity-score/test.csv")

# Drop the identifier column; it is kept out of the model's input features
test_drop_cols = ["Id"]
to_predict = test_data.drop(columns=test_drop_cols)

## Create set of predictions

In [None]:
# Score the test features with the model fitted on the full training set
predictions = model.predict(to_predict)

# Empty frame that the submission cell fills with the Id and Pawpularity columns
submission = pd.DataFrame()

# Check Shapley values on test dataset

In [None]:
# Explain the model's predictions on the test features: one SHAP value per (row, feature)
explainer = shap.TreeExplainer(model)
test_shap_values = explainer.shap_values(to_predict)
test_shap_values = pd.DataFrame(test_shap_values, columns=to_predict.columns)

# Mean absolute SHAP value per feature. The average must be taken over the SHAP
# values (test_shap_values), not over the raw feature columns in to_predict;
# the absolute value stops positive and negative contributions from cancelling.
test_mean_vals = test_shap_values.abs().mean().to_dict()

plt.barh(list(test_mean_vals.keys()), list(test_mean_vals.values()))
plt.xlabel("mean(|SHAP value|)")
plt.tight_layout()
plt.show()

# Submission

In [None]:
# Attach the test identifiers and their predicted scores to the submission frame
submission = submission.assign(Id=test_data["Id"], Pawpularity=predictions)

# Write the two columns to disk without the row index
submission.to_csv("submission.csv", index=False)