In [None]:
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_selection import mutual_info_regression

from xgboost import XGBRegressor
import pandas as pd

import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

import numpy as np

In [None]:
def make_mi_scores(X, y, random_state=0):
    """Rank the columns of ``X`` by mutual information with the target ``y``.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature table; its column names become the index of the result.
    y : array-like
        Regression target, one value per row of ``X``.
    random_state : int or None, default 0
        Forwarded to ``mutual_info_regression`` so repeated calls give the
        same scores.

    Returns
    -------
    pandas.Series
        Scores named "MI Scores", indexed by column name, sorted descending.
    """
    # `discrete_features` must describe the features (one flag per column), not
    # be the feature table itself. Passing the whole frame makes sklearn treat
    # the cell values as column indices. Integer and boolean columns are
    # flagged as discrete; everything else is treated as continuous.
    discrete_mask = np.array(
        [
            pd.api.types.is_integer_dtype(dtype) or pd.api.types.is_bool_dtype(dtype)
            for dtype in X.dtypes
        ],
        dtype=bool,
    )
    mi_scores = mutual_info_regression(
        X, y, discrete_features=discrete_mask, random_state=random_state
    )
    mi_scores = pd.Series(mi_scores, name="MI Scores", index=X.columns)
    return mi_scores.sort_values(ascending=False)


In [None]:
# CSV tables read by the loading cell (parsed with "Id" as the index).
train_dir = "../input/petfinder-pawpularity-score/train.csv"
test_dir = "../input/petfinder-pawpularity-score/test.csv"

# Sibling directories; presumably the image folders (confirm). They are not
# referenced by any other visible cell of this notebook.
train_dir_fold = "../input/petfinder-pawpularity-score/train"
test_dir_fold = "../input/petfinder-pawpularity-score/test"

In [None]:
# Load both tables keyed by the "Id" column.
X_test = pd.read_csv(test_dir, index_col="Id")
X = pd.read_csv(train_dir, index_col="Id")

# Split off the target and rescale it by 1/100; every remaining column stays
# in X as a feature.
Y = X["Pawpularity"].div(100)
X = X.drop(columns=["Pawpularity"])

In [None]:
# Per-row aggregate features for the training table, computed in one
# vectorized pass instead of a Python loop of per-row `.loc` lookups.
# Any previously derived "Sum"/"Mean" columns are excluded, so re-running this
# cell after they were attached to X does not fold them into the new totals.
_train_base = X.drop(columns=["Sum", "Mean"], errors="ignore")
sums = _train_base.sum(axis=1).tolist()    # one total per row, in X.index order
means = _train_base.mean(axis=1).tolist()  # one average per row, in X.index order

In [None]:
# Attach the per-row aggregates as two extra feature columns ("Sum" first).
X["Sum"], X["Mean"] = sums, means

In [None]:
# Same per-row aggregates for the test table, computed in one vectorized pass
# instead of a Python loop of per-row `.loc` lookups.
# Any previously derived "Sum"/"Mean" columns are excluded, so re-running this
# cell after they were attached to X_test does not fold them into the totals.
_test_base = X_test.drop(columns=["Sum", "Mean"], errors="ignore")
sums = _test_base.sum(axis=1).tolist()    # one total per row, in X_test.index order
means = _test_base.mean(axis=1).tolist()  # one average per row, in X_test.index order

In [None]:
# Attach the per-row aggregates to the test table, mirroring the columns added
# to the training features ("Sum" first, then "Mean").
X_test["Sum"], X_test["Mean"] = sums, means

In [None]:
# Bare last expression: renders the training feature frame (now including the
# "Sum" and "Mean" columns) as the cell output.
X

In [None]:
def plot_mi_scores(scores):
    """Draw a horizontal bar chart of mutual-information scores.

    ``scores`` is a pandas Series whose index holds the feature names. Bars are
    laid out in ascending score order, so the highest score gets the highest
    bar position. Draws through the pyplot state machine onto whichever figure
    is current, and returns nothing.
    """
    ordered = scores.sort_values(ascending=True)
    positions = np.arange(ordered.size)
    labels = ordered.index.tolist()
    plt.barh(positions, ordered)
    plt.yticks(positions, labels)
    plt.title("Mutual Information Scores")


# Score every feature except "Mean" against the target and chart the ranking
# on a fresh 8x5 inch figure.
plt.figure(figsize=(8, 5), dpi=100)
plot_mi_scores(make_mi_scores(X.drop(columns="Mean"), Y))

In [None]:
# Seeded hold-out split; with no size given, train_test_split applies its
# default train/validation proportions.
X_train, X_valid, Y_train, Y_valid = train_test_split(
    X,
    Y,
    random_state=0,
)
# Peek at the first ten validation targets.
print(Y_valid.head(10))

In [None]:
# Sweep positions 0..49 for the tree-count experiments below. Note the range
# starts at 0: the RandomForest sweep adds 1 to each value before using it.
n_estimators = range(50)

In [None]:
def score_estims(estim):
    """Validation MAE of an XGBoost regressor built with ``estim`` trees.

    Fits on the notebook-level ``X_train``/``Y_train`` and scores against
    ``X_valid``/``Y_valid``. NOTE: a later cell defines another function with
    this same name (RandomForest-based), which replaces this one once run.
    """
    regressor = XGBRegressor(n_estimators=estim, random_state=0)
    regressor.fit(X_train, Y_train)
    predictions = regressor.predict(X_valid)
    return mean_absolute_error(Y_valid, predictions)

def score_depth(depth):
    """Validation MAE of a 15-tree XGBoost regressor limited to ``depth`` levels.

    Fits on the notebook-level ``X_train``/``Y_train`` and scores against
    ``X_valid``/``Y_valid``.
    """
    regressor = XGBRegressor(n_estimators=15, max_depth=depth, random_state=0)
    regressor.fit(X_train, Y_train)
    predictions = regressor.predict(X_valid)
    return mean_absolute_error(Y_valid, predictions)

In [None]:
# Sweep the tree count for the XGBoost scorer. `n_estimators` is range(50),
# i.e. 0..49, so shift by one to evaluate 1..50 trees: a booster with zero
# trees is not a meaningful model. The shift also keeps list position p paired
# with p + 1 trees, the same convention the RandomForest sweep
# (estims_errors_2) uses later in the notebook.
estims_errors = [score_estims(tree_index + 1) for tree_index in n_estimators]

In [None]:
# Distribution of the validation errors collected by the XGBoost sweep.
sns.histplot(estims_errors)

In [None]:
def score_estims(estim):
    """Validation MAE of a random forest built with ``estim`` trees.

    Fits on the notebook-level ``X_train``/``Y_train`` and scores against
    ``X_valid``/``Y_valid``. NOTE: this reuses the name of the XGBoost-based
    scorer defined earlier in the notebook and replaces it from here on.
    """
    forest = RandomForestRegressor(n_estimators=estim, random_state=0)
    forest.fit(X_train, Y_train)
    predictions = forest.predict(X_valid)
    return mean_absolute_error(Y_valid, predictions)

In [None]:
# RandomForest sweep: each value of `n_estimators` is shifted up by one before
# use, so list position p holds the error for p + 1 trees. Relies on
# `score_estims` currently being the RandomForest version.
estims_errors_2 = [
    score_estims(tree_index + 1)
    for tree_index in n_estimators
]

In [None]:
# Validation error against sweep position for the RandomForest runs, on a
# wide 16x9 inch figure.
plt.figure(figsize=(16, 9))
sns.lineplot(data=estims_errors_2)

In [None]:
# Report the tree count(s) with the lowest validation error. Position p in
# estims_errors_2 corresponds to p + 1 trees, hence the 1-based enumeration.
# The minimum is computed once up front rather than on every iteration; the
# default keeps an empty sweep a silent no-op, as the loop alone would be.
best_error = min(estims_errors_2, default=None)
for tree_count, error in enumerate(estims_errors_2, start=1):
    if error == best_error:
        print(tree_count)

In [None]:
# Candidate tree depths passed one by one to score_depth in the next cell.
max_depths = [5, 10, 15, 20, 50, 100]

In [None]:
# Print the validation MAE for each candidate depth, in list order.
for depth in max_depths:
    depth_error = score_depth(depth)
    print(depth_error)

In [None]:
# Bare last expression: renders the training targets (already scaled by 1/100
# in the loading cell) as the cell output.
Y_train

In [None]:
# First ensemble member: a gradient-boosted regressor with 15 trees of depth
# at most 5. The fit call is left as the last expression so the notebook
# still shows its result.
model1 = XGBRegressor(
    n_estimators=15,
    max_depth=5,
    learning_rate=0.3,
)
model1.fit(X_train, Y_train, verbose=False)

In [None]:
# Second ensemble member: a 14-tree random forest. It is seeded with the same
# random_state=0 that the tree-count sweep (score_estims) used; unseeded, each
# run would grow a different forest, making the validation score and the
# submission non-reproducible and decoupling them from the sweep's results.
model2 = RandomForestRegressor(n_estimators=14, random_state=0)
model2.fit(X_train, Y_train)

In [None]:
class Model:
    """Two-member ensemble that averages its members' predictions equally."""

    def __init__(self, model1, model2):
        """Store the two estimators.

        Both must expose ``predict(X)``; this class never calls ``fit``, so
        they have to be ready for prediction when ``predict`` is used.
        """
        self.model1 = model1
        self.model2 = model2

    def predict(self, X):
        """Return the element-wise mean of both members' predictions for ``X``.

        Returns
        -------
        numpy.ndarray
            One averaged value per prediction.

        Raises
        ------
        ValueError
            If the two members return predictions of different shapes.
        """
        pred1 = np.asarray(self.model1.predict(X))
        pred2 = np.asarray(self.model2.predict(X))
        # Average both prediction vectors in one vectorized step instead of
        # building a two-element array per sample in a Python loop.
        return np.mean(np.stack([pred1, pred2]), axis=0)

In [None]:
# Combine the two fitted regressors into the averaging ensemble.
model = Model(model1=model1, model2=model2)

In [None]:
# Ensemble MAE on the hold-out split, multiplied by 100 to undo the 1/100
# scaling applied to the target in the loading cell.
pred = model.predict(X_valid)
100 * mean_absolute_error(Y_valid, pred)

In [None]:
# Ensemble predictions for the test table (still on the target's 1/100 scale).
pred_test = model.predict(X_test)

In [None]:
# Build the submission table. Predictions are on the 1/100 scale used for
# training, so multiply back by 100. Round to the nearest integer before the
# cast: a bare astype(int64) truncates toward zero and would push every
# prediction downward by up to one point.
df = pd.DataFrame({
    "Id": X_test.index,
    "Pawpularity": np.rint(pred_test * 100).astype(np.int64),
})

In [None]:
# Bare last expression: renders the submission table as the cell output.
df

In [None]:
# Write the submission file into the working directory; index=False keeps the
# positional row index out of the CSV, since "Id" is already a column.
df.to_csv("submission.csv", index=False)