In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input tree and print the full path of every file found.
input_paths = (
    os.path.join(root, name)
    for root, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# ライブラリのインポート

In [None]:
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from xgboost import XGBRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import AdaBoostRegressor, BaggingRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

# データの確認

In [None]:
# Training table: read it, print the number of rows, then preview the first rows.
meta_data = pd.read_csv('../input/petfinder-pawpularity-score/train.csv')
print(meta_data.shape[0])
meta_data.head()

In [None]:
# Test table: the rows that are predicted for the submission file.
test = pd.read_csv('../input/petfinder-pawpularity-score/test.csv')
# Report the size and a preview of the test table itself (not the training table).
print(len(test))
test.head()

In [None]:
# Hold out part of the training table for validation. The 'Id' column and the
# 'Pawpularity' target are removed from the feature matrix.
# random_state is fixed so that "Restart & Run All" reproduces the same split,
# and therefore the same scores in every cell below.
x_train, x_test, y_train, y_test = train_test_split(
    meta_data.drop(columns=['Pawpularity', 'Id']),
    meta_data['Pawpularity'],
    random_state=0,
)

y_test.head()

In [None]:
def reg_score(real, pred, tol=2):
    """Return the fraction of predictions that fall within ``tol`` of the true value.

    A prediction ``p`` counts as a hit for the true value ``r`` when
    ``r - tol <= p <= r + tol`` (both bounds inclusive).

    Parameters
    ----------
    real : sequence of numbers
        True target values. Its length is the denominator of the score.
    pred : sequence of numbers
        Predicted values, paired with ``real`` element by element.
    tol : number, default 2
        Allowed absolute deviation in either direction.

    Returns
    -------
    float
        Hits divided by ``len(real)``; ``nan`` when ``real`` is empty, because
        the fraction is undefined without samples.
    """
    total = len(real)
    if total == 0:
        # Avoid ZeroDivisionError on an empty validation set.
        return float("nan")
    hits = sum(1 for r, p in zip(real, pred) if r - tol <= p <= r + tol)
    return hits / total

# 学習
色々な回帰モデル

In [None]:
# Candidate regressors, keyed by the label used as the row name of the score table.
# Every estimator that accepts a seed is pinned to 0 so reruns give the same scores.
models = {
    'DecisionTree': DecisionTreeRegressor(random_state=0),
    'AdaBoost': AdaBoostRegressor(DecisionTreeRegressor(random_state=0), random_state=0),
    'Bagging': BaggingRegressor(DecisionTreeRegressor(random_state=0), n_estimators=100, random_state=0),
    'Bagging & AdaBoost': AdaBoostRegressor(
        BaggingRegressor(DecisionTreeRegressor(random_state=0), n_estimators=100, random_state=0),
        random_state=0,
    ),
    'RandomForest': RandomForestRegressor(random_state=0),
    'GradientBoost': GradientBoostingRegressor(random_state=0),
    # The `normalize` argument was removed from LinearRegression in scikit-learn 1.2
    # (passing it raises TypeError). Mathematically it does not change ordinary
    # least-squares predictions, so it is simply dropped.
    'LinearRegression': LinearRegression(),
    'XGB': XGBRegressor(n_estimators=1000, max_depth=6, eta=0.1, subsample=0.7,
                        colsample_bytree=0.8, random_state=0),
}

# scores maps (model label, metric name) -> value; unstack() turns it into a table
# with one row per model and one column per metric.
scores = {}
for model_name, model in models.items():
    model.fit(x_train, y_train)
    predict = model.predict(x_test)
    # Take the square root by hand: the `squared=False` shortcut of
    # mean_squared_error is deprecated in recent scikit-learn releases.
    rmse = np.sqrt(mean_squared_error(y_test, predict))
    scores[(model_name, 'train_score')] = model.score(x_train, y_train)
    scores[(model_name, 'test_score')] = model.score(x_test, y_test)
    scores[(model_name, 'reg_score')] = reg_score(y_test, predict)
    # The value is a root-mean-squared error, so it is labelled 'rmse' rather than 'mse'.
    scores[(model_name, 'rmse')] = rmse

pd.Series(scores).unstack()

# アンサンブル学習
アンサンブルの基本は多数決で、複数のモデルの予測結果を統合する。\
ひとつのモデルだと間違えてしまう場合でも、他のモデルと補い合うことで間違い（予測ミス）を減らそうという考え方。\
クラスを予測する分類問題では多数決、数値を直接予測する回帰問題では平均をとるのが一般的。
# 基本的なアンサンブル学習手法
1. Max Voting（最大値投票アンサンブル）
1. Weighted Average Voting（加重平均投票）


In [None]:
# Voting ensemble for regression: VotingRegressor fits a clone of each base
# estimator and averages their predictions (unweighted, since no `weights` are passed).
from sklearn.ensemble import VotingRegressor

reg1 = models['GradientBoost']
reg2 = models['RandomForest']
reg3 = models['LinearRegression']
# Each label names the estimator it is paired with ('gb' is GradientBoosting, not XGBoost).
ereg = VotingRegressor(estimators=[('gb', reg1), ('rf', reg2), ('lr', reg3)])

ereg = ereg.fit(x_train, y_train)

# Compute the degrees-of-freedom adjusted R².
def adjusted_r2(X,Y,model):
    """Return the adjusted R² of ``model`` evaluated on ``(X, Y)``.

    The plain R² of ``model.predict(X)`` against ``Y`` is corrected for the
    number of features:

        1 - (1 - R²) * (n - 1) / (n - p - 1)

    where ``n = len(Y)`` and ``p = X.shape[1]``.

    Parameters
    ----------
    X : 2-D array-like with a ``shape`` attribute
        Feature matrix passed to ``model.predict``.
    Y : sequence
        True target values.
    model : object with a ``predict`` method
        Fitted estimator.

    Returns
    -------
    float
        The adjusted R², or ``nan`` when ``n - p - 1 <= 0`` because the
        correction is undefined without residual degrees of freedom.
    """
    from sklearn.metrics import r2_score

    n_samples = len(Y)
    n_features = X.shape[1]
    residual_dof = n_samples - n_features - 1
    if residual_dof <= 0:
        # Too few samples for this many features: avoid dividing by zero or
        # returning a sign-flipped, meaningless value.
        return float("nan")
    r_squared = r2_score(Y, model.predict(X))
    return 1 - (1 - r_squared) * (n_samples - 1) / residual_dof

# Compute several accuracy indicators for a fitted prediction model.
def get_model_evaluations(X_train,Y_train,X_test,Y_test,model):
    """Return a tuple of ``"label :value"`` strings describing ``model``'s accuracy.

    The tuple contains, in order: adjusted R² on the training data, adjusted R²
    on the test data, the mean error rate, MAE, median absolute error, RMSE and
    the RMSE / MAE ratio (the last five on the test data).

    Parameters
    ----------
    X_train, Y_train : training features and targets (used only for adjusted R²).
    X_test, Y_test : evaluation features and targets.
    model : fitted estimator exposing ``predict``.

    Notes
    -----
    The mean error rate is ``mean(|Y_test / yhat_test - 1|)``, i.e. relative to
    the prediction, so a prediction of exactly 0 makes it infinite.
    """
    from sklearn.metrics import mean_absolute_error
    from sklearn.metrics import mean_squared_error
    from sklearn.metrics import median_absolute_error

    # Reference for the indicators:
    # https://funatsu-lab.github.io/open-course-ware/basic-theory/accuracy-index/
    yhat_test = model.predict(X_test)
    # MAE and RMSE are each needed twice (on their own and in the ratio), so compute them once.
    mae = mean_absolute_error(Y_test, yhat_test)
    rmse = np.sqrt(mean_squared_error(Y_test, yhat_test))
    return (
        "adjusted_r2(train)     :" + str(adjusted_r2(X_train, Y_train, model)),
        "adjusted_r2(test)      :" + str(adjusted_r2(X_test, Y_test, model)),
        "平均誤差率(test)       :" + str(np.mean(abs(Y_test / yhat_test - 1))),
        "MAE(test)              :" + str(mae),
        "MedianAE(test)         :" + str(median_absolute_error(Y_test, yhat_test)),
        "RMSE(test)             :" + str(rmse),
        # For zero-mean Gaussian errors this ratio is sqrt(pi / 2) ≈ 1.253, so a
        # value near 1.253 is the desirable one.
        "RMSE(test) / MAE(test) :" + str(rmse / mae),
    )

# Show the evaluation strings of the fitted voting ensemble for the train and held-out splits.
get_model_evaluations(
    X_train=x_train,
    Y_train=y_train,
    X_test=x_test,
    Y_test=y_test,
    model=ereg,
)

# 提出用の学習器

In [None]:
# Put the estimator you want to submit on the right-hand side of this assignment.
SubmissionModel = RandomForestRegressor(random_state=0)
SubmissionModel.fit(x_train, y_train)
predict = SubmissionModel.predict(x_test)
print('test_score: ' + str(SubmissionModel.score(x_test, y_test)))
print('reg_score: ' + str(reg_score(y_test, predict)))
# Take the square root by hand (the `squared=False` shortcut of mean_squared_error
# is deprecated in recent scikit-learn releases) and label the value as what it is: an RMSE.
rmse = np.sqrt(mean_squared_error(y_test, predict))
print('rmse: ' + str(rmse))

In [None]:
submit = pd.read_csv('../input/petfinder-pawpularity-score/sample_submission.csv')
# Predict with SubmissionModel, the estimator selected and fitted for submission,
# instead of the voting ensemble. The test table carries an 'Id' column that is
# not a feature, so it is dropped before predicting.
test_features = test.drop(columns=['Id'])
submit['Pawpularity'] = SubmissionModel.predict(test_features)
submit.head(10)

# 提出

In [None]:
# Write the submission table to the working directory, leaving out the index column.
submit.to_csv(path_or_buf='submission.csv', index=False)