In [None]:
import os
# Walk the Kaggle input directory and print the full path of every file in it.
for root_dir, _, file_names in os.walk('/kaggle/input'):
    for file_name in file_names:
        print(os.path.join(root_dir, file_name))

In [None]:
import pandas as pd
import numpy as np
from sklearn.metrics import classification_report
from sklearn import metrics
import matplotlib.pyplot as plt
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import Lasso, ElasticNet
from sklearn.ensemble import RandomForestRegressor
import lightgbm as lgb
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import accuracy_score, mean_squared_error, r2_score
import seaborn as sb
import xgboost as xgb

In [None]:
# Load the train and test CSVs (they are concatenated in a later cell).
train, test = (
    pd.read_csv(csv_path)
    for csv_path in (
        "/kaggle/input/restaurantrevenue/train.csv",
        "/kaggle/input/restaurantrevenue/test.csv",
    )
)


In [None]:
# Collects model name -> score; filled in by the model cells further down.
acc_dic = dict()

# Tag every row with its origin so the combined frame can be split again later.
for frame, origin in ((train, 'Train'), (test, 'Test')):
    frame['WhatIsData'] = origin

# Placeholder target so the test rows carry the same columns as the train rows.
test['revenue'] = 9999999999

# Stack train on top of test with a fresh 0..n-1 index.
alldata = pd.concat([train, test], axis=0, ignore_index=True)

In [None]:
# Parse the opening date once and derive calendar features with the vectorised
# `.dt` accessor (instead of calling a Python lambda for every row).
open_date = pd.to_datetime(alldata["Open Date"])
alldata["Year"] = open_date.dt.year
alldata["Month"] = open_date.dt.month
alldata["Day"] = open_date.dt.day

# Number of days each restaurant had been open on the fixed reference date.
# A scalar Timestamp broadcasts, so no constant helper column is needed.
reference_date = pd.Timestamp("2015-04-27")
alldata["BusinessPeriod"] = (reference_date - open_date).dt.days

# The raw date is fully represented by the derived columns above.
alldata = alldata.drop(columns=["Open Date"])


In [None]:
# Split the column names into categorical (object dtype) and everything else.
column_dtypes = alldata.dtypes
is_object_col = column_dtypes == 'object'
cat_cols = column_dtypes.index[is_object_col].tolist()
num_cols = column_dtypes.index[~is_object_col].tolist()

In [None]:
# Bookkeeping columns that must not be treated as model features:
# the row Id and the train/test origin flag.
other_cols = ['Id', 'WhatIsData']

# Filter instead of `list.remove`, which raises ValueError when this cell is
# executed a second time (the entry is already gone). The result is the same
# on the first run: 'WhatIsData' leaves cat_cols and 'Id' leaves num_cols.
cat_cols = [col for col in cat_cols if col not in other_cols]
num_cols = [col for col in num_cols if col not in other_cols]


In [None]:
# One-hot encode the categorical columns.
cat = pd.get_dummies(alldata.loc[:, cat_cols])

# Reassemble the modelling frame: bookkeeping columns, numeric columns with
# missing values replaced by 0, then the dummy columns.
bookkeeping_part = alldata.loc[:, other_cols]
numeric_part = alldata.loc[:, num_cols].fillna(0)
all_data = pd.concat([bookkeeping_part, numeric_part, cat], axis=1)

# plt.hist(np.log(train['revenue']), bins=50)
# plt.hist(train['revenue'], bins=50)

In [None]:
# Split the combined frame back into its train and test rows via the origin flag.
is_train_row = all_data['WhatIsData'] == 'Train'
is_test_row = all_data['WhatIsData'] == 'Test'

# Train rows keep the target but lose Id; test rows keep Id but lose the
# placeholder target.
train_ = (
    all_data.loc[is_train_row]
    .drop(columns=['WhatIsData', 'Id'])
    .reset_index(drop=True)
)
test_ = (
    all_data.loc[is_test_row]
    .drop(columns=['WhatIsData', 'revenue'])
    .reset_index(drop=True)
)

In [None]:
# Feature matrix and target for training. The target is kept as a one-column
# DataFrame and modelled on the natural-log scale; predictions are mapped back
# with np.exp in the model cells.
target_cols = ['revenue']
x_ = train_.drop(columns=target_cols)
y_ = np.log(train_.loc[:, target_cols])

# Test features: everything except the Id column.
test_feature = test_.drop(columns=['Id'])

In [None]:
# Shuffled 70/30 split of the training rows with a fixed seed; the 30% part
# serves as the validation set for the boosting models.
split_options = dict(train_size=0.7, shuffle=True, random_state=0)
X_train, X_test, y_train, y_test = train_test_split(x_, y_, **split_options)

In [None]:
def Missing_table(df):
    """Summarise the missing values of every column in ``df``.

    Returns a DataFrame indexed by column name with two columns:
    '欠損値' (number of null entries) and '%' (null entries as a percentage
    of the row count), sorted by the null count in descending order.
    """
    null_counts = df.isnull().sum()
    null_percent = 100 * null_counts / len(df)
    # `keys` names the two concatenated Series directly.
    summary = pd.concat([null_counts, null_percent], axis=1, keys=['欠損値', '%'])
    return summary.sort_values(by=['欠損値'], ascending=False)

# Display the per-column missing-value summary for the training frame.
Missing_table(train)

In [None]:
# Helper that reports the dtype of every column of a frame.
def Datatype_table(df):
    """Return a one-column DataFrame ('データ型') holding the dtype of each column of ``df``.

    The result is indexed by the column names of ``df``.
    """
    return df.dtypes.to_frame(name='データ型')
    
# Display the dtype of every column in the combined train+test frame.
Datatype_table(alldata)

In [None]:
# Prediction with LightGBM: wrap the train and validation splits as Datasets.
# The validation set references the training set.
lgb_train = lgb.Dataset(data=X_train, label=y_train)
lgb_eval = lgb.Dataset(data=X_test, label=y_test, reference=lgb_train)

In [None]:
# LightGBM parameters (gradient-boosted trees, L2 regression objective/metric).
params = dict(
    task='train',
    boosting_type='gbdt',
    objective='regression',
    metric={'l2'},
    num_leaves=31,
    learning_rate=0.1,
    feature_fraction=0.9,
    bagging_fraction=0.8,
    bagging_freq=5,
    verbose=0,
    n_jobs=2,
)

In [None]:
# Train LightGBM on the log-revenue target, validating on the held-out split.
# Early stopping is requested through the `lgb.early_stopping` callback: the
# `early_stopping_rounds=` keyword of `lgb.train` was removed in LightGBM 4,
# while the callback works on both old and new releases.
gbm = lgb.train(
    params,
    lgb_train,
    num_boost_round=100,
    valid_sets=[lgb_eval],
    callbacks=[lgb.early_stopping(stopping_rounds=10)],
)

# The model was fitted on log(revenue), so map predictions back with exp.
prediction_lgb = np.exp(gbm.predict(test_feature))

In [None]:
# Prediction with RandomForestRegressor.
# scikit-learn expects a 1-D target; passing the one-column DataFrame triggers
# a DataConversionWarning, so flatten it explicitly.
y_train_1d = np.ravel(y_train)

# Fixed seed so the forest (and the score below) is reproducible on re-run.
forest = RandomForestRegressor(random_state=0).fit(X_train, y_train_1d)

# The model was fitted on log(revenue), so map predictions back with exp.
prediction_rf = np.exp(forest.predict(test_feature))

# Score on the training split, computed once and reused for the report.
acc_forest = forest.score(X_train, y_train_1d)
acc_dic.update(model_forest = round(acc_forest,3))
print(f"training dataに対しての精度: {acc_forest:.2}")

In [None]:
# Prediction with Lasso regression (default hyper-parameters) on the log target.
lasso = Lasso()
lasso.fit(X_train, y_train)

# Map the log-scale predictions back to revenue.
prediction_lasso = np.exp(lasso.predict(test_feature))

# Score on the training split, recorded and reported.
acc_lasso = lasso.score(X_train, y_train)
acc_dic['model_lasso'] = round(acc_lasso, 3)
print(f"training dataに対しての精度: {acc_lasso:.2}")

In [None]:
# Prediction with ElasticNet (default hyper-parameters) on the log target.
En = ElasticNet().fit(X_train, y_train)
prediction_en = np.exp(En.predict(test_feature))

acc_ElasticNet = En.score(X_train, y_train)
acc_dic.update(model_ElasticNet = round(acc_ElasticNet,3))
print(f"training dataに対しての精度: {acc_ElasticNet:.2}")

# Hyper-parameter tuning of ElasticNet with a grid search.
parameters = {
        'alpha'      : [0.001, 0.01, 0.1, 1, 10, 100],
        'l1_ratio'   : [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9],
}
En2 = GridSearchCV(ElasticNet(), parameters)
En2.fit(X_train, y_train)

# Use the TUNED estimator here. The original predicted and printed with the
# untuned `En`, so the grid search never affected the output.
prediction_en2 = np.exp(En2.predict(test_feature))

acc_ElasticNet_Gs = En2.score(X_train, y_train)
acc_dic.update(model_ElasticNet_Gs = round(acc_ElasticNet_Gs,3))
print(f"training dataに対しての精度: {acc_ElasticNet_Gs:.2}")

In [None]:
# Convert the features and the log-revenue target into XGBoost's DMatrix structure.
dtrain = xgb.DMatrix(X_train, label=y_train)
dvalid = xgb.DMatrix(X_test, label=y_test)

# Upper bound on boosting rounds; early stopping normally ends training sooner.
num_round = 5000

# xgb.train monitors the LAST entry of `evals` for early stopping, so the
# validation set must come last. With the training set last (as before), the
# stop criterion watched train RMSE, which keeps improving and never triggers.
# The names 'train' / 'eval' are unchanged, so evals_result keeps the same keys.
evallist = [(dtrain, 'train'), (dvalid, 'eval')]

# Filled by xgb.train with the per-round metric history of every eval set.
evals_result = {}

# Booster parameters.
param = {
            'max_depth': 3,
            'eta': 0.01,
            'objective': 'reg:squarederror',
}

# Run the training.
bst = xgb.train(
                        param, dtrain,
                        num_round,
                        evals=evallist,
                        evals_result=evals_result,
                        # Stop when the validation metric has not improved for this many rounds.
                        early_stopping_rounds=1000,
                        # Log every 100th round instead of all of them.
                        verbose_eval=100)


In [None]:
# Visualise the learning curve: per-round RMSE on the train and eval sets.
train_metric = evals_result['train']['rmse']
eval_metric = evals_result['eval']['rmse']

curve_fig, curve_ax = plt.subplots(figsize=(20, 10))
curve_ax.plot(train_metric, label='train rmse')
curve_ax.plot(eval_metric, label='eval rmse')
curve_ax.grid()
curve_ax.legend()
curve_ax.set_xlabel('rounds')
curve_ax.set_ylabel('rmse')
# Fixed y-range; values above 0.3 are clipped from view.
curve_ax.set_ylim(0, 0.3)
plt.show()

In [None]:
# Visualise the importance of each feature for the trained booster.
ax = xgb.plot_importance(bst)
# Enlarge the figure that owns the axes so all feature labels stay readable.
fig = ax.get_figure()
fig.set_size_inches(10, 30)

In [None]:
# Build the submission from the XGBoost model.
# `test_x` / `test_ID` were never defined in this notebook (NameError on a
# fresh kernel); the prepared test features are `test_feature` and the ids
# live in `test_['Id']`.
dtest = xgb.DMatrix(test_feature)

# xgb.train returns the booster from the last round, not the best one, so
# restrict prediction to the best early-stopped round when it is recorded.
best_round = getattr(bst, "best_iteration", None)
if best_round is None:
    log_prediction = bst.predict(dtest)
else:
    log_prediction = bst.predict(dtest, iteration_range=(0, best_round + 1))

my_submission = pd.DataFrame()
my_submission["Id"] = test_["Id"]
# The target is restaurant revenue, so use the "Prediction" column name (as in
# the commented-out submission cell below) rather than the unrelated "SalePrice".
# The model predicts log(revenue); map back with exp.
my_submission["Prediction"] = np.exp(log_prediction)
# Any filename works; "submission.csv" is used here.
my_submission.to_csv('submission.csv', index=False)

In [None]:
# Put each model's training-set score into a DataFrame.
# acc_dic maps model name -> score, so one Series conversion is enough. The
# original appended the same dict once per key, concatenated it onto an empty
# frame and transposed, which gave N identical columns and raised KeyError on
# an empty acc_dic. Acc stays a frame indexed by model name with column 0.
Acc = pd.Series(acc_dic, dtype=float).to_frame(name=0)
Acc[0]

提出用

In [None]:
# # Idを取得
# Id = np.array(test["Id"]).astype(int)
# # 予測データとIdをデータフレームへ落とし込む
# result = pd.DataFrame(prediction_lgb, Id, columns = ["Prediction"])
# # csvとして書き出し
# result.to_csv("prediction_Restaurant.csv", index_label = ["Id"])

# City_unique = alldata["City"].unique()
# df_city = pd.DataFrame(City_unique)
# # csvとして書き出し
# df_city.to_csv("CityList.csv", index_label = ["City_name"])

In [None]:
# City_unique = alldata["City"].unique()
# df_city = pd.DataFrame(City_unique)
# # csvとして書き出し
# df_city.to_csv("CityList.csv", index_label = ["City_name"])