In [1]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.metrics import roc_auc_score
from sklearn.metrics import mean_squared_error
import statsmodels.api as sm


In [2]:
# Load the pre-merged train / test tables.
train = pd.read_csv('read_merge_csv/train_new.csv')
test = pd.read_csv('read_merge_csv/test_new.csv')

# Apply the same column parsing to both tables.
for frame in (train, test):
    # Second-to-last character of `gameday` (presumably the day-of-week
    # character); the five characters listed are collapsed into '平日'.
    day_marker = frame['gameday'].str[-2]
    frame['week'] = day_marker.replace(('月','火','水','木','金'), '平日')

    # Integer found between '第' and '節' in `match`.
    after_prefix = frame['match'].str.split('第', expand = True)[1]
    frame['match'] = after_prefix.str.split('節',expand=True)[0].astype('int')

    # Integer before the first '/' in `gameday` (presumably the month).
    # `week` is derived above, before `gameday` is overwritten here.
    gameday_parts = frame['gameday'].str.split('/', expand = True)
    frame['gameday'] = gameday_parts[0].astype('int')

    # Integer before the first ':' in `time` (presumably the kickoff hour).
    time_parts = frame['time'].str.split(':', expand = True)
    frame['time'] = time_parts[0].astype('int')

In [3]:
# Build the model inputs.
#
# Integer-encode the categorical columns with ONE code table shared by train
# and test. pd.factorize numbers labels by order of first appearance, so
# factorizing each table on its own can give the same label different codes
# in train and test, and the fitted model would then be applied to
# mislabelled test features.
categorical_cols = ['stage', 'match', 'home', 'stadium', 'referee', 'away', 'week', 'address']
n_train = len(train)
for col in categorical_cols:
    # Train rows come first in the concatenation, so train receives exactly
    # the codes that pd.factorize(train[col]) alone would produce; labels that
    # occur only in test get the next unused codes. Missing values map to -1.
    combined = pd.concat([train[col], test[col]], ignore_index=True)
    codes = pd.factorize(combined)[0]
    train[col] = codes[:n_train]
    test[col] = codes[n_train:]

# Feature selection: the same predictor columns, in the same order, for both
# tables; only the training table carries the target `y`.
lm_train = train[['y','capa', 'stage', 'stadium', 'match', 'week', 'home', 'away', 'referee', 'address']]
lm_test = test[['capa','stage', 'stadium', 'match', 'week', 'home', 'away', 'referee', 'address']]

In [4]:
# Cross-validation: fit one linear model per fold, report the validation RMSE,
# and average the per-fold predictions on the test table.
from sklearn.model_selection import KFold

n_split = 5
# NOTE(review): KFold is constructed without shuffle, so the folds follow the
# row order of lm_train. The recorded per-fold RMSEs differ by more than 2x,
# which may mean the rows are ordered (e.g. by season or stage) - confirm
# whether KFold(n_split, shuffle=True, random_state=...) is more appropriate.
cv = KFold(n_split)

# Split predictors / target once; this does not depend on the fold.
lm_features = lm_train.drop('y', axis=1)
lm_target = lm_train['y']

sub_pred = np.zeros(len(lm_test))
for train_idx, valid_idx in cv.split(X=lm_train):
    train_x, valid_x = lm_features.iloc[train_idx], lm_features.iloc[valid_idx]
    train_y, valid_y = lm_target.iloc[train_idx], lm_target.iloc[valid_idx]

    model = LinearRegression().fit(train_x, train_y)

    # RMSE on the held-out fold.
    pred_valid = model.predict(valid_x)
    print(np.sqrt(mean_squared_error(valid_y, pred_valid)))

    # Accumulate this fold's test predictions; averaged after the loop.
    sub_pred += model.predict(lm_test)
sub_pred /= n_split

6846.423590712795
7288.949003518723
3605.533817859825
3056.6003460829097
3125.8265670747096


In [7]:
# Assemble the submission table: the test ids paired with the averaged predictions.
out = test[['id']].assign(y=sub_pred)

# Order the rows by id.
out_sorted = out.sort_values(by='id')

# Write the file with neither a header row nor the index column.
out_sorted.to_csv(
    path_or_buf='read_merge_csv/submit_0628_1_lm.csv',
    sep=',',
    header=False,
    index=False,
)

In [8]:
# Check the format of the written submission against the sample file.
#
# The submission is written with header=False, so it is read back with
# header=None; letting pandas infer a header would consume the first data row
# as column names. The sample file is read the same way so that both are
# compared row for row (the first-id comparison this replaces could likewise
# only pass when the sample file has no header row).
sample = pd.read_csv(filepath_or_buffer="read_merge_csv/sample_submit.csv", header=None)
submit = pd.read_csv(filepath_or_buffer="read_merge_csv/submit_0628_1_lm.csv", header=None)

# Same number of rows and columns.
assert len(sample) == len(submit), "row count differs from sample_submit.csv"
assert sample.shape[1] == submit.shape[1], "column count differs from sample_submit.csv"

# Same ids in the same order (first column). Comparing the default row index
# of the two frames would be true for any two files of equal length.
assert (sample[0].values == submit[0].values).all(), "id column differs from sample_submit.csv"

# No header row in the submission: a header line would put the text 'id' in
# the first cell and turn the prediction column into a non-numeric dtype.
assert str(submit.iloc[0, 0]) != 'id', "submission file contains a header row"
assert pd.api.types.is_numeric_dtype(submit[1]), "prediction column is not numeric"