In [118]:
# 必要なライブラリのimport
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd

from sklearn.model_selection import train_test_split

# DataFrameですべての列を表示する設定（変更しないでください）
pd.options.display.max_columns = None

# データセットを読み込む
train_dataset = pd.read_csv("data/train.csv")
test_dataset = pd.read_csv("data/test.csv")

In [119]:
print(train_dataset.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 207 entries, 0 to 206
Data columns (total 12 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   datetime       207 non-null    object 
 1   y              207 non-null    int64  
 2   week           207 non-null    object 
 3   soldout        207 non-null    int64  
 4   name           207 non-null    object 
 5   kcal           166 non-null    float64
 6   remarks        21 non-null     object 
 7   event          14 non-null     object 
 8   payday         10 non-null     float64
 9   weather        207 non-null    object 
 10  precipitation  207 non-null    object 
 11  temperature    207 non-null    float64
dtypes: float64(3), int64(2), object(7)
memory usage: 19.5+ KB
None


In [120]:
# 前処理
def preprocess(dataset):

  #kcalを平均で埋める
  dataset['kcal'] = dataset['kcal'].fillna(dataset['kcal'].mean())

  # remarksに特定の文字列が含まれているか
  dataset['remarks_お楽しみ'] = dataset['remarks'].str.contains('お楽しみメニュー').fillna(False)

  # remarksに値があればTrue, なければFalseにする
  dataset['remarks'] = dataset['remarks'].notnull() 

  # eventsに値があればTrue, なければFalseにする
  dataset['event'] = dataset['event'].notnull()

  # weekをone-hotエンコーディング
  dataset = pd.get_dummies(dataset, columns=["week"], dummy_na=True)

  # weatherをone-hotエンコーディング
  dataset = pd.get_dummies(dataset, columns=["weather"], dummy_na=True)

  # paydayに値があればTrue, なければFalseにする
  dataset['payday'] = dataset['payday'].notnull()

  # precipitationを削除
  dataset = dataset.drop(columns=['precipitation'])

  # nameに特定の文字列が含まれているか
  dataset['good_menu'] = dataset['name'].str.contains('ハンバーグ|カレー|チキン|牛|ポーク|鶏|豚').fillna(False)

  # nameを削除
  dataset = dataset.drop(columns=['name'])

  # datetimeを0からの整数に変換
  dataset['datetime'] = dataset.index

  return dataset


In [121]:
# trainとtestを結合して前処理

dataset = pd.concat([train_dataset, test_dataset], ignore_index=True)

dataset2 = preprocess(dataset)

train_dataset2 = dataset2[82:207].copy()
test_dataset2 = dataset2[207:].copy()

print(train_dataset2.info())
print(train_dataset2.isnull().sum())


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 125 entries, 82 to 206
Data columns (total 24 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   datetime      125 non-null    int64  
 1   y             125 non-null    float64
 2   soldout       125 non-null    int64  
 3   kcal          125 non-null    float64
 4   remarks       125 non-null    bool   
 5   event         125 non-null    bool   
 6   payday        125 non-null    bool   
 7   temperature   125 non-null    float64
 8   remarks_お楽しみ  125 non-null    bool   
 9   week_月        125 non-null    bool   
 10  week_木        125 non-null    bool   
 11  week_水        125 non-null    bool   
 12  week_火        125 non-null    bool   
 13  week_金        125 non-null    bool   
 14  week_nan      125 non-null    bool   
 15  weather_快晴    125 non-null    bool   
 16  weather_晴れ    125 non-null    bool   
 17  weather_曇     125 non-null    bool   
 18  weather_薄曇    125 non-null   

  dataset['remarks_お楽しみ'] = dataset['remarks'].str.contains('お楽しみメニュー').fillna(False)


In [122]:
train_dataset2.head(20)

Unnamed: 0,datetime,y,soldout,kcal,remarks,event,payday,temperature,remarks_お楽しみ,week_月,week_木,week_水,week_火,week_金,week_nan,weather_快晴,weather_晴れ,weather_曇,weather_薄曇,weather_雨,weather_雪,weather_雷電,weather_nan,good_menu
82,82,55.0,1,315.0,False,False,False,12.4,False,False,True,False,False,False,False,False,False,True,False,False,False,False,False,False
83,83,106.0,0,407.381188,True,False,False,18.5,True,False,False,False,False,True,False,True,False,False,False,False,False,False,False,True
84,84,84.0,0,349.0,False,False,False,17.5,False,True,False,False,False,False,False,False,True,False,False,False,False,False,False,False
85,85,125.0,1,377.0,False,False,False,16.1,False,False,False,False,True,False,False,False,False,False,True,False,False,False,False,False
86,86,99.0,0,380.0,False,False,False,18.2,False,False,False,True,False,False,False,False,False,False,True,False,False,False,False,False
87,87,100.0,1,390.0,False,False,False,14.3,False,False,True,False,False,False,False,False,False,False,False,True,False,False,False,True
88,88,90.0,1,350.0,False,False,False,17.8,False,False,False,False,False,True,False,False,True,False,False,False,False,False,False,True
89,89,107.0,0,387.0,False,False,False,13.8,False,True,False,False,False,False,False,True,False,False,False,False,False,False,False,True
90,90,96.0,1,353.0,False,False,False,20.1,False,False,False,False,True,False,False,True,False,False,False,False,False,False,False,False
91,91,88.0,1,376.0,False,False,False,19.0,False,False,False,True,False,False,False,False,True,False,False,False,False,False,False,False


In [123]:
# 目的変数と説明変数に分ける
X = train_dataset2.drop(columns=['y'])
y = train_dataset2['y']

# 学習データと検証データに分ける
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.3, random_state=0)

In [None]:
# ランダムフォレストで学習
from sklearn.ensemble import RandomForestRegressor
model = RandomForestRegressor(n_estimators=100, random_state=0)
model.fit(X_train, y_train)

# 検証
from sklearn.metrics import mean_squared_error
y_val_pred = model.predict(X_val)
mean_squared_error(y_val, y_val_pred)



[ 69.76  63.01 120.19  55.94 119.96  53.96  69.54  56.47 100.37  58.15
  66.66  60.53  58.22  61.41  53.86  88.48  89.76  59.09  71.4   59.
  60.78  54.58  58.58  60.9   83.55  86.21  55.16  64.01 116.48  56.96
  74.3   56.4   57.87  66.53  52.46  49.51  97.06  57.64]


In [125]:
# ロジスティック回帰で学習
from sklearn.linear_model import LogisticRegression
model = LogisticRegression(max_iter=100)
model.fit(X_train, y_train)
# 検証
from sklearn.metrics import mean_squared_error
y_val_pred = model.predict(X_val)
mean_squared_error(y_val, y_val_pred)


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


529.6315789473684

In [126]:
# SVMで学習
from sklearn.svm import SVR
model = SVR()
model.fit(X_train, y_train)
# 検証
from sklearn.metrics import mean_squared_error
y_val_pred = model.predict(X_val)
mean_squared_error(y_val, y_val_pred)

567.1437236537007

In [None]:
# テストデータの予測
y_test_pred = model.predict(test_dataset2.drop(columns=['y']))

# intに変換
y_test_pred = y_test_pred.astype(int)



[ 56  53  45  59  57  52  54 118  54  97  80  47  61  58  96  91  92  60
  67  56  55  69  72  96  57 112  58  95  94  54  97  94  96 101  84  98
  90  93  96  96]


In [132]:
# 提出用データの作成
submission = pd.DataFrame({
    "datetime": test_dataset["datetime"],
    "y": y_test_pred
})
submission.to_csv("submission.csv", index=False, header=False)