# Python機械学習クックブック

In [2]:
import numpy as np
import pandas as pd

## 文字列の日時データへの変換

In [9]:
# Day-first timestamps written with a 12-hour clock and an AM/PM marker
date_strings = np.array(
    ["03-04-2005 11:35 PM", "23-05-2012 1:03 AM", "04-08-2009 09:09 PM"]
)

# Parse every string into a pandas Timestamp using the explicit layout
list(map(lambda text: pd.to_datetime(text, format="%d-%m-%Y %I:%M %p"), date_strings))


[Timestamp('2005-04-03 23:35:00'),
 Timestamp('2012-05-23 01:03:00'),
 Timestamp('2009-08-04 21:09:00')]

In [10]:
# Same inputs plus one malformed entry (no space before "PM")
date_strings = np.array(
    [
        "03-04-2005 11:35 PM",
        "23-05-2012 1:03 AM",
        "04-08-2009 09:09 PM",
        "04-08-2009 09:09PM",
    ]
)

# errors="coerce" yields NaT for a value that does not match the format
# instead of raising
parse_options = {"format": "%d-%m-%Y %I:%M %p", "errors": "coerce"}
[pd.to_datetime(raw, **parse_options) for raw in date_strings]

[Timestamp('2005-04-03 23:35:00'),
 Timestamp('2012-05-23 01:03:00'),
 Timestamp('2009-08-04 21:09:00'),
 NaT]

In [11]:
# %Y : 4桁の年。
# %m : 月。0パディング
# %d : 日付。0パディング
# %H : 時(24時制)。ゼロパディング
# %I : 時(12時制)。ゼロパディング
# %p : AMもしくはPM
# %M : 分。ゼロパディング
# %S : 秒。ゼロパディング

## タイムゾーンの取り扱い

In [8]:
# Naive timestamp (no timezone attached)
date = pd.Timestamp("2015-03-31 01:16:49")

# tz_localize attaches a zone without shifting the wall-clock time
date_in_london = date.tz_localize(tz="Europe/London")

# tz_convert expresses the same instant in another zone
date_in_tokyo = date_in_london.tz_convert(tz="Asia/Tokyo")

# Show the naive, localized and converted values in that order
for stamp in (date, date_in_london, date_in_tokyo):
    print(stamp)

2015-03-31 01:16:49
2015-03-31 01:16:49+01:00
2015-03-31 09:16:49+09:00


## 日時によるデータの抽出

In [22]:
# Build a frame holding one hourly timestamp per row.
# Lowercase "h" is the supported hourly alias; uppercase "H" is deprecated
# in recent pandas releases.
df = pd.DataFrame({"date": pd.date_range("1/1/2001", periods=100000, freq="h")})
print(df)

# Boolean mask: strictly after 01:00 and up to and including 04:00
in_window = (df["date"] > "2002-1-1 01:00:00") & (df["date"] <= "2002-1-1 04:00:00")
print(df[in_window])

# Index the frame by its timestamps (the "date" column is kept) and slice
# with .loc. Label slices include both endpoints, so unlike the mask above
# the 01:00 row is part of this result.
df = df.set_index(df["date"])
print(df.loc["2002-1-1 01:00:00":"2002-1-1 04:00:00"])

                     date
0     2001-01-01 00:00:00
1     2001-01-01 01:00:00
2     2001-01-01 02:00:00
3     2001-01-01 03:00:00
4     2001-01-01 04:00:00
...                   ...
99995 2012-05-29 11:00:00
99996 2012-05-29 12:00:00
99997 2012-05-29 13:00:00
99998 2012-05-29 14:00:00
99999 2012-05-29 15:00:00

[100000 rows x 1 columns]
                    date
8762 2002-01-01 02:00:00
8763 2002-01-01 03:00:00
8764 2002-01-01 04:00:00
                                   date
date                                   
2002-01-01 01:00:00 2002-01-01 01:00:00
2002-01-01 02:00:00 2002-01-01 02:00:00
2002-01-01 03:00:00 2002-01-01 03:00:00
2002-01-01 04:00:00 2002-01-01 04:00:00


## 日付を複数の特徴量に分解

In [39]:

# 150 consecutive daily timestamps
df = pd.DataFrame()
df["date"] = pd.date_range("1/1/2001", periods=150, freq="D")

# Expose each calendar/clock component of the timestamp as its own column
for component in ("year", "month", "day", "hour", "minute", "second", "weekday"):
    df[component] = getattr(df["date"].dt, component)

df

Unnamed: 0,date,year,month,day,hour,minute,second,weekday
0,2001-01-01,2001,1,1,0,0,0,0
1,2001-01-02,2001,1,2,0,0,0,1
2,2001-01-03,2001,1,3,0,0,0,2
3,2001-01-04,2001,1,4,0,0,0,3
4,2001-01-05,2001,1,5,0,0,0,4
...,...,...,...,...,...,...,...,...
145,2001-05-26,2001,5,26,0,0,0,5
146,2001-05-27,2001,5,27,0,0,0,6
147,2001-05-28,2001,5,28,0,0,0,0
148,2001-05-29,2001,5,29,0,0,0,1
149,2001-05-30,2001,5,30,0,0,0,2


## 日付の差の算出

In [47]:
# Create the data frame
df = pd.DataFrame()

# Hourly timestamps; lowercase "h" is the supported alias, uppercase "H" is
# deprecated in recent pandas releases
df["date"] = pd.date_range("1/1/2001", periods=100, freq="h")

# Subtracting one timestamp from another yields the elapsed time (a Timedelta)
print(df["date"][1] - df["date"][0])
print(df["date"][24] - df["date"][0])


0 days 01:00:00
1 days 00:00:00


## 時間遅れ特徴量の生成

In [50]:
# Create the data frame
df = pd.DataFrame()

# Ten daily timestamps with one random integer price per day.
# The prices are drawn as a 1-D array so the shape matches a single column.
df["date"] = pd.date_range("1/1/2001", periods=10, freq="D")
df["price"] = np.random.randint(10, 100, size=10)

# Lag feature: shift(1) moves every price down one row, so each row carries
# the previous day's price and the first row has no value (NaN)
df["previous_days_price"] = df["price"].shift(1)

df

Unnamed: 0,date,price,previous_days_price
0,2001-01-01,61,
1,2001-01-02,19,61.0
2,2001-01-03,43,19.0
3,2001-01-04,86,43.0
4,2001-01-05,16,86.0
5,2001-01-06,54,16.0
6,2001-01-07,33,54.0
7,2001-01-08,30,33.0
8,2001-01-09,12,30.0
9,2001-01-10,58,12.0


## 時系列データの欠損値の補完

In [68]:
df = pd.DataFrame()
df["date"] = pd.date_range("1/1/2001", periods=5, freq="D")
# Store prices as floats from the start: NaN is a float, and writing it into
# an integer column is deprecated in recent pandas releases.
df["price"] = np.random.randint(1, 100, size=5).astype("float64")

# Blank out the third price to create a missing value
df.loc[2, "price"] = np.nan
print(df)

# Forward fill: the gap takes the last preceding valid value
df_ffill = df.ffill()
print(df_ffill)

# Backward fill: the gap takes the next following valid value
df_bfill = df.bfill()
print(df_bfill)

df

        date  price
0 2001-01-01   72.0
1 2001-01-02   51.0
2 2001-01-03    NaN
3 2001-01-04   72.0
4 2001-01-05   88.0
        date  price
0 2001-01-01   72.0
1 2001-01-02   51.0
2 2001-01-03   51.0
3 2001-01-04   72.0
4 2001-01-05   88.0
        date  price
0 2001-01-01   72.0
1 2001-01-02   51.0
2 2001-01-03   72.0
3 2001-01-04   72.0
4 2001-01-05   88.0


Unnamed: 0,date,price
0,2001-01-01,72.0
1,2001-01-02,51.0
2,2001-01-03,
3,2001-01-04,72.0
4,2001-01-05,88.0
