# 문자열을 날짜로 변환하기

In [1]:
import numpy as np
import pandas as pd

# 문자열을 만듭니다.
date_strings = np.array(['03-04-2005 11:35 PM',
                         '23-05-2010 12:01 AM',
                         '04-09-2009 09:09 PM'])

In [3]:
[pd.to_datetime(date, format='%d-%m-%Y %H:%M %p') for date in date_strings]

[Timestamp('2005-04-03 11:35:00'),
 Timestamp('2010-05-23 12:01:00'),
 Timestamp('2009-09-04 09:09:00')]

In [5]:
[pd.to_datetime(date, format='%d-%m-%Y %I:%M %p', errors='ignore') for date in date_strings]

[Timestamp('2005-04-03 23:35:00'),
 Timestamp('2010-05-23 00:01:00'),
 Timestamp('2009-09-04 21:09:00')]

In [6]:
pd.to_datetime(date_strings)

DatetimeIndex(['2005-03-04 23:35:00', '2010-05-23 00:01:00',
               '2009-04-09 21:09:00'],
              dtype='datetime64[ns]', freq=None)

# 시간대 다루기

In [7]:
import pandas as pd

In [10]:
pd.Timestamp('2022-05-01 06:00:00', tz='Asia/Seoul')

Timestamp('2022-05-01 06:00:00+0900', tz='Asia/Seoul')

In [11]:
date = pd.Timestamp('2022-05-01 06:00:00')

In [13]:
date_in_seoul = date.tz_localize('Asia/Seoul')
date_in_seoul

Timestamp('2022-05-01 06:00:00+0900', tz='Asia/Seoul')

In [14]:
date = date.tz_localize(None)
date

Timestamp('2022-05-01 06:00:00')

In [16]:
date_in_seoul.tz_convert('Africa/Abidjan')

Timestamp('2022-04-30 21:00:00+0000', tz='Africa/Abidjan')

In [17]:
dates = pd.Series(pd.date_range('2023/5/5',  periods=3, freq='M'))

In [18]:
dates

0   2023-05-31
1   2023-06-30
2   2023-07-31
dtype: datetime64[ns]

In [19]:
dates.dt.tz_localize('Africa/Abidjan')

0   2023-05-31 00:00:00+00:00
1   2023-06-30 00:00:00+00:00
2   2023-07-31 00:00:00+00:00
dtype: datetime64[ns, Africa/Abidjan]

In [20]:
from pytz import all_timezones

In [26]:
all_timezones

['Africa/Abidjan',
 'Africa/Accra',
 'Africa/Addis_Ababa',
 'Africa/Algiers',
 'Africa/Asmara',
 'Africa/Asmera',
 'Africa/Bamako',
 'Africa/Bangui',
 'Africa/Banjul',
 'Africa/Bissau',
 'Africa/Blantyre',
 'Africa/Brazzaville',
 'Africa/Bujumbura',
 'Africa/Cairo',
 'Africa/Casablanca',
 'Africa/Ceuta',
 'Africa/Conakry',
 'Africa/Dakar',
 'Africa/Dar_es_Salaam',
 'Africa/Djibouti',
 'Africa/Douala',
 'Africa/El_Aaiun',
 'Africa/Freetown',
 'Africa/Gaborone',
 'Africa/Harare',
 'Africa/Johannesburg',
 'Africa/Juba',
 'Africa/Kampala',
 'Africa/Khartoum',
 'Africa/Kigali',
 'Africa/Kinshasa',
 'Africa/Lagos',
 'Africa/Libreville',
 'Africa/Lome',
 'Africa/Luanda',
 'Africa/Lubumbashi',
 'Africa/Lusaka',
 'Africa/Malabo',
 'Africa/Maputo',
 'Africa/Maseru',
 'Africa/Mbabane',
 'Africa/Mogadishu',
 'Africa/Monrovia',
 'Africa/Nairobi',
 'Africa/Ndjamena',
 'Africa/Niamey',
 'Africa/Nouakchott',
 'Africa/Ouagadougou',
 'Africa/Porto-Novo',
 'Africa/Sao_Tome',
 'Africa/Timbuktu',
 'Africa/

In [27]:
import pytz

tz = pytz.timezone('Asia/Seoul')

In [28]:
dates.dt.tz_localize(tz)

0   2023-05-31 00:00:00+09:00
1   2023-06-30 00:00:00+09:00
2   2023-07-31 00:00:00+09:00
dtype: datetime64[ns, Asia/Seoul]

# 날짜와 시간 선택하기

In [29]:
import pandas as pd

In [30]:
df = pd.DataFrame()

In [31]:
df['date'] = pd.date_range('2022/1/1', periods=100000, freq='H')

In [33]:
df[(df['date']>'2023-1-1 01:00:00') & (df['date']<='2023-1-1 04:00:00')]

Unnamed: 0,date
8762,2023-01-01 02:00:00
8763,2023-01-01 03:00:00
8764,2023-01-01 04:00:00


In [34]:
df = df.set_index(df['date'])

In [36]:
df.loc['2022-03-03 01:00:00':'2022-03-04 03:00:00']

Unnamed: 0_level_0,date
date,Unnamed: 1_level_1
2022-03-03 01:00:00,2022-03-03 01:00:00
2022-03-03 02:00:00,2022-03-03 02:00:00
2022-03-03 03:00:00,2022-03-03 03:00:00
2022-03-03 04:00:00,2022-03-03 04:00:00
2022-03-03 05:00:00,2022-03-03 05:00:00
2022-03-03 06:00:00,2022-03-03 06:00:00
2022-03-03 07:00:00,2022-03-03 07:00:00
2022-03-03 08:00:00,2022-03-03 08:00:00
2022-03-03 09:00:00,2022-03-03 09:00:00
2022-03-03 10:00:00,2022-03-03 10:00:00


# 날짜 데이터를 여러 특성으로 나누기

In [37]:
import pandas as pd

In [38]:
df = pd.DataFrame()

In [39]:
df['date'] = pd.date_range('2021-01-01', periods=150, freq='W')

In [40]:
df

Unnamed: 0,date
0,2021-01-03
1,2021-01-10
2,2021-01-17
3,2021-01-24
4,2021-01-31
...,...
145,2023-10-15
146,2023-10-22
147,2023-10-29
148,2023-11-05


In [41]:
df['year'] = df['date'].dt.year
df['month'] = df['date'].dt.month
df['day'] = df['date'].dt.day
df['hour'] = df['date'].dt.hour
df['minute'] = df['date'].dt.minute

In [44]:
df.head(3)

Unnamed: 0,date,year,month,day,hour,minute
0,2021-01-03,2021,1,3,0,0
1,2021-01-10,2021,1,10,0,0
2,2021-01-17,2021,1,17,0,0


# 날짜 간의 차이 계산

In [45]:
import pandas as pd

In [46]:
df = pd.DataFrame()

In [47]:
df['Arrived'] = [pd.Timestamp('2023-01-01'), pd.Timestamp('2023-01-04')]

In [48]:
df['Left'] = [pd.Timestamp('2023-01-01'), pd.Timestamp('2023-01-06')]

In [49]:
df['Left'] - df['Arrived']

0   0 days
1   2 days
dtype: timedelta64[ns]

In [50]:
pd.Series(delta.days for delta in (df['Left'] - df['Arrived']))

0    0
1    2
dtype: int64

In [51]:
[delta.days for delta in df['Left'] - df['Arrived']]

[0, 2]

# 요일 인코딩

In [52]:
import pandas as pd

In [53]:
df = pd.Series(pd.date_range('2022-01-01', periods=3, freq='M'))
df

0   2022-01-31
1   2022-02-28
2   2022-03-31
dtype: datetime64[ns]

In [55]:
df.dt.weekday

0    0
1    0
2    3
dtype: int64

# 시차 특성 만들기

In [57]:
import pandas as pd

In [58]:
df = pd.DataFrame()

In [61]:
df['dates'] = pd.date_range('2022-01-01', periods=5, freq='D')
df['stock_price'] = [1.1, 2.2, 3.3, 4.4, 5.5]

In [62]:
df['previous_days_stock_price'] = df['stock_price'].shift(1)
df

Unnamed: 0,dates,stock_price,previous_days_stock_price
0,2022-01-01,1.1,
1,2022-01-02,2.2,1.1
2,2022-01-03,3.3,2.2
3,2022-01-04,4.4,3.3
4,2022-01-05,5.5,4.4


# 이동시간 윈도우 사용

In [63]:
import pandas as pd

In [64]:
time_index = pd.date_range('2021-01-01', periods=5, freq='M')

In [65]:
df = pd.DataFrame(index=time_index)

In [66]:
df['stock_price'] = [1, 2, 3, 4, 5]

In [67]:
df.rolling(window=2).mean()

Unnamed: 0,stock_price
2021-01-31,
2021-02-28,1.5
2021-03-31,2.5
2021-04-30,3.5
2021-05-31,4.5


In [69]:
df.ewm(alpha=0.5).mean()

Unnamed: 0,stock_price
2021-01-31,1.0
2021-02-28,1.666667
2021-03-31,2.428571
2021-04-30,3.266667
2021-05-31,4.16129


# 시계열 데이터에서 결측치

In [70]:
import pandas as pd
import numpy as np

In [72]:
time_index = pd.date_range('2022-01-01', periods=5, freq='M')

In [73]:
df = pd.DataFrame(index=time_index)

In [75]:
df['sales'] = [1.0, 2.0, np.nan, np.nan, 5.0]

In [77]:
df

Unnamed: 0,sales
2022-01-31,1.0
2022-02-28,2.0
2022-03-31,
2022-04-30,
2022-05-31,5.0


In [78]:
df.interpolate()

Unnamed: 0,sales
2022-01-31,1.0
2022-02-28,2.0
2022-03-31,3.0
2022-04-30,4.0
2022-05-31,5.0


In [79]:
df.ffill()

Unnamed: 0,sales
2022-01-31,1.0
2022-02-28,2.0
2022-03-31,2.0
2022-04-30,2.0
2022-05-31,5.0


In [80]:
df.bfill()

Unnamed: 0,sales
2022-01-31,1.0
2022-02-28,2.0
2022-03-31,5.0
2022-04-30,5.0
2022-05-31,5.0


In [81]:
df.interpolate(method='quadratic')



Unnamed: 0,sales
2022-01-31,1.0
2022-02-28,2.0
2022-03-31,3.059808
2022-04-30,4.038069
2022-05-31,5.0


In [82]:
df.interpolate(limit=1, limit_direction='forward')

Unnamed: 0,sales
2022-01-31,1.0
2022-02-28,2.0
2022-03-31,3.0
2022-04-30,
2022-05-31,5.0
