In [8]:
import pandas as pd
import numpy as np
import json

**DJIA**

In [9]:
# Load the raw DJIA price table from CSV into a DataFrame.
df = pd.read_csv('data/DJIA_data.csv')
# Preview the first rows to check the columns that were read.
df.head()

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume
0,2006-12-28,12510.570313,12529.879883,12478.129883,12501.519531,12501.519531,126740000
1,2006-12-29,12500.480469,12526.030273,12451.129883,12463.150391,12463.150391,161560000
2,2007-01-03,12459.540039,12580.349609,12404.820313,12474.519531,12474.519531,327200000
3,2007-01-04,12473.160156,12510.410156,12403.860352,12480.69043,12480.69043,259060000
4,2007-01-05,12480.049805,12480.129883,12365.410156,12398.009766,12398.009766,235220000


In [10]:
# Keep only the date and the two closing-price columns.
# The explicit .copy() makes this an independent frame, so the assignment
# below does not write into a slice of the raw CSV frame (which would raise
# pandas' SettingWithCopyWarning).
df = df[['Date', 'Close', 'Adj Close']].copy()
# Parse the date strings into datetimes and use them as the index.
df['Date'] = pd.to_datetime(df['Date'])
df = df.set_index('Date')
df.head()

Unnamed: 0_level_0,Close,Adj Close
Date,Unnamed: 1_level_1,Unnamed: 2_level_1
2006-12-28,12501.519531,12501.519531
2006-12-29,12463.150391,12463.150391
2007-01-03,12474.519531,12474.519531
2007-01-04,12480.69043,12480.69043
2007-01-05,12398.009766,12398.009766


In [11]:
# Build an index containing every calendar day of the period and reindex the
# price frame onto it; days that are absent from the data become new rows.
idx = pd.date_range('2006-12-28', '2016-12-31')
# fill_value: the value placed in rows created by the reindex.
# np.nan is used instead of the np.NaN alias, which was removed in NumPy 2.0.
df = df.reindex(idx, fill_value=np.nan)
df.head()

Unnamed: 0,Close,Adj Close
2006-12-28,12501.519531,12501.519531
2006-12-29,12463.150391,12463.150391
2006-12-30,,
2006-12-31,,
2007-01-01,,


### 결측값 보간
시계열 데이터 분석이라든지 이미지 분석 등에서 사용하면 매우 유용하고 편리한 method 입니다.
이전 포스팅의 결측값 대체는 '특정의 동일 값'으로 채우는 방식(filling, imputation)이었던 반면에, 이번 포스팅의 결측값 보간(interpolation)은 실측값과 실측값 사이의 결측값을 마치 '그라데이션(gradation)' 기법으로 색깔을 조금씩 변화시켜가면서 부드럽게 채워나가는 방법입니다.
<img src="https://t1.daumcdn.net/cfile/tistory/27380C3F584C23A617" />
참고 : https://rfriend.tistory.com/264

In [12]:
# Fill the NaN rows introduced by the daily reindex. In the displayed output
# the filled values are evenly spaced between the surrounding observed closes.
df = df.interpolate() # interpolate(): fill missing values by interpolation
df.head()

Unnamed: 0,Close,Adj Close
2006-12-28,12501.519531,12501.519531
2006-12-29,12463.150391,12463.150391
2006-12-30,12465.424219,12465.424219
2006-12-31,12467.698047,12467.698047
2007-01-01,12469.971875,12469.971875


In [13]:
# Inspect the end of the daily series after interpolation.
# NOTE(review): in the displayed output the last rows repeat the 2016-12-29
# value -- confirm that the source data really ends on that date.
df.tail()

Unnamed: 0,Close,Adj Close
2016-12-27,19945.039063,19945.039063
2016-12-28,19833.679688,19833.679688
2016-12-29,19819.779297,19819.779297
2016-12-30,19819.779297,19819.779297
2016-12-31,19819.779297,19819.779297


**NYT**

In [14]:
# Years and months of the monthly NYT archive files to load.
# NOTE(review): range(2007, 2016) stops at 2015 (upper bound is exclusive),
# while the DJIA frame above runs through 2016-12-31 -- confirm whether the
# 2016 files should be loaded as well.
years = range(2007, 2016)
months = range(1,13)
# Flattened article fields to keep.
dict_keys = ['type_of_material','headline.main','pub_date','section_name','news_desk']
# An article is kept when its lower-cased section name matches any of these.
s_list = ['business', 'national', 'world', 'u.s.' , 'politics', 'opinion', 'tech', 'science',  'health']

# Regex alternation of the section keywords, built once outside the loops.
# Note that '.' in 'u.s.' acts as a regex wildcard here (unchanged behaviour).
section_pattern = '|'.join(s_list)

# Collect one filtered frame per month and concatenate a single time at the
# end; concatenating inside the loops copies the accumulated data on every
# iteration and makes the cell take quadratic time.
monthly_frames = []
for year in years:
    for month in months:
        file_str = 'data/nytimes'+str(year) + '-' + '{:02}'.format(month) + '.json'
        # JSON text is UTF-8; do not depend on the platform default encoding.
        with open(file_str, encoding='utf-8') as f:
            NYT = json.load(f)
        NYT = NYT['response']['docs']
        if not NYT:
            # No articles this month: nothing to normalise or filter.
            continue
        # Flatten all articles of the month in one call (nested keys become
        # dotted column names such as 'headline.main'), then keep only the
        # fields of interest. pd.json_normalize replaces the deprecated
        # pd.io.json.json_normalize.
        main_df = pd.json_normalize(NYT).filter(items=dict_keys)
        # Reduce the publication timestamp to a plain calendar date.
        main_df['pub_date'] = pd.to_datetime(main_df['pub_date']).dt.date
        # Missing section names become 'unk' so the string match below works.
        main_df['section_name'] = main_df['section_name'].fillna('UNK').str.lower()
        main_df = main_df[main_df['section_name'].str.contains(section_pattern)]
        main_df = main_df.set_index('pub_date')
        monthly_frames.append(main_df)

# All kept articles, indexed by publication date.
NYT_final = pd.concat(monthly_frames) if monthly_frames else pd.DataFrame()

  from ipykernel import kernelapp as app


KeyboardInterrupt: 

In [None]:
# Persist the combined NYT article frame so later cells can reload it
# without re-parsing the monthly JSON files.
NYT_final.to_pickle('data/pickled_NYT.pkl')

In [None]:
# Build one row per publication date whose 'headline.main' value is every
# headline of that date (cast to str) joined with single spaces.
headline = (
    NYT_final['headline.main']
    .astype('str')
    .groupby(level=0)
    .agg(lambda titles: ' '.join(titles))
    .reset_index()
)
headline.head()

In [None]:
# Inspect the last rows of the per-date headline table.
headline.tail()

In [None]:
# Reload the article frame from the pickle written by the to_pickle cell above.
# Pickle files can execute code when loaded; only read files you produced.
NYT_df = pd.read_pickle('data/pickled_NYT.pkl')
NYT_df.head()

In [None]:
# Inspect the last rows of the reloaded article frame.
NYT_df.tail()

In [None]:
# (rows, columns) of the reloaded article frame.
NYT_df.shape

In [None]:
# Rebuild the per-date headline table from the reloaded frame: one row per
# publication date, with that date's headlines (cast to str) joined by spaces.
headline = (
    NYT_df['headline.main']
    .astype('str')
    .groupby(level=0)
    .agg(lambda titles: ' '.join(titles))
    .reset_index()
)
headline.head()

In [None]:
# Re-display the daily DJIA frame before joining it with the headlines.
df.head()

### data/raw_sample.pkl 보완

In [None]:
# Join the daily DJIA prices with the per-date headlines and keep only the
# days that have both.
# df is indexed by a DatetimeIndex, whereas 'pub_date' holds plain
# datetime.date objects (produced with .dt.date when the articles were
# loaded). Converting 'pub_date' to datetimes first makes both indexes the
# same type, so the join matches on date instead of relying on implicit
# coercion between the two key types.
df2 = df.join(
    headline
    .assign(pub_date=pd.to_datetime(headline['pub_date']))
    .set_index('pub_date')
).dropna()
# Persist the joined sample for later use.
df2.to_pickle('data/raw_sample.pkl')

In [None]:
# Read the joined sample back from disk to verify what was written.
df_sample = pd.read_pickle('data/raw_sample.pkl')
df_sample.head()