# このノートブックは？
コンペで与えられた（inputディレクトリ内の）データに対して基本的な加工を行い、結果をファイルに出力する（特徴量は作成しない）。

## Google Driveのマウント

In [1]:
# Mount Google Drive inside the Colab runtime and move into the project folder.
# Presumably this is what lets the later `src.common_functions` import and the
# Config directory names resolve relative to the project root -- confirm.
from google.colab import drive
drive.mount('/content/drive')
%cd /content/drive/'My Drive'/'PROBSPACE'/'StockPricePrediction'

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
/content/drive/My Drive/PROBSPACE/StockPricePrediction


## データの読み込み

In [2]:
import pandas as pd
import numpy as np
from src.common_functions import Config

In [3]:
# Price table: the first CSV column becomes the index (parsed as dates), and
# rows in which every value is missing are discarded.
price_df = pd.read_csv(
    f'{Config.input_dir_name}/train_data.csv',
    index_col=0,
    parse_dates=True,
)
price_df = price_df.dropna(how='all')

# Company list, read as-is.
company_df = pd.read_csv(f'{Config.input_dir_name}/company_list.csv')

## 各データの加工

In [4]:
def organize_company(price_df, company_df):
  """Build a one-row-per-id company table aligned with the price columns.

  Args:
    price_df: Price table; only its column labels (the ids) are used.
    company_df: Company list with at least ``Symbol`` and ``List`` columns.
      It is not modified.

  Returns:
    A new DataFrame in which ``Symbol`` is renamed to ``id``, only ids that
    appear in ``price_df.columns`` are kept (ids absent from ``company_df``
    get a row whose other fields are NaN), ``List`` is replaced by
    ``List1``/``List2`` holding the first/last ``List`` value found for the
    id, and duplicate ids are collapsed to their first row.
  """
  companies = company_df.rename(columns={'Symbol': 'id'})

  # Ids that have prices but no company record get a placeholder row.
  # All placeholders are added with a single concat: DataFrame.append no
  # longer exists in pandas 2.x, and appending row by row copied the whole
  # frame on every iteration.
  missing_ids = price_df.columns[~price_df.columns.isin(companies['id'])]
  if len(missing_ids) > 0:
    placeholders = pd.DataFrame({'id': list(missing_ids)})
    companies = pd.concat([companies, placeholders], ignore_index=True)

  # Keep only companies that have a price column. The copy makes the column
  # assignments below operate on an independent frame rather than on a slice.
  companies = companies[companies['id'].isin(price_df.columns)].copy()

  # An id may be listed more than once; keep its first and last listing.
  # iloc is used (not 'first'/'last') so that NaN entries are not skipped.
  listings = companies['List'].groupby(companies['id'])
  companies['List1'] = listings.transform(lambda s: s.iloc[0])
  companies['List2'] = listings.transform(lambda s: s.iloc[-1])

  return (
      companies.drop(columns='List')
      .drop_duplicates(subset='id')
      .reset_index(drop=True)
  )

# Company table reduced to one row per id found in the price columns.
company_df2 = organize_company(price_df, company_df)

In [5]:
def organize_price(price_df, company_df):
  """Turn the wide price table into a long table with target columns.

  Args:
    price_df: Wide price table with one row per index label (treated as the
      date) and one column per id. It is not modified.
    company_df: Table with an ``id`` column and a ``Name`` column; ``Name``
      is dropped and the remaining columns are joined onto the result.

  Returns:
    A long DataFrame with columns ``Date``, ``id``, ``y`` (log1p of the
    price), ``y_prev``, ``y_diff``, ``y_diff_std``, ``y_diff_norm`` followed
    by the joined company columns. It includes rows for
    ``Config.submission_date`` whose ``y`` is NaN.
  """
  # Log-transform the prices.
  res = price_df.apply(np.log1p)
  # Row of unknown prices for the submission date (added when the label is
  # absent, overwritten when it already exists).
  res.loc[pd.to_datetime(Config.submission_date), :] = np.nan

  # Unpivot to one row per (Date, id), row-major and keeping NaN cells. This
  # yields the same rows and order as stack(dropna=False).reset_index() but
  # does not depend on the legacy stack implementation, which is deprecated
  # in recent pandas releases.
  n_dates, n_ids = res.shape
  res = pd.DataFrame({
      'Date': res.index.repeat(n_ids),
      'id': np.tile(res.columns.to_numpy(), n_dates),
      'y': res.to_numpy().ravel(),
  })

  # Add target variables.
  y_by_id = res['y'].groupby(res['id'])
  # Previous value within each id; missing entries (including the first row
  # of every id) are back-filled from the next available one.
  # Series.bfill() replaces the deprecated fillna(method='bfill').
  res['y_prev'] = y_by_id.transform(lambda s: s.shift(1).bfill())
  res['y_diff'] = res['y'] - res['y_prev']
  # NOTE(review): despite its name this is the per-id standard deviation of
  # y, not of y_diff. Kept unchanged because downstream code may rely on the
  # current values -- confirm which one is intended.
  res['y_diff_std'] = y_by_id.transform(lambda s: s.std())
  res['y_diff_norm'] = res['y_diff'] / res['y_diff_std']

  # Join with company_df (inner join on id).
  res = res.merge(company_df.drop(columns='Name'), on='id')

  return res


# Long-format price table joined with the organized company table.
price_df2 = organize_price(price_df, company_df2)

## ファイル出力

In [6]:
# Write both tables as gzip-compressed CSV. The compression is passed
# explicitly rather than inferred from the file name. No `index=` argument is
# given, so pandas' default applies and the row index is written as well.
price_df2.to_csv(f'{Config.initial_dir_name}/train_data2.csv.gzip', compression='gzip')
company_df2.to_csv(f'{Config.initial_dir_name}/company_list2.csv.gzip', compression='gzip')