In [129]:
import warnings
warnings.filterwarnings('ignore')

import pandas as pd
import numpy as np
import sklearn as sk
import xgboost as xgb
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

from tools.data_tools import get_files, read_file

from sklearn.ensemble import (RandomForestClassifier, RandomForestRegressor,
                              AdaBoostClassifier, GradientBoostingClassifier,
                              ExtraTreesClassifier)

import plotly.offline as py
py.init_notebook_mode(connected=True)
import plotly.graph_objs as go
import plotly.tools as tls


from sklearn.svm import SVC
from sklearn.cross_validation import KFold
from sklearn.model_selection import cross_validate
from sklearn.model_selection import cross_val_score

In [34]:
# Call the project helper get_files() (imported from tools.data_tools) and keep its result.
# NOTE(review): presumably a listing of the available data files - confirm in tools.data_tools.
# `g` is not referenced again in the cells visible here.
g = get_files()

In [35]:
def get_data(filename, flist, ticker_symbol, report_type):
    """Load one statement file and keep the latest row per ticker and period.

    Parameters
    ----------
    filename : passed unchanged to ``read_file``.
    flist : list of column names to keep. It must contain 'TICKER_SYMBOL',
        'END_DATE', 'PUBLISH_DATE' and 'REPORT_TYPE', which are used below.
    ticker_symbol : accepted for backward compatibility but NOT used; no
        per-ticker filtering is applied (the old filter was disabled).
    report_type : only rows whose REPORT_TYPE equals this value are kept.

    Returns
    -------
    DataFrame indexed by TICKER_SYMBOL, ordered by PUBLISH_DATE, with at most
    one row per (TICKER_SYMBOL, END_DATE): the most recently published one of
    the requested report type.
    """
    statements = read_file(filename)[flist]

    # Filter on report type BEFORE de-duplicating. Doing it afterwards lets a
    # later-published row of a different report type win the (ticker, END_DATE)
    # slot and then get filtered out, silently losing the row we wanted.
    statements = statements[statements['REPORT_TYPE'] == report_type]

    # Sorted by publication date, keep='last' retains the newest publication.
    statements = (
        statements
        .sort_values(by='PUBLISH_DATE')
        .drop_duplicates(subset=['TICKER_SYMBOL', 'END_DATE'], keep='last')
    )

    return statements.set_index('TICKER_SYMBOL')

In [36]:
# Columns get_data needs for filtering and de-duplication; requested for every statement.
_KEY_COLUMNS = ['TICKER_SYMBOL', 'REPORT_TYPE', 'PUBLISH_DATE', 'END_DATE']


def _load_s1_statement(filename, value_columns):
    """Load the 'S1' rows of one statement file with gaps filled.

    Requests the key columns plus `value_columns` through get_data, drops the
    two bookkeeping columns (REPORT_TYPE, PUBLISH_DATE), then forward-fills
    and back-fills missing values.

    NOTE(review): the fills run over the whole multi-ticker frame, so a gap
    can be filled from a neighbouring row that belongs to another ticker.
    """
    # The ticker argument ('000002') is forwarded as before.
    loaded = get_data(filename, _KEY_COLUMNS + value_columns, '000002', 'S1')
    trimmed = loaded.drop(columns=['REPORT_TYPE', 'PUBLISH_DATE'])
    return trimmed.ffill().bfill()


# Cash-flow statement features.
cfs_data = _load_s1_statement(
    'CashFlowStatement-GeneralBusiness.csv',
    ['C_FR_SALE_G_S', 'N_CHANGE_IN_CASH', 'N_CE_BEG_BAL', 'N_CE_END_BAL'],
)
# Balance-sheet features.
bs_data = _load_s1_statement(
    'BalanceSheet-GeneralBusiness.csv',
    ['CASH_C_EQUIV', 'T_EQUITY_ATTR_P', 'MINORITY_INT', 'T_SH_EQUITY', 'T_LIAB_EQUITY'],
)
# Income statement: only REVENUE is taken.
is_data = _load_s1_statement('IncomeStatement-GeneralBusiness.csv', ['REVENUE'])

In [37]:
# Inclusive END_DATE window used for training, and the single END_DATE used for testing.
TRAIN_START, TRAIN_END = '2008-01-01', '2016-06-30'
TEST_END_DATE = '2017-06-30'


def _train_rows(frame):
    """Return the rows of `frame` whose END_DATE lies in [TRAIN_START, TRAIN_END]."""
    end_date = frame['END_DATE']
    return frame.loc[(end_date >= TRAIN_START) & (end_date <= TRAIN_END)]


def _test_rows(frame):
    """Return the rows of `frame` whose END_DATE equals TEST_END_DATE."""
    return frame.loc[frame['END_DATE'] == TEST_END_DATE]


train_cfs_data, train_bs_data, train_is_data = (
    _train_rows(frame) for frame in (cfs_data, bs_data, is_data)
)

test_cfs_data, test_bs_data, test_is_data = (
    _test_rows(frame) for frame in (cfs_data, bs_data, is_data)
)

In [38]:
# Take the balance-sheet rows dated 2009-06-30, relabel them as 2008-06-30 and
# add them to the training balance sheet.
# NOTE(review): presumably a stand-in for missing 2008-06-30 balance-sheet rows - confirm.
# NOTE(review): re-running this cell appends the same rows again; re-run the
# cell that builds train_bs_data first.
mask = bs_data['END_DATE'] == '2009-06-30'

# .assign returns a new frame, so nothing is written into a slice of bs_data
# (the original in-place assignment triggered SettingWithCopyWarning).
tmp_data = bs_data.loc[mask].assign(END_DATE=pd.to_datetime('2008-06-30'))

# DataFrame.append was removed in pandas 2.0; pd.concat is the equivalent.
train_bs_data = pd.concat([train_bs_data, tmp_data])

In [44]:
# Target frame: the income-statement rows selected for the training window
# (train_is_data). It still carries END_DATE alongside REVENUE; the variant
# that drops END_DATE is kept here for reference:
# y = train_is_data.drop('END_DATE', axis=1)
y = train_is_data

In [110]:
# Every statement frame is matched on ticker and period end.
MERGE_KEYS = ['TICKER_SYMBOL', 'END_DATE']

# Training frame: cash-flow, balance-sheet and income-statement rows joined
# (inner join) on the shared keys, gaps forward-filled, END_DATE removed.
train_data = (
    train_cfs_data
    .merge(train_bs_data, on=MERGE_KEYS)
    .merge(train_is_data, on=MERGE_KEYS)
    .ffill()
    .drop(columns='END_DATE')
)

# Test frame: cash-flow and balance-sheet rows only (no income statement is
# merged in), END_DATE removed, gaps forward-filled.
test_data = (
    test_cfs_data
    .merge(test_bs_data, on=MERGE_KEYS)
    .drop(columns='END_DATE')
    .ffill()
)

In [111]:
# Debugging aid: uncomment to restrict the frame to the rows of a single ticker.
# train_data = train_data.loc['002248']
# Display (rows, columns) of the merged training frame.
train_data.shape

(21190, 10)

In [118]:
# Feature matrix: every column of the training frame except the target.
feature_frame = train_data.drop(columns='REVENUE')
x_train = feature_frame.values  # train data

# Target vector: REVENUE as a flat 1-D array.
y_train = train_data['REVENUE'].values  # target data

In [125]:
# Sanity check: the feature matrix and the target vector must have the same number of rows.
x_train.shape, y_train.shape

((21190, 9), (21190,))

In [93]:
# Hyper-parameters unpacked into the random-forest constructor in the next cell.
rf_params = {
    'n_jobs': -1,              # use all available cores
    'n_estimators': 500,
    # NOTE(review): warm_start only matters when the same estimator object is
    # fit more than once - confirm it is actually wanted here.
    'warm_start': True,
    'max_depth': 6,
    'min_samples_leaf': 2,
    'max_features': 'sqrt',
    # Fixed seed so that repeated runs of the notebook give the same forest.
    'random_state': 42,
    'verbose': 0,
}

In [None]:
# REVENUE is a monetary amount, not a class label. Casting it to int and
# fitting a classifier turns (almost) every sample into its own class, which
# stratified 5-fold cross-validation cannot split. Model it as a regression.
rf = RandomForestRegressor(**rf_params)

# With a regressor and no explicit `scoring`, cross_val_score uses the
# estimator's own score method, i.e. one R^2 value per fold.
scores = cross_val_score(rf, x_train, y_train, cv=5)


In [None]:
# Display the per-fold scores returned by cross_val_score.
scores