In [30]:
import warnings
warnings.filterwarnings('ignore')

import pandas as pd
import numpy as np
import sklearn as sk
import xgboost as xgb
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

import plotly.offline as py
py.init_notebook_mode(connected=True)
import plotly.graph_objs as go
import plotly.tools as tls

from sklearn.ensemble import (RandomForestClassifier, AdaBoostClassifier, 
                              GradientBoostingClassifier, ExtraTreesClassifier)
from sklearn.model_selection import KFold
from sklearn.svm import SVC

from tools.data_tools import get_files, read_file


In [6]:
# Fetch the dataset archives; the result is the mapping displayed in the next cell.
g = get_files()

Downloading data from https://github.com/suzg/fddc2018-01-data/releases/download/20180529/BalanceSheet-Bank.tar.gz
Downloading data from https://github.com/suzg/fddc2018-01-data/releases/download/20180529/BalanceSheet-GeneralBusiness.tar.gz
Downloading data from https://github.com/suzg/fddc2018-01-data/releases/download/20180529/BalanceSheet-Insurance.tar.gz
Downloading data from https://github.com/suzg/fddc2018-01-data/releases/download/20180529/BalanceSheet-Securities.tar.gz
Downloading data from https://github.com/suzg/fddc2018-01-data/releases/download/20180529/CashFlowStatement-Bank.tar.gz
Downloading data from https://github.com/suzg/fddc2018-01-data/releases/download/20180529/CashFlowStatement-GeneralBusiness.tar.gz
Downloading data from https://github.com/suzg/fddc2018-01-data/releases/download/20180529/CashFlowStatement-Insurance.tar.gz
Downloading data from https://github.com/suzg/fddc2018-01-data/releases/download/20180529/CashFlowStatement-Securities.tar.gz
Downloading data

In [7]:
# Show the CSV-name -> local-path mapping returned by get_files().
g

{'BalanceSheet-Bank.csv': '/home/lijing/.keras/fddc2018-01/BalanceSheet-Bank.csv',
 'BalanceSheet-GeneralBusiness.csv': '/home/lijing/.keras/fddc2018-01/BalanceSheet-GeneralBusiness.csv',
 'BalanceSheet-Insurance.csv': '/home/lijing/.keras/fddc2018-01/BalanceSheet-Insurance.csv',
 'BalanceSheet-Securities.csv': '/home/lijing/.keras/fddc2018-01/BalanceSheet-Securities.csv',
 'CashFlowStatement-Bank.csv': '/home/lijing/.keras/fddc2018-01/CashFlowStatement-Bank.csv',
 'CashFlowStatement-GeneralBusiness.csv': '/home/lijing/.keras/fddc2018-01/CashFlowStatement-GeneralBusiness.csv',
 'CashFlowStatement-Insurance.csv': '/home/lijing/.keras/fddc2018-01/CashFlowStatement-Insurance.csv',
 'CashFlowStatement-Securities.csv': '/home/lijing/.keras/fddc2018-01/CashFlowStatement-Securities.csv',
 'CompanyOperation.csv': '/home/lijing/.keras/fddc2018-01/CompanyOperation.csv',
 'IncomeStatement-Bank.csv': '/home/lijing/.keras/fddc2018-01/IncomeStatement-Bank.csv',
 'IncomeStatement-GeneralBusiness.csv'

In [8]:
def get_data(filename, flist, ticker_symbol, report_type):
    """Load one statement file and return one company's rows for one report type.

    Parameters
    ----------
    filename : name of the file handed to ``read_file``.
    flist : columns to keep. Must contain 'END_DATE' and 'REPORT_TYPE',
        because both are used after the column selection.
    ticker_symbol : value matched against the 'TICKER_SYMBOL' column.
    report_type : value matched against the 'REPORT_TYPE' column.

    Returns
    -------
    DataFrame sorted by 'END_DATE' with at most one row per date. The index
    is NOT reset after the final report-type filter.
    """
    raw = read_file(filename)

    # Restrict to the requested company first, then to the requested columns.
    company_rows = raw[raw['TICKER_SYMBOL'] == ticker_symbol]
    selected = company_rows[flist]

    # One row per END_DATE: after sorting, the last row of each date is kept.
    by_date = selected.sort_values(by='END_DATE')
    deduped = by_date.drop_duplicates('END_DATE', keep='last')
    deduped = deduped.reset_index(drop=True)

    # NOTE(review): de-duplication runs before this filter, so a row of the
    # requested report type is lost when a row of another report type shares
    # its END_DATE and sorts last -- confirm this ordering is intended.
    return deduped[deduped['REPORT_TYPE'] == report_type]

In [9]:
# Load the cash-flow, balance-sheet and income statements for ticker '000002',
# report type 'S1'. Each frame is re-indexed from 0 because get_data leaves
# the pre-filter index in place.
cfs_data = get_data(
    filename='CashFlowStatement-GeneralBusiness.csv',
    flist=['TICKER_SYMBOL', 'REPORT_TYPE', 'END_DATE',
           'C_FR_SALE_G_S', 'N_CHANGE_IN_CASH',
           'N_CE_BEG_BAL', 'N_CE_END_BAL'],
    ticker_symbol='000002',
    report_type='S1',
).reset_index(drop=True)

bs_data = get_data(
    filename='BalanceSheet-GeneralBusiness.csv',
    flist=['TICKER_SYMBOL', 'REPORT_TYPE', 'END_DATE',
           'CASH_C_EQUIV', 'T_EQUITY_ATTR_P', 'MINORITY_INT',
           'T_SH_EQUITY', 'T_LIAB_EQUITY'],
    ticker_symbol='000002',
    report_type='S1',
).reset_index(drop=True)

is_data = get_data(
    filename='IncomeStatement-GeneralBusiness.csv',
    flist=['TICKER_SYMBOL', 'END_DATE', 'REPORT_TYPE', 'REVENUE'],
    ticker_symbol='000002',
    report_type='S1',
).reset_index(drop=True)

In [10]:

# Build the held-out sample: cash-flow rows from position 9 onward joined
# column-wise with balance-sheet rows from position 8 onward, identifier
# columns removed. The two offsets differ because the frames have different
# lengths -- presumably both point at the same reporting date; verify.
test_data = (
    cfs_data.iloc[9:]
    .drop(columns=['TICKER_SYMBOL', 'REPORT_TYPE', 'END_DATE'])
    .reset_index(drop=True)
    .join(
        bs_data.iloc[8:]
        .drop(columns=['TICKER_SYMBOL', 'REPORT_TYPE', 'END_DATE'])
        .reset_index(drop=True)
    )
)

test_data

Unnamed: 0,C_FR_SALE_G_S,N_CHANGE_IN_CASH,N_CE_BEG_BAL,N_CE_END_BAL,CASH_C_EQUIV,T_EQUITY_ATTR_P,MINORITY_INT,T_SH_EQUITY,T_LIAB_EQUITY
0,151399100000.0,20193140000.0,79490010000.0,99683160000.0,107563200000.0,112640700000.0,48517100000.0,161157800000.0,929265900000.0


In [11]:
# Remove the rows that are not used for training, by index label, and
# re-index from 0 so the three frames line up row-for-row when joined.
# This cell rebinds its own inputs: running it a second time fails, because
# the dropped labels no longer exist after the re-index.
cfs_data = cfs_data.drop(labels=[0, 9], axis=0).reset_index(drop=True)
is_data = is_data.drop(labels=[0, 9], axis=0).reset_index(drop=True)
bs_data = bs_data.drop(labels=8, axis=0).reset_index(drop=True)

In [12]:
# Target frame: only the REVENUE column of the trimmed income statement.
# With rows 0 and 9 already dropped from is_data, the remaining periods are
# 2009-06-30 ~ 2016-06-30 (the 2008-06-30 and 2017-06-30 rows are gone).
y = is_data.drop(['TICKER_SYMBOL', 'END_DATE', 'REPORT_TYPE'], axis=1)

In [13]:
# Assemble the training frame by joining, on the shared 0..n-1 index, the
# cash-flow features, the balance-sheet features and the REVENUE target.
train_data = (
    cfs_data.drop(columns=['TICKER_SYMBOL', 'REPORT_TYPE', 'END_DATE'])
    .join(bs_data.drop(columns=['TICKER_SYMBOL', 'REPORT_TYPE', 'END_DATE']))
    .join(y)
)

train_data

Unnamed: 0,C_FR_SALE_G_S,N_CHANGE_IN_CASH,N_CE_BEG_BAL,N_CE_END_BAL,CASH_C_EQUIV,T_EQUITY_ATTR_P,MINORITY_INT,T_SH_EQUITY,T_LIAB_EQUITY,REVENUE
0,27326850000.0,6902138000.0,19978290000.0,26880420000.0,26880420000.0,34521270000.0,7283670000.0,41804940000.0,124519900000.0,21808650000.0
1,28870520000.0,-4003392000.0,22002770000.0,17999380000.0,19111310000.0,39772950000.0,8646855000.0,48419810000.0,160512700000.0,16766330000.0
2,50406830000.0,4949835000.0,35096940000.0,40046770000.0,40779470000.0,46204530000.0,11602950000.0,57807480000.0,260960300000.0,19988840000.0
3,48164280000.0,12105620000.0,33614110000.0,45719730000.0,47012190000.0,55099820000.0,14734160000.0,69833970000.0,330401200000.0,30722990000.0
4,67220440000.0,-14872710000.0,51120220000.0,36247510000.0,37603630000.0,66644630000.0,21420850000.0,88065480000.0,432242000000.0,41390350000.0
5,65222810000.0,-1291825000.0,43004150000.0,41712320000.0,42861720000.0,76888470000.0,28831410000.0,105719900000.0,501774300000.0,40961900000.0
6,75712390000.0,-17882790000.0,61653320000.0,43770530000.0,44612560000.0,87272690000.0,30527160000.0,117799900000.0,536937600000.0,50266800000.0
7,127349400000.0,19159590000.0,51747620000.0,70907210000.0,71867990000.0,97460730000.0,40713320000.0,138174000000.0,712306700000.0,74795290000.0


In [None]:
# colormap = plt.cm.RdBu
# plt.figure(figsize=(14,12))
# plt.title('Pearson Correlation of Features', y=1.05, size=15)
# sns.heatmap(train_data.astype(float).corr(),linewidths=0.1,vmax=1.0, 
#             square=True, cmap=colormap, linecolor='white', annot=True)

In [37]:
# Split the training frame into a 1-D target vector and a feature matrix.
# `.values` is used for the target as well: it yields the same 1-D array as
# Series.ravel(), which newer pandas releases deprecate.
y_train = train_data['REVENUE'].values
train = train_data.drop(['REVENUE'], axis=1)
x_train = train.values  # Creates an array of the train data
x_test = test_data.values  # Creates an array of the test data

In [49]:
# Preview the integer-cast targets, i.e. the labels SklearnHelper.train
# passes to the underlying estimator's fit().
y_train.astype('int')

array([21808652427, 16766330449, 19988838077, 30722991215, 41390345567,
       40961902094, 50266797992, 74795294306])

In [16]:
def get_oof(clf, x_train, y_train, x_test, folds=None):
    """Compute out-of-fold (OOF) predictions for one base model.

    For every fold the model is trained on the fold's training rows and then
    predicts (a) that fold's validation rows and (b) the whole test set.
    The per-fold test predictions are averaged.

    Parameters
    ----------
    clf : object exposing ``train(x, y)`` and ``predict(x)``.
    x_train, y_train : arrays indexable by integer index arrays.
    x_test : array passed unchanged to ``clf.predict``.
    folds : optional iterable of ``(train_index, test_index)`` pairs, or a
        splitter object with a ``split`` method. Defaults to the
        notebook-level ``kf``.

    Returns
    -------
    (oof_train, oof_test) : column vectors of shape ``(n_train, 1)`` and
    ``(n_test, 1)``.

    Raises
    ------
    ValueError : if no folds are supplied.
    """
    if folds is None:
        folds = kf  # notebook-level fold definition
    # Accept both a splitter object (has .split) and a plain iterable of
    # index pairs, so the function does not depend on one KFold API.
    if hasattr(folds, 'split'):
        folds = folds.split(x_train)
    folds = list(folds)
    if not folds:
        raise ValueError('get_oof needs at least one (train, test) fold')

    # Sizes come from the inputs rather than from notebook globals, so the
    # function no longer depends on cells that run after its definition.
    n_train = x_train.shape[0]
    n_test = x_test.shape[0]

    oof_train = np.zeros((n_train,))
    oof_test_skf = np.empty((len(folds), n_test))

    for i, (train_index, test_index) in enumerate(folds):
        clf.train(x_train[train_index], y_train[train_index])

        # Rows held out of this fold receive their prediction here.
        oof_train[test_index] = clf.predict(x_train[test_index])
        oof_test_skf[i, :] = clf.predict(x_test)

    oof_test = oof_test_skf.mean(axis=0)
    return oof_train.reshape(-1, 1), oof_test.reshape(-1, 1)

In [53]:
# Shared settings for the out-of-fold stacking step
ntrain = train_data.shape[0]
ntest = test_data.shape[0]
SEED = 0 # for reproducibility
NFOLDS = 4 # set folds for out-of-fold prediction
# The folds are materialised as a list of (train_index, test_index) pairs so
# that code iterating over `kf` directly keeps working. No shuffling is
# requested, so the split is deterministic and needs no random_state.
kf = list(KFold(n_splits=NFOLDS).split(np.arange(ntrain)))

# Class to extend the Sklearn classifier
class SklearnHelper(object):
    """Uniform train/predict wrapper around an estimator class.

    Parameters
    ----------
    clf : estimator class (not an instance); called as ``clf(**params)``.
    seed : value stored under the ``random_state`` keyword.
    params : optional dict of constructor keywords. It is copied, so the
        caller's dict is left untouched; ``None`` means no extra keywords.
    """

    def __init__(self, clf, seed=0, params=None):
        # Copy before adding random_state: avoids mutating the shared
        # parameter dict and makes the params=None default usable.
        params = dict(params) if params is not None else {}
        params['random_state'] = seed
        self.clf = clf(**params)

    def train(self, x_train, y_train):
        """Fit the estimator on ``x_train`` with ``y_train`` cast to int."""
        self.clf.fit(x_train, y_train.astype('int'))

    def predict(self, x):
        """Return the estimator's predictions for ``x``."""
        return self.clf.predict(x)

    def fit(self, x, y):
        """Fit the estimator on ``x``/``y`` as given and return fit()'s result."""
        return self.clf.fit(x, y)

    def feature_importances(self, x, y):
        """Refit on ``x``/``y``, print the feature importances and return them."""
        importances = self.clf.fit(x, y).feature_importances_
        print(importances)
        # Returned as well as printed so callers can keep the values
        # instead of copying them out of the cell output.
        return importances

In [20]:
# Constructor keywords for each base model.

# Random Forest (alternative tried earlier: max_features=0.2)
rf_params = dict(
    n_jobs=-1,
    n_estimators=500,
    warm_start=True,
    max_depth=6,
    min_samples_leaf=2,
    max_features='sqrt',
    verbose=0,
)

# Extra Trees (alternative tried earlier: max_features=0.5)
et_params = dict(
    n_jobs=-1,
    n_estimators=500,
    max_depth=8,
    min_samples_leaf=2,
    verbose=0,
)

# AdaBoost
ada_params = dict(
    n_estimators=500,
    learning_rate=0.75,
)

# Gradient Boosting (alternative tried earlier: max_features=0.2)
gb_params = dict(
    n_estimators=500,
    max_depth=5,
    min_samples_leaf=2,
    verbose=0,
)

# Support Vector Classifier
svc_params = dict(
    kernel='linear',
    C=0.025,
)

In [57]:
# Create 5 helper objects, one per base model, all sharing the same seed.
rf, et, ada, gb, svc = [
    SklearnHelper(clf=model_cls, seed=SEED, params=model_params)
    for model_cls, model_params in (
        (RandomForestClassifier, rf_params),
        (ExtraTreesClassifier, et_params),
        (AdaBoostClassifier, ada_params),
        (GradientBoostingClassifier, gb_params),
        (SVC, svc_params),
    )
]

In [58]:
# Out-of-fold train/test predictions of every base model; these become the
# input features of the second-level model. Must run while x_train / x_test
# still hold the original features (a later cell rebinds both names).
(
    (et_oof_train, et_oof_test),    # Extra Trees
    (rf_oof_train, rf_oof_test),    # Random Forest
    (ada_oof_train, ada_oof_test),  # AdaBoost
    (gb_oof_train, gb_oof_test),    # Gradient Boost
    (svc_oof_train, svc_oof_test),  # Support Vector Classifier
) = [get_oof(model, x_train, y_train, x_test) for model in (et, rf, ada, gb, svc)]

In [60]:
# Refit each tree-based helper on the full training set (integer-cast target)
# and print its per-feature importances, one array per model.
rf_feature = rf.feature_importances(x_train,y_train.astype('int'))
et_feature = et.feature_importances(x_train, y_train.astype('int'))
ada_feature = ada.feature_importances(x_train, y_train.astype('int'))
gb_feature = gb.feature_importances(x_train,y_train.astype('int'))

[0.077 0.086 0.074 0.095 0.098 0.069 0.073 0.081 0.105]
[0.08033333 0.12666667 0.10933333 0.05566667 0.077      0.117
 0.12033333 0.13366667 0.118     ]
[0.048 0.314 0.112 0.14  0.106 0.062 0.076 0.064 0.078]
[0.01820363 0.06478902 0.01753671 0.02239344 0.02153682 0.01149377
 0.01144917 0.01151205 0.0143354 ]


In [62]:
# Take the importances straight from the estimators fitted in the previous
# cell instead of pasting numbers copied from its printed output, so the
# values always match the current fit.
rf_features = rf.clf.feature_importances_
et_features = et.clf.feature_importances_
ada_features = ada.clf.feature_importances_
gb_features = gb.clf.feature_importances_

In [63]:
# One row per input feature, one column per model's importance scores.
cols = train.columns.values
feature_dataframe = pd.DataFrame(data={
    'features': cols,
    'Random Forest feature importances': rf_features,
    'Extra Trees  feature importances': et_features,
    'AdaBoost feature importances': ada_features,
    'Gradient Boost feature importances': gb_features,
})

In [64]:
# Add a column with the row-wise mean importance across the models.
# Only the importance columns are averaged: the text column 'features' is
# excluded explicitly (newer pandas raises on mixed dtypes), and so is an
# existing 'mean' column, so re-running the cell gives the same result.
importance_cols = [c for c in feature_dataframe.columns
                   if c not in ('features', 'mean')]
feature_dataframe['mean'] = feature_dataframe[importance_cols].mean(axis=1)
feature_dataframe.head(3)

Unnamed: 0,features,Random Forest feature importances,Extra Trees feature importances,AdaBoost feature importances,Gradient Boost feature importances,mean
0,C_FR_SALE_G_S,0.077,0.080333,0.048,0.018204,0.055884
1,N_CHANGE_IN_CASH,0.086,0.126667,0.314,0.064789,0.147864
2,N_CE_BEG_BAL,0.074,0.109333,0.112,0.017537,0.078218


In [65]:
# Side-by-side view of the out-of-fold training predictions of the four
# tree-based models (each flattened from a column vector to 1-D).
base_predictions_train = pd.DataFrame(data={
    'RandomForest': rf_oof_train.ravel(),
    'ExtraTrees': et_oof_train.ravel(),
    'AdaBoost': ada_oof_train.ravel(),
    'GradientBoost': gb_oof_train.ravel(),
})
base_predictions_train.head()

Unnamed: 0,RandomForest,ExtraTrees,AdaBoost,GradientBoost
0,19988840000.0,19988840000.0,41390350000.0,19988840000.0
1,19988840000.0,19988840000.0,41390350000.0,19988840000.0
2,16766330000.0,21808650000.0,40961900000.0,21808650000.0
3,21808650000.0,21808650000.0,40961900000.0,74795290000.0
4,30722990000.0,50266800000.0,19988840000.0,50266800000.0


In [67]:
# Second-level inputs: one column per base model's out-of-fold prediction.
# NOTE: this rebinds x_train / x_test, replacing the original feature arrays;
# cells that need the raw features must run before this one.
x_train = np.hstack([et_oof_train, rf_oof_train, ada_oof_train, gb_oof_train, svc_oof_train])
x_test = np.hstack([et_oof_test, rf_oof_test, ada_oof_test, gb_oof_test, svc_oof_test])

In [68]:
# Second-level model: an XGBoost classifier fitted on the stacked base-model
# predictions, then used to predict the held-out sample.
# NOTE(review): the objective is 'binary:logistic' while y_train holds eight
# distinct revenue values -- confirm a binary classifier is what is intended.
gbm = xgb.XGBClassifier(
    # learning_rate=0.02,
    n_estimators=2000,
    max_depth=4,
    min_child_weight=2,
    # gamma=1,
    gamma=0.9,
    subsample=0.8,
    colsample_bytree=0.8,
    objective='binary:logistic',
    nthread=-1,
    scale_pos_weight=1,
)
gbm.fit(x_train, y_train)
predictions = gbm.predict(x_test)

In [69]:
# Show the second-level model's prediction for the held-out sample.
predictions

array([1.67663304e+10])

In [71]:
# Reload the income statement from file: the earlier is_data had rows 0 and 9
# dropped, so this restores the complete series (presumably to compare the
# prediction with the reported revenue -- confirm).
is_data = get_data(
    filename='IncomeStatement-GeneralBusiness.csv',
    flist=['TICKER_SYMBOL', 'END_DATE', 'REPORT_TYPE', 'REVENUE'],
    ticker_symbol='000002',
    report_type='S1',
).reset_index(drop=True)

In [72]:
# Show the reloaded income-statement rows (one per END_DATE).
is_data

Unnamed: 0,TICKER_SYMBOL,END_DATE,REPORT_TYPE,REVENUE
0,2,2008-06-30,S1,17255010000.0
1,2,2009-06-30,S1,21808650000.0
2,2,2010-06-30,S1,16766330000.0
3,2,2011-06-30,S1,19988840000.0
4,2,2012-06-30,S1,30722990000.0
5,2,2013-06-30,S1,41390350000.0
6,2,2014-06-30,S1,40961900000.0
7,2,2015-06-30,S1,50266800000.0
8,2,2016-06-30,S1,74795290000.0
9,2,2017-06-30,S1,69810480000.0
