In [42]:
import pandas as pd
import numpy as np
import seaborn as sb
import multiprocessing
import matplotlib.pyplot as plt
from sklearn.model_selection import cross_val_score
import random
from xgboost import XGBRegressor, plot_importance
from sklearn.preprocessing import PolynomialFeatures
from sklearn.feature_selection import RFE
import pickle

In [28]:
# Leave one CPU free for the rest of the system; `cores` is later passed to
# the XGBoost estimators as n_jobs.
available_cpus = multiprocessing.cpu_count()
cores = available_cpus - 1

# Raw data set; the h_score column is the regression target used below.
df = pd.read_csv('data/hscore.csv')

In [29]:
# Preview the first five rows to sanity-check the load.
df.head(n=5)

Unnamed: 0,h_score,sales,cost_of_sales,operating_profit,exceptional_items,interest_income_and_other,interest_expense,tax,other_post_tax,dividends,...,accounts_payable,other_current_liabilities,group_balances_payable,long_term_debt,long_liabilities,other_long_term_liabilities,minority_interest,total_shareholders_equity,retained_earnings,industry_code_id
0,30.0,34972079.0,-26230862.0,-997323.0,0.0,-108425.0,-0.0,180927.0,0.0,0.0,...,3019820.0,9076305.0,0.0,0.0,0.0,0.0,0.0,10072754.0,0.0,573.0
1,77.0,42426695.0,-0.0,4184573.0,0.0,244.0,-20.0,-0.0,-422587.0,0.0,...,6533160.0,6886774.0,0.0,106.0,0.0,957137.0,0.0,6795091.0,0.0,796.0
2,74.0,46654061.0,-0.0,4761506.0,0.0,5075.0,-768.0,-0.0,-561247.0,0.0,...,7229508.0,9692861.0,0.0,0.0,0.0,947137.0,0.0,7237447.0,0.0,796.0
3,74.0,41124364.0,-0.0,4377908.0,0.0,-1279.0,-0.0,-0.0,-575168.0,0.0,...,7578606.0,6860763.0,0.0,296734.0,0.0,909137.0,0.0,6834342.0,0.0,796.0
4,90.0,53801000.0,-0.0,5910000.0,0.0,-745000.0,-182000.0,-0.0,0.0,0.0,...,4498000.0,1732000.0,0.0,0.0,0.0,52000.0,0.0,23208000.0,0.0,573.0


In [30]:
# Column dtypes and non-null counts; used to spot columns with missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6040 entries, 0 to 6039
Data columns (total 32 columns):
h_score                                       6040 non-null float64
sales                                         6040 non-null float64
cost_of_sales                                 6040 non-null float64
operating_profit                              6040 non-null float64
exceptional_items                             6016 non-null float64
interest_income_and_other                     6040 non-null float64
interest_expense                              6040 non-null float64
tax                                           6040 non-null float64
other_post_tax                                6040 non-null float64
dividends                                     6040 non-null float64
depreciation                                  6040 non-null float64
amortisation_and_impairment_of_intangibles    6040 non-null float64
cash_and_equivalents                          6040 non-null float64
short_ter

# Distribution of the regression target.
# NOTE(review): distplot is deprecated in recent seaborn releases - confirm the
# installed version before upgrading.
sb.distplot(a=df['h_score'])

# Every column except the target is plotted against it.
names = [column for column in df.columns if column != 'h_score']

# One scatter plot with a fitted regression line per feature. Each figure is
# rendered and closed before the next one is created, so the loop never holds
# dozens of open figures at once (which triggers matplotlib's open-figure
# warning and keeps them all in memory until the cell finishes).
for name in names:
    fig, ax = plt.subplots()
    sb.regplot(y="h_score", x=name, data=df, ax=ax)
    plt.show()
    plt.close(fig)

# Pairwise correlation of all columns, drawn as an annotated heatmap on an
# explicitly created 15x12 inch figure.
corr = df.corr()
fig, ax = plt.subplots(figsize=(15, 12))
sb.heatmap(
    corr,
    annot=True,
    xticklabels=corr.columns,
    yticklabels=corr.columns,
    ax=ax,
)

In [31]:
# Number of missing values per column in the raw data.
df.isna().sum()

h_score                                         0
sales                                           0
cost_of_sales                                   0
operating_profit                                0
exceptional_items                              24
interest_income_and_other                       0
interest_expense                                0
tax                                             0
other_post_tax                                  0
dividends                                       0
depreciation                                    0
amortisation_and_impairment_of_intangibles      0
cash_and_equivalents                            0
short_term_investments                          0
accounts_receivable                             0
other_current_receivables                       0
group_balances_receivable                       0
inventory                                       0
intangible_assets                               0
fixed_assets                                    0


In [32]:
# Keep only the rows where exceptional_items is present.
has_exceptional_items = df['exceptional_items'].notnull()
df = df.loc[has_exceptional_items]

In [33]:
# Missing industry codes get one fixed sentinel instead of a freshly drawn
# random integer: the fill is reproducible across runs and cannot coincide with
# a real industry code by chance.
# NOTE(review): assumes genuine industry_code_id values are non-negative - confirm.
UNKNOWN_INDUSTRY_CODE = -1

# Assign the filled column back instead of calling fillna(inplace=True) on a
# column selection, which may operate on a temporary copy and leave df unchanged.
df = df.assign(industry_code_id=df['industry_code_id'].fillna(UNKNOWN_INDUSTRY_CODE))

In [34]:
# Re-count missing values per column after the cleaning steps above.
df.isna().sum()

h_score                                       0
sales                                         0
cost_of_sales                                 0
operating_profit                              0
exceptional_items                             0
interest_income_and_other                     0
interest_expense                              0
tax                                           0
other_post_tax                                0
dividends                                     0
depreciation                                  0
amortisation_and_impairment_of_intangibles    0
cash_and_equivalents                          0
short_term_investments                        0
accounts_receivable                           0
other_current_receivables                     0
group_balances_receivable                     0
inventory                                     0
intangible_assets                             0
fixed_assets                                  0
other_non_current_assets                

In [35]:
# Split the frame into the regression target (y) and the feature matrix (X).
# drop() without inplace=True returns a new frame, so df keeps h_score and this
# cell can be re-run without a KeyError on an already-removed column.
y = df['h_score']
X = df.drop('h_score', axis=1)

### XGB

In [38]:
# Baseline: 10-fold cross-validation of a 1000-tree XGBoost regressor on all
# features.
xgb = XGBRegressor(n_estimators=1000, n_jobs=cores)
xgb_scores = cross_val_score(
    xgb,
    X,
    y,
    scoring='neg_mean_squared_error',
    cv=10,
)

# NOTE: the label says "accuracy", but the value is the mean negative MSE over
# the folds (closer to zero is better); the spread is two standard deviations.
xgb_mean = xgb_scores.mean()
xgb_spread = 2 * xgb_scores.std()
print("XGB accuracy: %0.2f (+/- %0.2f)" % (xgb_mean, xgb_spread))

XGB accuracy: -118.14 (+/- 39.93)


### XGB polynomial

Additional research into polynomial features showed that industry_code_id is an important secondary feature. Let's make a few combinations of it with the most important first-level features.

=> XGB polynomial accuracy: -120.75 (+/- 43.13)

### XGB with RFE

In [39]:
# Base estimator for recursive feature elimination; same settings as the
# baseline model.
xgb_rfe = XGBRegressor(n_estimators=1000, n_jobs=cores)

In [40]:
# Recursive feature elimination down to 18 features, removing one feature per
# step, evaluated with the same 10-fold cross-validation as the baseline.
selector = RFE(estimator=xgb_rfe, n_features_to_select=18, step=1)
xgb_rfe_scores = cross_val_score(
    selector,
    X,
    y,
    scoring='neg_mean_squared_error',
    cv=10,
)

# NOTE: the label says "accuracy", but the value is the mean negative MSE over
# the folds (closer to zero is better); the spread is two standard deviations.
rfe_mean = xgb_rfe_scores.mean()
rfe_spread = 2 * xgb_rfe_scores.std()
print("XGB with RFE accuracy: %0.2f (+/- %0.2f)" % (rfe_mean, rfe_spread))

XGB with RFE accuracy: -117.48 (+/- 39.76)


So with 18 of the 31 features we get the same quality. Let's remove all unnecessary features from the data set by using the final version of XGB with RFE applied.

In [43]:
# Fit the RFE wrapper on the full data set so it can be pickled and used for
# prediction in the cells below.
selector.fit(X=X, y=y)

RFE(estimator=XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=1000,
       n_jobs=9, nthread=None, objective='reg:linear', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1),
  n_features_to_select=18, step=1, verbose=0)

In [44]:
# Persist the fitted RFE selector. The context manager closes the file even if
# pickle.dump raises, which the manual open()/close() pair did not guarantee.
with open("big_model", 'wb') as fileObject:
    pickle.dump(selector, fileObject)

In [48]:
# Side-by-side view of the reference h_score and the model's prediction.
# NOTE(review): `xgb` is rebound here from the estimator to a prediction array,
# and the predictions are made on the same rows the selector was fitted on, so
# this comparison is in-sample.
xgb = selector.predict(X)
compare_columns = {'companywatch': y, 'ml-score': xgb}
compare = pd.DataFrame(compare_columns)
compare.head(n=5)

Unnamed: 0,companywatch,ml-score
0,30.0,24.782354
1,77.0,79.032509
2,74.0,75.001007
3,74.0,77.165871
4,90.0,84.362801
