In [1]:
import numpy as np
import pandas as pd

%matplotlib inline 

from fnmatch import fnmatch


import matplotlib.pyplot as plt


# set some nicer defaults for matplotlib
from matplotlib import rcParams

#these colors come from colorbrewer2.org. Each is an RGB triplet
dark2_colors = [(0.10588235294117647, 0.6196078431372549, 0.4666666666666667),
                (0.8509803921568627, 0.37254901960784315, 0.00784313725490196),
                (0.4588235294117647, 0.4392156862745098, 0.7019607843137254),
                (0.9058823529411765, 0.1607843137254902, 0.5411764705882353),
                (0.4, 0.6509803921568628, 0.11764705882352941),
                (0.9019607843137255, 0.6705882352941176, 0.00784313725490196),
                (0.6509803921568628, 0.4627450980392157, 0.11372549019607843),
                (0.4, 0.4, 0.4)]

rcParams['figure.figsize'] = (20, 10)
rcParams['figure.dpi'] = 150
rcParams['axes.color_cycle'] = dark2_colors
rcParams['lines.linewidth'] = 2
rcParams['axes.grid'] = True
rcParams['axes.facecolor'] = '#eeeeee'
rcParams['font.size'] = 14
rcParams['patch.edgecolor'] = 'none'



In [2]:
data = pd.read_csv('../data/MaterialInputDataMachineLearning.txt', delim_whitespace=True, header=None)
result = pd.read_csv('../data/OutputDataMachineLearning.txt', delim_whitespace=True, header=None)

In [3]:
data.head()

Unnamed: 0,0,1,2,3,4,5,6
0,1,9744375000.0,0.356487,526664100000.0,0.279271,0.03,0.3
1,2,6488627000.0,0.302243,123211300000.0,0.137124,0.03,0.3
2,3,1963719000.0,0.330547,458857200000.0,0.315956,0.03,0.3
3,4,9960097000.0,0.382041,145596300000.0,0.123606,0.03,0.3
4,5,6437510000.0,0.34852,864565200000.0,0.129599,0.03,0.3


In [4]:
data = data[[1,2,3,4,6]]

In [5]:
result[4] = 0

In [6]:
from sklearn.ensemble import RandomForestRegressor
rf = RandomForestRegressor(n_estimators = 200)

In [7]:
from sklearn.cross_validation import cross_val_score

np.mean(cross_val_score(rf,data, result[1], cv = 4))
#rf.fit(data[[2,3,4,6]], result[1])



0.93161181969234808

In [8]:
result.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,1,26646100000.0,13065000000.0,279349000.0,0,26760400000.0,413994000.0,0.0,0.0,6373300000.0
1,2,14166300000.0,5466300000.0,116244000.0,0,14217200000.0,202405000.0,0.0,0.0,4154900000.0
2,3,4963860000.0,2177040000.0,56934400.0,0,4988410000.0,84753000.0,0.0,0.0,1310140000.0
3,4,28257000000.0,15149200000.0,272210000.0,0,28331000000.0,400427000.0,0.0,0.0,6193700000.0
4,5,17237400000.0,8116000000.0,186127000.0,0,17318500000.0,276928000.0,0.0,0.0,4274870000.0


In [9]:
for i in [1,2,3,5,6,9]:
    print i,"  ",np.mean(cross_val_score(rf,data, result[i], cv = 4))

1    0.929420671841
2    0.949611321663
3    0.846643278556
5    0.930476083289
6    0.848916349557
9    0.968211895611


In [10]:
data.head()

Unnamed: 0,1,2,3,4,6
0,9744375000.0,0.356487,526664100000.0,0.279271,0.3
1,6488627000.0,0.302243,123211300000.0,0.137124,0.3
2,1963719000.0,0.330547,458857200000.0,0.315956,0.3
3,9960097000.0,0.382041,145596300000.0,0.123606,0.3
4,6437510000.0,0.34852,864565200000.0,0.129599,0.3


In [11]:
to_predict = pd.DataFrame([[68.3e+09,0.3,379.3e+09,0.1,0.47]])
to_predict.columns = [1,2,3,4,6]

In [12]:
import xgboost as xgb
params = {"objective": "reg:linear",
          "booster" : "gbtree",
          "eta": 0.3,
          "max_depth": 2,
          "subsample": 0.8,
          'gamma':1.0,
          "silent": 1,
          "seed": 1301
          }
num_boost_round = 1000

res = []
for i in [1,2,3,5,6,9]:
    dtrain = xgb.DMatrix(data, result[i])
    gbm = xgb.train(params, dtrain, num_boost_round)
    pred = gbm.predict(xgb.DMatrix(to_predict))
    print i," ",pred
    res.append(pred[0])

1   [  3.23474289e+10]
2   [  1.00508795e+10]
3   [  4.72337984e+08]
5   [  3.36007844e+10]
6   [  8.34120960e+08]
9   [  1.08749281e+10]


In [13]:
res

[3.2347429e+10,
 1.0050879e+10,
 4.7233798e+08,
 3.3600784e+10,
 8.3412096e+08,
 1.0874928e+10]

In [14]:
phi = 2*res[5]/res[0]
phi

0.67238284524696124

In [15]:
nu_c = (phi-1)/(phi-2)
nu_c

0.24677080555951506

In [16]:
E = 2*res[5]*(1+nu_c)
E/(1e+9)

27.117085805096778

In [17]:
rf = RandomForestRegressor(n_estimators = 200)
rf.fit(data, result[1])

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_split=1e-07, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           n_estimators=200, n_jobs=1, oob_score=False, random_state=None,
           verbose=0, warm_start=False)

In [18]:
for i in [1,2,3,5,6,9]:
    rf.fit(data, result[i])
    print i," ",rf.predict(np.array([68.3e+09,0.3,379.3e+09,0.1,0.47]).reshape(1, -1))

1   [  3.54113595e+10]
2   [  1.14778940e+10]
3   [  4.41498610e+08]
5   [  3.57556855e+10]
6   [  7.84961440e+08]
9   [  1.07222615e+10]


In [19]:
(3.54919840e+10-2*1.07627915e+10)/1e+9

13.966401

In [20]:
data.describe()

Unnamed: 0,1,2,3,4,6
count,300.0,300.0,300.0,300.0,300.0
mean,5574183000.0,0.305267,498751100000.0,0.222166,0.4
std,2836223000.0,0.059083,265490000000.0,0.073148,0.081786
min,1078529000.0,0.201779,46105440000.0,0.10223,0.3
25%,2931881000.0,0.251675,293149300000.0,0.158937,0.3
50%,5475158000.0,0.312091,487237100000.0,0.216204,0.4
75%,8189232000.0,0.355053,691583900000.0,0.285937,0.5
max,9960097000.0,0.398134,973916500000.0,0.349943,0.5


In [21]:
68.3e+09,0.3,379.3e+09,0.1,0.47

(68300000000.0, 0.3, 379300000000.0, 0.1, 0.47)