In [47]:
import utils, pickle
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

In [48]:
df = pd.read_csv('mine.csv')

In [49]:
df.head()

Unnamed: 0,slno,pH,BDef,PCO2,BE
0,1,7.14,8.14,7.7,-10.5
1,2,7.0,7.92,12.0,-12.0
2,3,7.2,3.03,8.3,-5.6
3,4,7.3,5.19,5.5,-6.4
4,5,7.3,-4.52,5.7,-5.8


In [50]:
df.tail()

Unnamed: 0,slno,pH,BDef,PCO2,BE
547,548,7.33,-0.5,6.6,-0.8
548,549,7.08,10.92,7.9,-13.3
549,550,7.02,9.13,10.6,-12.3
550,551,7.03,8.91,10.4,-12.2
551,552,7.01,0.0,0.0,0.0


In [51]:
df.shape

(552, 5)

In [52]:
df.dtypes

slno      int64
pH      float64
BDef    float64
PCO2    float64
BE      float64
dtype: object

In [53]:
df.isnull().sum()

slno    0
pH      0
BDef    0
PCO2    0
BE      0
dtype: int64

In [54]:
df.drop(['slno'],axis=1)

Unnamed: 0,pH,BDef,PCO2,BE
0,7.14,8.14,7.7,-10.5
1,7.00,7.92,12.0,-12.0
2,7.20,3.03,8.3,-5.6
3,7.30,5.19,5.5,-6.4
4,7.30,-4.52,5.7,-5.8
5,7.23,1.29,8.2,-3.8
6,7.16,4.35,8.8,-6.8
7,7.36,3.88,4.9,-4.6
8,7.18,7.60,7.0,-9.6
9,7.35,5.20,4.7,-5.9


In [55]:
from sklearn.model_selection import train_test_split
x = df.drop('slno', axis=1)
y = df['slno']

x_train, x_val, y_train, y_val = train_test_split(x, y, test_size=0.2, random_state=42)
print(f'Dimension Data train : {x_train.shape}')
print(f'Dimension Data test : {x_val.shape}')

Dimension Data train : (441, 4)
Dimension Data test : (111, 4)


In [56]:
from sklearn.ensemble import RandomForestClassifier

m = RandomForestClassifier(n_estimators=10, n_jobs=-1)
m.fit(x_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=-1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [57]:
def score():
    print(f'Scores:')
    print(f'Train      = {m.score(x_train, y_train):.4}')
    print(f'Validation = {m.score(x_val, y_val):.4}')
    
score()

Scores:
Train      = 0.9546
Validation = 0.0


In [58]:
x_train
y_train

388    389
516    517
210    211
15      16
336    337
39      40
54      55
163    164
46      47
93      94
192    193
341    342
228    229
237    238
286    287
551    552
408    409
367    368
158    159
319    320
481    482
332    333
184    185
400    401
69      70
209    210
310    311
280    281
312    313
124    125
      ... 
510    511
58      59
474    475
252    253
21      22
313    314
459    460
160    161
276    277
191    192
385    386
413    414
491    492
343    344
308    309
130    131
99     100
372    373
87      88
458    459
330    331
214    215
466    467
121    122
20      21
71      72
106    107
270    271
435    436
102    103
Name: slno, Length: 441, dtype: int64

In [59]:
preds = np.stack([t.predict(x_val) for t in m.estimators_])
print(preds.shape)
preds

(10, 111)


array([[192., 207., 231., ..., 347., 380., 285.],
       [192., 207., 173., ..., 342., 125., 285.],
       [192., 207., 428., ..., 369., 125., 376.],
       ...,
       [ 96., 299., 428., ..., 347., 125., 285.],
       [436., 183.,  78., ..., 370., 373.,  70.],
       [192., 387., 428., ..., 347., 373., 214.]])

In [60]:
sample = x_val.sample(1)

pred = np.stack([t.predict(sample) for t in m.estimators_])

pred.mean() > 0.5, pred.std()

(True, 145.42162837762476)

In [61]:
pred

array([[ 20.],
       [331.],
       [116.],
       [ 20.],
       [331.],
       [ 11.],
       [284.],
       [331.],
       [ 20.],
       [331.]])

In [62]:
m = RandomForestClassifier(n_estimators=40, n_jobs=-1, oob_score=True)
m.fit(x_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=40, n_jobs=-1,
            oob_score=True, random_state=None, verbose=0, warm_start=False)

In [63]:
m.oob_score_

0.0

In [64]:
def score():
    print(f'Scores:')
    print(f'Train      = {m.score(x_train, y_train):.4}')
    print(f'Validation = {m.score(x_val, y_val):.4}')
    if hasattr(m, 'oob_score_'): print(f'OOB        = {m.oob_score_:.4}')
    
score()   

Scores:
Train      = 0.9773
Validation = 0.0
OOB        = 0.0


In [65]:
m.feature_importances_

array([0.19709159, 0.26990429, 0.22840072, 0.30460341])

In [68]:
imp = pd.DataFrame({'column_name':x_train.columns, 'values':m.feature_importances_}).sort_values('values', ascending=False)
imp.style.bar()

Unnamed: 0,column_name,values
3,BE,0.304603
1,BDef,0.269904
2,PCO2,0.228401
0,pH,0.197092


In [69]:
df = np.array([[600,1,2,40,3,60000,2,1,1,50000]])
df.shape

(1, 10)

In [70]:
y_pred = m.predict(x_val)
print("Predictions:\n {}".format(y_pred))

Predictions:
 [248 265 533 467 545 530 434 398 396  24  57 165   6 301 490  62 503 128
 487 100 381 517 320  26 131 418 504  47 418 137 359 533 504 114 220 181
 250 458 287 447  42  29 161 449 137 534 320  46 348 413 246  99 242 153
 410 215 356 323 276 320 418  92  22 340 427 238 381 145 261 218 124  92
 148 170 452 380 447  89 455 417   5 126 137 237 241 142 242 255  22 253
 100 356 506 422 145 405  75 386  97 304  46 181 202  97 494 433 521 485
 332 169 160]


In [71]:
np.mean(y_pred == y_val)
m.score(x_val, y_val)
m.oob_score_

0.0