In [1]:
import datetime
import numpy as np
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler
from sklearn.dummy import DummyRegressor
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor

from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_absolute_error

%matplotlib inline

In [2]:
# Read the processed training split, then echo its dimensions and column index.
train = pd.read_csv('../data/processed/train.csv')
print('shape: ', (len(train), train.columns.size))
print('columns', train.keys())

shape:  (20631, 68)
columns Index(['id', 'cycle', 'setting1', 'setting2', 's1', 's2', 's3', 's4', 's5',
       's6', 's7', 's8', 's9', 's10', 's11', 's12', 's13', 's14', 's15', 's16',
       's17', 's20', 's21', 'RUL', 'label1', 'label2', 'avg_setting1',
       'std_setting1', 'avg_setting2', 'std_setting2', 'avg_s1', 'std_s1',
       'avg_s2', 'std_s2', 'avg_s3', 'std_s3', 'avg_s4', 'std_s4', 'avg_s5',
       'std_s5', 'avg_s6', 'std_s6', 'avg_s7', 'std_s7', 'avg_s8', 'std_s8',
       'avg_s9', 'std_s9', 'avg_s10', 'std_s10', 'avg_s11', 'std_s11',
       'avg_s12', 'std_s12', 'avg_s13', 'std_s13', 'avg_s14', 'std_s14',
       'avg_s15', 'std_s15', 'avg_s16', 'std_s16', 'avg_s17', 'std_s17',
       'avg_s20', 'std_s20', 'avg_s21', 'std_s21'],
      dtype='object')


In [3]:
# Read the processed test split, then echo its dimensions and column index.
test = pd.read_csv('../data/processed/test.csv')
print('shape: ', (len(test), test.columns.size))
print('columns', test.keys())

shape:  (13096, 68)
columns Index(['id', 'cycle', 'setting1', 'setting2', 's1', 's2', 's3', 's4', 's5',
       's6', 's7', 's8', 's9', 's10', 's11', 's12', 's13', 's14', 's15', 's16',
       's17', 's20', 's21', 'RUL', 'label1', 'label2', 'avg_setting1',
       'std_setting1', 'avg_setting2', 'std_setting2', 'avg_s1', 'std_s1',
       'avg_s2', 'std_s2', 'avg_s3', 'std_s3', 'avg_s4', 'std_s4', 'avg_s5',
       'std_s5', 'avg_s6', 'std_s6', 'avg_s7', 'std_s7', 'avg_s8', 'std_s8',
       'avg_s9', 'std_s9', 'avg_s10', 'std_s10', 'avg_s11', 'std_s11',
       'avg_s12', 'std_s12', 'avg_s13', 'std_s13', 'avg_s14', 'std_s14',
       'avg_s15', 'std_s15', 'avg_s16', 'std_s16', 'avg_s17', 'std_s17',
       'avg_s20', 'std_s20', 'avg_s21', 'std_s21'],
      dtype='object')


In [4]:
# Feature columns: every column of the test frame except the engine
# identifier and the three target/label columns. Original column order is kept.
features = test.columns
features = features[~features.isin(['id', 'RUL', 'label1', 'label2'])]

In [5]:
# Regression target and raw feature matrix, both taken from the training frame.
y = train['RUL'].values
X = train[features].values

# Rescale the features with a min-max scaler fitted on the training features.
# NOTE: MinMaxScaler has not been validated as the best scaler for this data.
scaler = MinMaxScaler()
scaler.fit(X)
X = scaler.transform(X)

### Regression

In [6]:
def compare_y_and_y_pred(y_true, y_pred):
    """Print true and predicted target values side by side.

    Both sequences are placed in a DataFrame with the columns ``y_true`` and
    ``y_pred`` and the frame is written to stdout. Nothing is returned.
    """
    side_by_side = pd.DataFrame(data={'y_true': y_true, 'y_pred': y_pred})
    print(side_by_side)

In [7]:
def train_and_evaluate(models, X=None, y=None, cv=10, groups=None):
    """Fit each model and print its in-sample and cross-validated MAE.

    Parameters
    ----------
    models : iterable of (str, estimator)
        Display name paired with a scikit-learn style regressor. Each
        estimator is fitted in place on the full ``X`` / ``y``.
    X, y : array-like, optional
        Feature matrix and target. When omitted, the notebook-level ``X``
        and ``y`` are used, so ``train_and_evaluate(models)`` keeps working
        exactly as before.
    cv : int or cross-validation splitter, default 10
        Forwarded unchanged to ``cross_val_score``.
    groups : array-like, optional
        Forwarded unchanged to ``cross_val_score``. It only has an effect
        when ``cv`` is a group-aware splitter (e.g. ``GroupKFold``); with an
        integer ``cv`` it is ignored by scikit-learn.

    Returns
    -------
    None
        Results are printed only, so calling this as the last expression of
        a cell produces no extra output.
    """
    # Backward compatibility: fall back to the notebook globals instead of
    # silently depending on them, so callers can evaluate on other data.
    if X is None or y is None:
        notebook_ns = globals()
        X = notebook_ns['X'] if X is None else X
        y = notebook_ns['y'] if y is None else y

    for model_name, model in models:
        print("\n*****\nModel: ", model_name)

        # In-sample error: the model is scored on the same rows it was
        # fitted on, so this is a training error, not a generalisation one.
        model.fit(X, y)
        y_pred = model.predict(X)
        print('Mean Absolute Error: ', mean_absolute_error(y, y_pred))

        # cross validation; the scorer returns negated MAE, hence the minus.
        scores = -cross_val_score(model, X, y, groups=groups,
                                  scoring='neg_mean_absolute_error', cv=cv)
        # One score per fold, so len(scores) is the actual number of folds
        # whether `cv` was given as an int or as a splitter object.
        print("Mean Absolute Error (CV={}): ".format(len(scores)), scores.mean())

#### Decision Tree Regression

In [8]:
# Two constant-prediction baselines followed by decision trees of increasing
# depth; every tree uses the same seed so runs are repeatable.
models = [
    ('dummy-' + strategy, DummyRegressor(strategy=strategy))
    for strategy in ('mean', 'median')
] + [
    ('DT. max_depth={}'.format(depth),
     DecisionTreeRegressor(max_depth=depth, random_state=2017))
    for depth in (4, 8, 16, 32)
]

train_and_evaluate(models)


*****
Model:  dummy-mean
Mean Absolute Error:  56.8586020181
Mean Absolute Error (CV=10):  56.9523873899

*****
Model:  dummy-median
Mean Absolute Error:  56.7297271097
Mean Absolute Error (CV=10):  56.7867027068

*****
Model:  DT. max_depth=4
Mean Absolute Error:  26.7831861643
Mean Absolute Error (CV=10):  28.5751974896

*****
Model:  DT. max_depth=8
Mean Absolute Error:  21.8840605891
Mean Absolute Error (CV=10):  29.6262965844

*****
Model:  DT. max_depth=16
Mean Absolute Error:  4.86359142809
Mean Absolute Error (CV=10):  34.742304956

*****
Model:  DT. max_depth=32
Mean Absolute Error:  6.46276638715e-05
Mean Absolute Error (CV=10):  35.7238144272


#### Random Forest Regression

In [9]:
# Random forests over a 2x2 grid of tree count and features per split, with
# depth and seed held fixed. The tree count varies fastest, which gives the
# order (8, 8), (32, 8), (8, 32), (32, 32) as (trees, features).
models = [
    ('RF, trees={}, features={}'.format(n_trees, n_feats),
     RandomForestRegressor(n_estimators=n_trees, max_features=n_feats,
                           max_depth=32, random_state=2017))
    for n_feats in (8, 32)
    for n_trees in (8, 32)
]

train_and_evaluate(models)


*****
Model:  RF, trees=8, features=8
Mean Absolute Error:  9.28461289631
Mean Absolute Error (CV=10):  28.5180629068

*****
Model:  RF, trees=32, features=8
Mean Absolute Error:  8.1428753303
Mean Absolute Error (CV=10):  27.0698155674

*****
Model:  RF, trees=8, features=32
Mean Absolute Error:  7.91646351964
Mean Absolute Error (CV=10):  28.1311452202

*****
Model:  RF, trees=32, features=32
Mean Absolute Error:  6.92853420688
Mean Absolute Error (CV=10):  27.0315575822


#### Boosted Trees Regression