In [1]:
import datetime
import os

import numpy as np
import pandas as pd
from sklearn.cluster import KMeans, MiniBatchKMeans
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.model_selection import KFold, cross_val_score, cross_validate, RandomizedSearchCV, GridSearchCV
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import MinMaxScaler
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

In [4]:
model_name = 'ST3000DM001'
data_dirs = ['../data-2014/', '../data-2015/']
attributes = ['smart_1_raw', 'smart_4_raw', 'smart_5_raw', 'smart_7_raw', 'smart_9_raw', 'smart_12_raw', 'smart_187_raw', 'smart_193_raw', 'smart_194_raw', 'smart_197_raw', 'smart_199_raw']
drive_models = ['ST4000DM000', 'ST3000DM001', 'Hitachi HDS5C3030ALA630', 'Hitachi HDS722020ALA330', 'Hitachi HDS5C4040ALE630', 'HGST HMS5C4040ALE640', 'HGST HMS5C4040BLE640']
features = ['serial_number', 'date', 'capacity_bytes', 'smart_5_raw', 'smart_1_raw', 'smart_4_raw', 'smart_7_raw', 'smart_9_raw', 'smart_12_raw', 'smart_187_raw', 'smart_193_raw', 'smart_194_raw', 'smart_197_raw', 'smart_199_raw', 'smart_196_raw']
type_dict = {feature: np.float32 for feature in features[3:]}

def model_stat(model, X, y):
    f_score = cross_val_score(model, X, y, cv=5, scoring='f1', n_jobs=-1)
    r_score = cross_val_score(model, X, y, cv=5, scoring='recall', n_jobs=-1)
    p_score = cross_val_score(model, X, y, cv=5, scoring='precision', n_jobs=-1)
    return pd.Series({'P': np.mean(p_score), 'R': np.mean(r_score), 'F': np.mean(f_score), 'Sd': np.std(f_score)})

statics = pd.DataFrame(index=['CART', 'SVM', 'NN', 'LR', 'RF'], columns=['P', 'R', 'F', 'Sd'])
statics

Unnamed: 0,P,R,F,Sd
CART,,,,
SVM,,,,
NN,,,,
LR,,,,
RF,,,,


In [41]:
for model_name in drive_models:
    drive_model = pd.concat([(lambda pd: pd.loc[pd['model'] == model_name][features])(pd.read_csv(os.path.join(data_dir, filename), dtype=type_dict)) for data_dir in data_dirs for filename in os.listdir(data_dir)]).sort_values(by=['serial_number', 'date'])
    drive_model.to_csv('./preprocess/' + model_name + '.csv')

In [42]:
res_vars = ['smart_5_raw', 'smart_187_raw', 'smart_196_raw', 'smart_197_raw']
overview = pd.DataFrame(index=drive_models, columns=res_vars+['Capacity (TB)', '# Drives'])
type_dict = {key: np.float64 for key in res_vars}

for model_name in drive_models:
    drive_model = pd.read_csv('./preprocess/' + model_name + '.csv', dtype=type_dict)
    percentage = drive_model.groupby('serial_number')[res_vars].sum().agg(lambda data: data[data>0].size/data.size)
    percentage['Capacity (TB)'] = drive_model.iloc[0]['capacity_bytes'] // 1000**4
    percentage['# Drives'] = drive_model['serial_number'].unique().size
    overview.loc[model_name] = percentage
    
overview.to_csv('./preprocess/overview.csv')
overview

Unnamed: 0,smart_5_raw,smart_187_raw,smart_196_raw,smart_197_raw,Capacity (TB),# Drives
ST4000DM000,0.00966297,0.0183897,0.0,0.0199612,4,29908
ST3000DM001,0.240009,0.398025,0.0,0.325218,3,4354
Hitachi HDS5C3030ALA630,0.0366854,0.0,0.0358222,0.0205006,3,4634
Hitachi HDS722020ALA330,0.140384,0.0,0.139751,0.0542537,2,4737
Hitachi HDS5C4040ALE630,0.0193741,0.0,0.0197466,0.014158,4,2684
HGST HMS5C4040ALE640,0.00781359,0.0,0.00781359,0.00460444,4,7167
HGST HMS5C4040BLE640,0.00192616,0.0,0.00192616,0.00449438,4,3115


In [43]:
drive_model = pd.read_csv('./preprocess/' + model_name + '.csv', dtype=type_dict).iloc[:, 1:]

In [44]:
df = drive_model.sort_values(by=['serial_number', 'date'])
df['date'] = pd.to_datetime(df['date'])
df.head()

Unnamed: 0,serial_number,date,capacity_bytes,smart_5_raw,smart_1_raw,smart_4_raw,smart_7_raw,smart_9_raw,smart_12_raw,smart_187_raw,smart_193_raw,smart_194_raw,smart_197_raw,smart_199_raw,smart_196_raw
0,S1F007YW,2014-01-01,3000592982016,0.0,31238664.0,,,15694.0,,,,23.0,0.0,,
1,S1F007YW,2014-01-02,3000592982016,0.0,166023620.0,,,15718.0,,,,22.0,0.0,,
2,S1F007YW,2014-01-03,3000592982016,0.0,232808220.0,,,15742.0,,,,21.0,0.0,,
3,S1F007YW,2014-01-04,3000592982016,0.0,69066730.0,,,15766.0,,,,21.0,0.0,,
4,S1F007YW,2014-01-05,3000592982016,0.0,136538580.0,,,15790.0,,,,22.0,0.0,,


In [45]:
# week = ['W-SUN', 'W-MON', 'W-TUE', 'W-WED', 'W-THU', 'W-FRI', 'W-SAT']
df2 = df.copy()
df2['date'] += datetime.timedelta(days=1)

day = 'W-MON'
model_group = df.groupby(['serial_number', pd.Grouper(key='date', freq=day)])
y = model_group['smart_5_raw'].last().to_frame().rename({'smart_5_raw': 'y'}, axis='columns')

model_group2 = df2.groupby(['serial_number', pd.Grouper(key='date', freq=day)])
input1 = model_group2[attributes].last()

training_set = y.join(other=input1, how='inner')
training_set['y'] = np.where(training_set['y'] > training_set['smart_5_raw'], 1, 0)
input2_features = ['smart_4_raw', 'smart_5_raw', 'smart_7_raw', 'smart_9_raw', 'smart_12_raw', 'smart_187_raw', 'smart_193_raw', 'smart_197_raw', 'smart_199_raw']
input2 = training_set.groupby(level=0)[input2_features].transform(pd.DataFrame.diff).dropna(how='any')
training_set = training_set.join(other=input2, how='inner', rsuffix='_increase')
    
training_set.to_csv('./preprocess/' + model_name + '_training_set.csv')
training_set.shape

(155442, 21)

In [5]:
training_set = pd.read_csv('./preprocess/' + model_name + '_training_set.csv').drop(['serial_number', 'date'], axis='columns')
training_set.head()

Unnamed: 0,y,smart_1_raw,smart_4_raw,smart_5_raw,smart_7_raw,smart_9_raw,smart_12_raw,smart_187_raw,smart_193_raw,smart_194_raw,...,smart_199_raw,smart_4_raw_increase,smart_5_raw_increase,smart_7_raw_increase,smart_9_raw_increase,smart_12_raw_increase,smart_187_raw_increase,smart_193_raw_increase,smart_197_raw_increase,smart_199_raw_increase
0,0,198836320.0,43.0,0.0,33996100.0,16964.0,43.0,0.0,45110.0,23.0,...,0.0,0.0,0.0,9637436.0,168.0,0.0,0.0,0.0,0.0,0.0
1,0,148600700.0,43.0,0.0,44114000.0,17133.0,43.0,0.0,45110.0,22.0,...,0.0,0.0,0.0,10117900.0,169.0,0.0,0.0,0.0,0.0,0.0
2,0,122530840.0,43.0,0.0,52302384.0,17301.0,43.0,0.0,45110.0,23.0,...,0.0,0.0,0.0,8188384.0,168.0,0.0,0.0,0.0,0.0,0.0
3,0,60100630.0,46.0,0.0,53521120.0,17468.0,46.0,0.0,51014.0,21.0,...,0.0,3.0,0.0,1218736.0,167.0,3.0,0.0,5904.0,0.0,0.0
4,0,161701490.0,47.0,0.0,60648016.0,17635.0,47.0,0.0,52712.0,23.0,...,0.0,1.0,0.0,7126896.0,167.0,1.0,0.0,1698.0,0.0,0.0


In [44]:
# List the columns of the reloaded training set (target `y` plus the input columns).
training_set.columns

Index(['y', 'smart_1_raw', 'smart_4_raw', 'smart_5_raw', 'smart_7_raw',
       'smart_9_raw', 'smart_12_raw', 'smart_187_raw', 'smart_193_raw',
       'smart_194_raw', 'smart_197_raw', 'smart_199_raw',
       'smart_4_raw_increase', 'smart_5_raw_increase', 'smart_7_raw_increase',
       'smart_9_raw_increase', 'smart_12_raw_increase',
       'smart_187_raw_increase', 'smart_193_raw_increase',
       'smart_197_raw_increase', 'smart_199_raw_increase'],
      dtype='object')

In [30]:
from sklearn.tree import _tree

def tree_to_code(tree, feature_names):
    tree_ = tree.tree_
    feature_name = [
        feature_names[i] if i != _tree.TREE_UNDEFINED else "undefined!"
        for i in tree_.feature
    ]
    print("def tree({}):".format(", ".join(feature_names)))

    def recurse(node, depth):
        indent = "  " * depth
        if tree_.feature[node] != _tree.TREE_UNDEFINED:
            name = feature_name[node]
            threshold = tree_.threshold[node]
            print("{}if {} <= {}:".format(indent, name, threshold))
            recurse(tree_.children_left[node], depth + 1)
            print("{}else:  # if {} > {}".format(indent, name, threshold))
            recurse(tree_.children_right[node], depth + 1)
        else:
            print("{}return {}".format(indent, tree_.value[node]))

    recurse(0, 1)


In [31]:
# tune DT
dt_model = DecisionTreeClassifier()
dt_model.fit(X_train, y_train)
tree_to_code(dt_model, training_set.drop(['y'], axis='columns').columns)

def tree(smart_1_raw, smart_4_raw, smart_5_raw, smart_7_raw, smart_9_raw, smart_12_raw, smart_187_raw, smart_193_raw, smart_194_raw, smart_197_raw, smart_199_raw, smart_4_raw_increase, smart_5_raw_increase, smart_7_raw_increase, smart_9_raw_increase, smart_12_raw_increase, smart_187_raw_increase, smart_193_raw_increase, smart_197_raw_increase, smart_199_raw_increase):
  if smart_7_raw <= 0.5141103267669678:
    return [[   0. 3799.]]
  else:  # if smart_7_raw > 0.5141103267669678
    return [[7590.    0.]]


In [None]:
# Column labels of the training set with the target `y` removed.
training_set.drop(['y'], axis='columns').columns

In [32]:
# down sampling
X_health_raw = training_set.loc[training_set['y'] == 0].drop(['y', 'smart_7_raw', 'smart_9_raw'], axis='columns').values
X_fail = training_set.loc[training_set['y'] == 1].drop(['y', 'smart_7_raw', 'smart_9_raw'], axis='columns').values
n_clusters = X_fail.shape[0] // 5
# kmeans = KMeans(n_clusters=n_clusters, random_state=0, n_jobs=-1).fit(X_health_raw)
kmeans = MiniBatchKMeans(n_clusters=n_clusters, random_state=0, batch_size=50000).fit(X_health_raw)
X_health = np.concatenate([X_health_raw[np.argsort(kmeans.transform(X_health_raw)[:, j])[::-1][:10]] for j in range(0, n_clusters)], axis=0)
X_train_raw = np.concatenate((X_health, X_fail), axis=0)

scaler = MinMaxScaler()
X_train = scaler.fit_transform(X_train_raw)
y_train = np.concatenate((np.zeros(X_health.shape[0]), np.ones(X_fail.shape[0])), axis=0) 

print(X_fail.shape)
print(X_train.shape)
print(y_train.shape)

(3799, 18)
(11389, 18)
(11389,)


In [33]:
# cart, rf (20-100 trees), svm, neural networks (3 layers 100 nodes), logistic regression
X_transformed = PCA(n_components=0.95).fit_transform(X_train)
X_transformed.shape

(11389, 8)

In [34]:
# tune LR
lr_model = LogisticRegressionCV(Cs=100, fit_intercept=True, cv=5, 
                                        dual=False, penalty='l2', scoring='f1', 
                                        solver='newton-cg',  max_iter=1000, class_weight='balanced',
                                        n_jobs=-1, refit=True, multi_class='ovr', random_state=0, verbose=1)
lr_model.fit(X_train, y_train)
statics.loc['LR'] = model_stat(lr_model, X_train, y_train)
statics.loc['LR']

[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:   10.4s remaining:   15.6s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:   11.3s finished


P     1.000000
R     0.837327
F     0.910404
Sd    0.034052
Name: LR, dtype: float64

In [35]:
# tune LR
lr_model = LogisticRegressionCV(Cs=100, fit_intercept=True, cv=5, 
                                        dual=False, penalty='l2', scoring='f1', 
                                        solver='newton-cg',  max_iter=1000, class_weight='balanced',
                                        n_jobs=-1, refit=True, multi_class='ovr', random_state=0, verbose=1)
lr_model.fit(X_transformed, y_train)
statics.loc['LR'] = model_stat(lr_model, X_transformed, y_train)
statics.loc['LR']

[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:    3.0s remaining:    4.5s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    3.8s finished


P     0.574484
R     0.813889
F     0.672989
Sd    0.044650
Name: LR, dtype: float64

In [43]:
# print(np.sum(y))
# print(np.sum(y_train))
# training_set.loc[training_set['y'] == 1].drop(['y'], axis='columns').head()
rf_model.fit(X_train, y_train)
rf_model.feature_importances_

array([1.50848012e-02, 3.00575373e-02, 1.85648043e-01, 4.57764367e-03,
       1.80090308e-01, 5.25039308e-02, 1.84854078e-03, 2.22339059e-02,
       3.23804393e-02, 2.51713725e-07, 3.94904706e-02, 2.73948703e-01,
       2.19117058e-02, 3.49078540e-04, 6.55946181e-02, 7.40431769e-02,
       2.36845055e-04, 0.00000000e+00])

In [37]:
# tune SVM
svm_model = SVC(C=1.05, kernel='rbf', gamma=0.05, class_weight='balanced', max_iter=-1, random_state=0)
statics.loc['SVM'] = model_stat(svm_model, X_transformed, y_train)
statics.loc['SVM']

P     0.586992
R     0.853114
F     0.695316
Sd    0.022692
Name: SVM, dtype: float64

In [38]:
# tune RF
rf_model = RandomForestClassifier()
statics.loc['RF'] = model_stat(rf_model, X_transformed, y_train)
statics.loc['RF']

P     1.000000
R     0.999473
F     0.999868
Sd    0.000263
Name: RF, dtype: float64

In [39]:
# # rf_model.fit(X_, y_train)
# print(rf_model.feature_importances_)
# training_set.loc[training_set['y'] == 0].drop(['y'], axis='columns').columns

In [40]:
# tune DT
dt_model = DecisionTreeClassifier()
statics.loc['CART'] = model_stat(dt_model, X_transformed, y_train)
statics.loc['CART']

P     1.000000
R     0.997894
F     0.999341
Sd    0.000723
Name: CART, dtype: float64

In [41]:
# tune NN
nn_model = MLPClassifier(hidden_layer_sizes=(100, 100, 100))
statics.loc['NN'] = model_stat(nn_model, X_transformed, y_train)
statics.loc['NN']

P     1.000000
R     0.999473
F     0.999868
Sd    0.000263
Name: NN, dtype: float64

In [42]:
statics.T.to_csv('./preprocess/' + model_name + '_finalstatics.csv')
statics.T

Unnamed: 0,CART,SVM,NN,LR,RF
P,1.0,0.586992,1.0,0.574484,1.0
R,0.997894,0.853114,0.999473,0.813889,0.999473
F,0.999341,0.695316,0.999868,0.672989,0.999868
Sd,0.000723,0.022692,0.000263,0.04465,0.000263
