In [60]:
import numpy as np
import pandas as pd
import os, datetime
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.svm import SVC
from sklearn.model_selection import KFold, cross_val_score, RandomizedSearchCV, GridSearchCV
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.preprocessing import MinMaxScaler
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier

In [107]:
# Directory whose files are read one by one with pd.read_csv during preprocessing.
data_dir = '../data-2015/'

# SMART columns used as model inputs.
attributes = [
    'smart_1_raw', 'smart_4_raw', 'smart_5_raw', 'smart_7_raw',
    'smart_9_raw', 'smart_12_raw', 'smart_187_raw', 'smart_193_raw',
    'smart_194_raw', 'smart_197_raw', 'smart_199_raw',
]

# Values of the 'model' column for which a per-model CSV is produced.
drive_models = [
    'ST4000DM000',
    'ST3000DM001',
    'Hitachi HDS5C3030ALA630',
    'Hitachi HDS722020ALA330',
    'Hitachi HDS5C4040ALE630',
    'HGST HMS5C4040ALE640',
    'HGST HMS5C4040BLE640',
]

# Columns kept in the per-model CSVs. The first three are identifiers/metadata;
# everything from position 3 onwards is a SMART counter (see type_dict below).
features = [
    'serial_number', 'date', 'capacity_bytes',
    'smart_5_raw', 'smart_1_raw', 'smart_4_raw', 'smart_7_raw',
    'smart_9_raw', 'smart_12_raw', 'smart_187_raw', 'smart_193_raw',
    'smart_194_raw', 'smart_197_raw', 'smart_199_raw', 'smart_196_raw',
]

# Every SMART counter is parsed as float32 when the raw CSVs are read.
type_dict = dict.fromkeys(features[3:], np.float32)

def model_stat(model, X, y, cv=5):
    """Cross-validate ``model`` and return its mean F1, recall and precision.

    All three metrics are computed in a single ``cross_validate`` run, so each
    fold is fitted once and every metric is measured on the same fitted models
    (previously each metric triggered its own full cross-validation).

    Parameters
    ----------
    model : estimator passed to scikit-learn's cross-validation.
    X : feature matrix.
    y : binary target vector.
    cv : cross-validation strategy forwarded to scikit-learn (default 5,
        matching the previous hard-coded value).

    Returns
    -------
    pd.Series with keys 'f1_score', 'recall' and 'precision'; each value is
    the fold-averaged score converted to ``str``.
    """
    # Local import keeps this function self-contained; the notebook's import
    # cell does not bring cross_validate into scope.
    from sklearn.model_selection import cross_validate

    metrics = ('f1', 'recall', 'precision')
    cv_results = cross_validate(model, X, y, cv=cv, scoring=metrics, n_jobs=-1)
    # cross_validate reports each metric's per-fold scores under 'test_<metric>'.
    f_score, r_score, p_score = (
        np.average(cv_results['test_' + metric]) for metric in metrics
    )
    return pd.Series({'f1_score': str(f_score), 'recall': str(r_score), 'precision': str(p_score)})

In [97]:
# Build one CSV per drive model from the daily snapshot files in data_dir.
# Each snapshot is parsed once and filtered to the models of interest, instead
# of re-reading the whole directory once per model.
os.makedirs('./preprocess/', exist_ok=True)

snapshot_frames = []
for filename in sorted(os.listdir(data_dir)):
    # Skip directory entries that are not CSV snapshots.
    if not filename.endswith('.csv'):
        continue
    snapshot = pd.read_csv(os.path.join(data_dir, filename), dtype=type_dict)
    wanted_rows = snapshot['model'].isin(drive_models)
    # Keep 'model' alongside the feature columns so rows can be split per model below.
    snapshot_frames.append(snapshot.loc[wanted_rows, ['model'] + features])
all_models = pd.concat(snapshot_frames)

for model_name in drive_models:
    drive_model = (
        all_models
        .loc[all_models['model'] == model_name, features]
        .sort_values(by=['serial_number', 'date'])
    )
    # The index is written as the first CSV column, as before.
    drive_model.to_csv('./preprocess/' + model_name + '.csv')

In [17]:
# SMART counters summarised per drive model.
res_vars = ['smart_5_raw', 'smart_187_raw', 'smart_196_raw', 'smart_197_raw']
overview_columns = res_vars + ['Capacity (TB)', '# Drives']
overview = pd.DataFrame(index=drive_models, columns=overview_columns)
# NOTE: this rebinds type_dict; later cells that pass dtype=type_dict see this mapping.
type_dict = dict.fromkeys(res_vars, np.float64)


def positive_fraction(data):
    """Share of entries in ``data`` that are strictly greater than zero."""
    return data[data>0].size/data.size


for model_name in drive_models:
    csv_path = './preprocess/' + model_name + '.csv'
    drive_model = pd.read_csv(csv_path, dtype=type_dict)

    # Sum each counter over all rows of a drive, then measure which share of
    # drives ended up with a positive total.
    per_drive_totals = drive_model.groupby('serial_number')[res_vars].sum()
    percentage = per_drive_totals.agg(positive_fraction)

    # Capacity is taken from the first row of the file only.
    percentage['Capacity (TB)'] = drive_model.iloc[0]['capacity_bytes'] // 1000**4
    percentage['# Drives'] = len(drive_model['serial_number'].unique())
    overview.loc[model_name] = percentage

overview.to_csv('./preprocess/overview.csv')
overview

Unnamed: 0,smart_5_raw,smart_187_raw,smart_196_raw,smart_197_raw,Capacity (TB),# Drives
ST4000DM000,0.00738119,0.0154702,0.0,0.0150657,4,29670
ST3000DM001,0.120719,0.177226,0.0,0.0702055,3,1168
Hitachi HDS5C3030ALA630,0.0323491,0.0,0.032132,0.0123752,3,4606
Hitachi HDS722020ALA330,0.133675,0.0,0.133675,0.0365151,2,4683
Hitachi HDS5C4040ALE630,0.0150376,0.0,0.0154135,0.0075188,4,2660
HGST HMS5C4040ALE640,0.0060317,0.0,0.0060317,0.00266517,4,7129
HGST HMS5C4040BLE640,0.000966806,0.0,0.000966806,0.00257815,4,3103


In [3]:
# Drive model whose preprocessed CSV is turned into a training set below.
model_name = 'ST3000DM001'
preprocessed_path = './preprocess/' + model_name + '.csv'
# dtype uses whichever type_dict is currently bound in the kernel (it is assigned
# in more than one cell). The first CSV column is dropped by position.
drive_model = pd.read_csv(preprocessed_path, dtype=type_dict).iloc[:, 1:]

In [4]:
# Order readings per drive chronologically and parse 'date' into datetimes so it
# can be used for time-based grouping.
df = (
    drive_model
    .sort_values(by=['serial_number', 'date'])
    .assign(date=lambda frame: pd.to_datetime(frame['date']))
)
df.head()

Unnamed: 0,serial_number,date,capacity_bytes,smart_5_raw,smart_1_raw,smart_4_raw,smart_7_raw,smart_9_raw,smart_12_raw,smart_187_raw,smart_193_raw,smart_194_raw,smart_197_raw,smart_199_raw,smart_196_raw
0,S1F01085,2015-01-01,3000592982016,56.0,215630672.0,74.0,994661632.0,24621.0,71.0,52.0,407438.0,19.0,0.0,7.0,
1,S1F01085,2015-01-02,3000592982016,56.0,1650864.0,74.0,995929536.0,24645.0,71.0,52.0,407438.0,19.0,0.0,7.0,
2,S1F01085,2015-01-03,3000592982016,56.0,124017368.0,74.0,997235904.0,24669.0,71.0,52.0,407438.0,19.0,0.0,7.0,
3,S1F01085,2015-01-04,3000592982016,56.0,128073224.0,74.0,998608128.0,24693.0,71.0,52.0,407439.0,19.0,0.0,7.0,
4,S1F01085,2015-01-05,3000592982016,56.0,97393448.0,74.0,999190720.0,24717.0,71.0,52.0,408114.0,18.0,0.0,7.0,


In [93]:
# Bucket each drive's readings into weeks anchored on Monday (freq 'W-MON').
weekly_bucket = pd.Grouper(key='date', freq='W-MON')
model_group = df.groupby(['serial_number', weekly_bucket])
# Label source: the last smart_5_raw reading inside each (drive, week) bucket.
# (An earlier variant instead flagged a week when the weekly sum of smart_5_raw
# was positive.)
y = model_group['smart_5_raw'].last().to_frame(name='y')
y.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,y
serial_number,date,Unnamed: 2_level_1
S1F01085,2015-01-05,56.0
S1F01085,2015-01-12,56.0
S1F013BB,2015-05-11,0.0
S1F0166B,2015-01-05,0.0
S1F0166B,2015-01-12,0.0


In [94]:
# Move every reading one day forward before weekly bucketing. The last reading in
# each shifted bucket is therefore taken one calendar day earlier than the last
# reading of the same bucket built from the unshifted dates.
one_day = datetime.timedelta(days=1)
df2 = df.assign(date=df['date'] + one_day)
shifted_weekly_bucket = pd.Grouper(key='date', freq='W-MON')
model_group2 = df2.groupby(['serial_number', shifted_weekly_bucket])
# Feature source: last value of every input attribute per (drive, shifted week).
input1 = model_group2[attributes].last()
input1.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,smart_1_raw,smart_4_raw,smart_5_raw,smart_7_raw,smart_9_raw,smart_12_raw,smart_187_raw,smart_193_raw,smart_194_raw,smart_197_raw,smart_199_raw
serial_number,date,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
S1F01085,2015-01-05,128073224.0,74.0,56.0,998608100.0,24693.0,71.0,52.0,407439.0,19.0,0.0,7.0
S1F01085,2015-01-12,128832128.0,74.0,56.0,999259100.0,24742.0,71.0,52.0,409404.0,18.0,0.0,7.0
S1F013BB,2015-05-11,85127128.0,10.0,0.0,8720651000.0,20794.0,9.0,0.0,689062.0,22.0,0.0,0.0
S1F013BB,2015-05-18,115676688.0,10.0,0.0,8720652000.0,20818.0,9.0,0.0,689161.0,22.0,0.0,0.0
S1F0166B,2015-01-05,82195688.0,74.0,0.0,4588262.0,24696.0,71.0,0.0,403175.0,24.0,0.0,0.0


In [135]:
# Keep only the (serial_number, week) keys present in both the label frame and
# the feature frame; 'y' stays the first column.
training_set = y.join(input1, how='inner')
training_set.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,y,smart_1_raw,smart_4_raw,smart_5_raw,smart_7_raw,smart_9_raw,smart_12_raw,smart_187_raw,smart_193_raw,smart_194_raw,smart_197_raw,smart_199_raw
serial_number,date,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
S1F01085,2015-01-05,56.0,128073224.0,74.0,56.0,998608100.0,24693.0,71.0,52.0,407439.0,19.0,0.0,7.0
S1F01085,2015-01-12,56.0,128832128.0,74.0,56.0,999259100.0,24742.0,71.0,52.0,409404.0,18.0,0.0,7.0
S1F013BB,2015-05-11,0.0,85127128.0,10.0,0.0,8720651000.0,20794.0,9.0,0.0,689062.0,22.0,0.0,0.0
S1F0166B,2015-01-05,0.0,82195688.0,74.0,0.0,4588262.0,24696.0,71.0,0.0,403175.0,24.0,0.0,0.0
S1F0166B,2015-01-12,0.0,7441792.0,74.0,0.0,5201943.0,24743.0,71.0,0.0,404786.0,22.0,0.0,0.0


In [136]:
# Turn 'y' into a 0/1 target: 1 where the label-side smart_5_raw value is
# strictly larger than the feature-side smart_5_raw, 0 otherwise.
# This overwrites 'y' in place, so re-running the cell compares the 0/1 labels
# (not the raw counter) against smart_5_raw.
label_exceeds_feature = training_set['y'] > training_set['smart_5_raw']
training_set['y'] = label_exceeds_feature.astype(int)
# Number of positive samples.
training_set['y'].sum()

135

In [129]:
# Counters for which a row-to-row change within each drive is added as a feature.
input2_features = [
    'smart_4_raw', 'smart_5_raw', 'smart_7_raw', 'smart_9_raw', 'smart_12_raw',
    'smart_187_raw', 'smart_193_raw', 'smart_197_raw', 'smart_199_raw',
]
# Difference between consecutive rows of the same serial_number (index level 0).
per_drive_diff = training_set.groupby(level=0)[input2_features].diff()
# Each drive's first row has no predecessor and is dropped here.
input2 = per_drive_diff.dropna(how='any')
# The difference columns share names with the originals, so they receive the
# '_increase' suffix; the inner join also removes rows without a difference.
training_set = training_set.join(input2, how='inner', rsuffix='_increase')
training_set.to_csv('./preprocess/' + model_name + '_training_set.csv')
training_set.shape

(17637, 21)

In [150]:
# down sampling
# The healthy class (y == 0) is reduced with k-means before training; all
# failing samples (y == 1) are kept.
N_CLUSTERS = 150          # centroids fitted on the healthy samples
N_CLUSTERS_SAMPLED = 50   # only the first 50 centroids contribute samples
SAMPLES_PER_CLUSTER = 10  # healthy rows taken per sampled centroid

X_health_raw = training_set.loc[training_set['y'] == 0].drop(['y'], axis='columns').values
X_fail = training_set.loc[training_set['y'] == 1].drop(['y'], axis='columns').values
print(X_fail.shape)

# n_jobs is no longer passed: the parameter was removed from KMeans in
# scikit-learn 1.0 and raises TypeError there.
kmeans = KMeans(n_clusters=N_CLUSTERS, random_state=0).fit(X_health_raw)

# Distance of every healthy sample to every centroid, computed once instead of
# once per sampled centroid.
centroid_distances = kmeans.transform(X_health_raw)

# NOTE(review): the descending sort picks, for each centroid, the samples that
# are FARTHEST from it across the whole healthy set, and only 50 of the 150
# centroids are used. The same rows can therefore be selected for several
# centroids. Kept as-is to preserve the experiment; confirm whether nearest
# samples per cluster were intended.
X_health = np.concatenate(
    [
        X_health_raw[np.argsort(centroid_distances[:, j])[::-1][:SAMPLES_PER_CLUSTER]]
        for j in range(N_CLUSTERS_SAMPLED)
    ],
    axis=0,
)
X_train_raw = np.concatenate((X_health, X_fail), axis=0)

# Scale every feature to [0, 1] using the down-sampled training matrix.
scaler = MinMaxScaler()
X_train = scaler.fit_transform(X_train_raw)
# Targets follow the row order of X_train_raw: healthy rows (0) first, then failures (1).
y_train = np.concatenate((np.zeros(X_health.shape[0]), np.ones(X_fail.shape[0])), axis=0)

print(X_train.shape)
print(y_train.shape)

(135, 11)
(635, 11)
(635,)


In [151]:
# Models compared below: CART, random forest (20-100 trees), SVM,
# neural network (3 layers of 100 nodes), logistic regression.
# A float n_components asks PCA to keep enough components to explain that
# fraction (here 0.95) of the variance.
pca = PCA(n_components=0.95)
X_transformed = pca.fit_transform(X_train)
X_transformed.shape

(635, 5)

In [152]:
# Empty results table: one row per model, one column per cross-validated metric.
model_labels = ['CART', 'SVM', 'NN', 'LR', 'RF']
metric_names = ['f1_score', 'recall', 'precision']
statics = pd.DataFrame(index=model_labels, columns=metric_names)
statics

Unnamed: 0,f1_score,recall,precision
CART,,,
SVM,,,
NN,,,
LR,,,
RF,,,


In [153]:
# Logistic regression on the min-max scaled features (X_train).
# LogisticRegressionCV selects C itself (Cs=100, 5 folds, scored by F1).
lr_params = dict(
    Cs=100, fit_intercept=True, cv=5,
    dual=False, penalty='l2', scoring='f1',
    solver='newton-cg', max_iter=1000, class_weight='balanced',
    n_jobs=-1, refit=True, multi_class='ovr', random_state=0, verbose=1,
)
lr_model = LogisticRegressionCV(**lr_params)
lr_model.fit(X_train, y_train)
# Cross-validated scores go into the 'LR' row, replacing whatever it held before.
lr_scores = model_stat(lr_model, X_train, y_train)
statics.loc['LR'] = lr_scores
statics.loc['LR']

[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:    1.7s remaining:    2.6s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    2.6s finished


f1_score     0.8978890876565295
recall       0.9851851851851852
precision    0.8551617873651771
Name: LR, dtype: object

In [154]:
# Logistic regression on the PCA-reduced features (X_transformed).
# LogisticRegressionCV selects C itself (Cs=100, 5 folds, scored by F1).
lr_params = dict(
    Cs=100, fit_intercept=True, cv=5,
    dual=False, penalty='l2', scoring='f1',
    solver='newton-cg', max_iter=1000, class_weight='balanced',
    n_jobs=-1, refit=True, multi_class='ovr', random_state=0, verbose=1,
)
lr_model = LogisticRegressionCV(**lr_params)
lr_model.fit(X_transformed, y_train)
# Cross-validated scores go into the 'LR' row, replacing whatever it held before.
lr_scores = model_stat(lr_model, X_transformed, y_train)
statics.loc['LR'] = lr_scores
statics.loc['LR']

[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:    1.5s remaining:    2.3s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    2.3s finished


f1_score      0.885173232083531
recall        0.962962962962963
precision    0.8551617873651771
Name: LR, dtype: object

In [155]:
# RBF-kernel SVM with fixed hyper-parameters, evaluated on the PCA features.
svm_params = {
    'C': 1.05,
    'kernel': 'rbf',
    'gamma': 0.05,
    'class_weight': 'balanced',
    'max_iter': -1,
    'random_state': 0,
}
svm_model = SVC(**svm_params)
svm_scores = model_stat(svm_model, X_transformed, y_train)
statics.loc['SVM'] = svm_scores
statics.loc['SVM']

f1_score     0.8643282687083159
recall       0.9259259259259259
precision    0.8551617873651771
Name: SVM, dtype: object

In [156]:
# Random forest with library-default hyper-parameters on the PCA features.
# random_state is fixed (0, as for the other seeded estimators in this notebook)
# so the cross-validated scores are reproducible across runs.
rf_model = RandomForestClassifier(random_state=0)
statics.loc['RF'] = model_stat(rf_model, X_transformed, y_train)
statics.loc['RF']

f1_score     0.8489834139745188
recall       0.8296296296296296
precision    0.8970315398886829
Name: RF, dtype: object

In [157]:
# Decision tree (CART) with library-default hyper-parameters on the PCA features.
# random_state is fixed (0, as for the other seeded estimators in this notebook)
# so the cross-validated scores are reproducible across runs.
dt_model = DecisionTreeClassifier(random_state=0)
statics.loc['CART'] = model_stat(dt_model, X_transformed, y_train)
statics.loc['CART']

f1_score     0.8607239195419769
recall        0.837037037037037
precision     0.921951219512195
Name: CART, dtype: object

In [158]:
# Multi-layer perceptron with three hidden layers of 100 units on the PCA features.
# random_state is fixed (0, as for the other seeded estimators in this notebook)
# so weight initialisation, and hence the scores, are reproducible across runs.
nn_model = MLPClassifier(hidden_layer_sizes=(100, 100, 100), random_state=0)
statics.loc['NN'] = model_stat(nn_model, X_transformed, y_train)
statics.loc['NN']

f1_score     0.7610262806215362
recall       0.7777777777777778
precision                 0.892
Name: NN, dtype: object

In [159]:
# Show the comparison with models as columns and metrics as rows.
statics.transpose()

Unnamed: 0,CART,SVM,NN,LR,RF
f1_score,0.8607239195419769,0.8643282687083159,0.7610262806215362,0.885173232083531,0.8489834139745188
recall,0.837037037037037,0.925925925925926,0.7777777777777778,0.962962962962963,0.8296296296296296
precision,0.921951219512195,0.8551617873651771,0.892,0.8551617873651771,0.8970315398886829
