In [1]:
import numpy as np
import pandas as pd
import os, datetime
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.svm import SVC
from sklearn.model_selection import KFold, cross_val_score, RandomizedSearchCV, GridSearchCV, cross_validate
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.preprocessing import MinMaxScaler
from sklearn.cluster import KMeans, MiniBatchKMeans
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import make_scorer
import matplotlib.pyplot as plt

In [2]:
model_name = 'ST3000DM001'
data_dirs = ['../data-2015/']
attributes = ['smart_1_raw', 'smart_4_raw', 'smart_5_raw', 'smart_7_raw', 'smart_9_raw', 'smart_12_raw', 'smart_187_raw', 'smart_193_raw', 'smart_194_raw', 'smart_197_raw', 'smart_199_raw']
drive_models = ['ST4000DM000', 'ST3000DM001', 'Hitachi HDS5C3030ALA630', 'Hitachi HDS722020ALA330', 'Hitachi HDS5C4040ALE630', 'HGST HMS5C4040ALE640', 'HGST HMS5C4040BLE640']
features = ['serial_number', 'date', 'capacity_bytes', 'smart_5_raw', 'smart_1_raw', 'smart_4_raw', 'smart_7_raw', 'smart_9_raw', 'smart_12_raw', 'smart_187_raw', 'smart_193_raw', 'smart_194_raw', 'smart_197_raw', 'smart_199_raw', 'smart_196_raw']
type_dict = {feature: np.float32 for feature in features[3:]}

def get_fpr(y_actual, y_hat):
    FP = 0
    tot = len(y_hat)
    for i in range(tot): 
        if y_hat[i]==1 and y_actual[i]!=y_hat[i]:
           FP += 1
    return FP / tot

def get_fnr(y_actual, y_hat):
    FN = 0
    tot = len(y_hat)
    for i in range(tot): 
        if y_hat[i]==0 and y_actual[i]!=y_hat[i]:
           FN += 1
    return FN / tot

FPR = make_scorer(get_fpr)
FNR = make_scorer(get_fnr)
scorer = {'FPR': FPR, 'FNR': FNR}
def model_stat(model, X, y):
    f_score = cross_val_score(model, X, y, cv=5, scoring='f1', n_jobs=-1)
    r_score = cross_val_score(model, X, y, cv=5, scoring='recall', n_jobs=-1)
    p_score = cross_val_score(model, X, y, cv=5, scoring='precision', n_jobs=-1)
    return pd.Series({'P': np.mean(p_score), 'R': np.mean(r_score), 'F': np.mean(f_score), 'Sd': np.std(f_score)})

statics = pd.DataFrame(index=['CART', 'SVM', 'NN', 'LR', 'RF'], columns=['P', 'R', 'F', 'Sd'])
statics

Unnamed: 0,P,R,F,Sd
CART,,,,
SVM,,,,
NN,,,,
LR,,,,
RF,,,,


In [4]:
for model_name in drive_models:
    drive_model = pd.concat([(lambda pd: pd.loc[pd['model'] == model_name][features])(pd.read_csv(os.path.join(data_dir, filename), dtype=type_dict)) for data_dir in data_dirs for filename in os.listdir(data_dir)]).sort_values(by=['serial_number', 'date'])
    drive_model.to_csv('./preprocess/' + model_name + '.csv')

KeyboardInterrupt: 

In [4]:
res_vars = ['smart_5_raw', 'smart_187_raw', 'smart_196_raw', 'smart_197_raw']
overview = pd.DataFrame(index=drive_models, columns=res_vars+['Capacity (TB)', '# Drives'])
type_dict = {key: np.float64 for key in res_vars}

for model_name in drive_models:
    drive_model = pd.read_csv('./preprocess/' + model_name + '.csv', dtype=type_dict)
    percentage = drive_model.groupby('serial_number')[res_vars].sum().agg(lambda data: data[data>0].size/data.size)
    percentage['Capacity (TB)'] = drive_model.iloc[0]['capacity_bytes'] // 1000**4
    percentage['# Drives'] = drive_model['serial_number'].unique().size
    overview.loc[model_name] = percentage
    
overview.to_csv('./preprocess/overview.csv')
overview

Unnamed: 0,smart_5_raw,smart_187_raw,smart_196_raw,smart_197_raw,Capacity (TB),# Drives
ST4000DM000,0.00738119,0.0154702,0.0,0.0150657,4,29670
ST3000DM001,0.120719,0.177226,0.0,0.0702055,3,1168
Hitachi HDS5C3030ALA630,0.0323491,0.0,0.032132,0.0123752,3,4606
Hitachi HDS722020ALA330,0.133675,0.0,0.133675,0.0365151,2,4683
Hitachi HDS5C4040ALE630,0.0150376,0.0,0.0154135,0.0075188,4,2660
HGST HMS5C4040ALE640,0.0060317,0.0,0.0060317,0.00266517,4,7129
HGST HMS5C4040BLE640,0.000966806,0.0,0.000966806,0.00257815,4,3103


In [64]:
model_name = 'HGST HMS5C4040BLE640'
drive_model = pd.read_csv('./preprocess/' + model_name + '.csv', dtype=type_dict).iloc[:, 1:]

In [65]:
df = drive_model.sort_values(by=['serial_number', 'date']).dropna(how='all', axis='columns')
df['date'] = pd.to_datetime(df['date'])
df.head()

Unnamed: 0,serial_number,date,capacity_bytes,smart_5_raw,smart_1_raw,smart_4_raw,smart_7_raw,smart_9_raw,smart_12_raw,smart_193_raw,smart_194_raw,smart_197_raw,smart_199_raw,smart_196_raw
0,PL1331LAGGBEBH,2015-01-01,4000787030016,0.0,0.0,7.0,0.0,5677.0,7.0,164.0,30.0,0.0,0.0,0.0
1,PL1331LAGGBEBH,2015-01-02,4000787030016,0.0,0.0,7.0,0.0,5700.0,7.0,164.0,30.0,0.0,0.0,0.0
2,PL1331LAGGBEBH,2015-01-03,4000787030016,0.0,0.0,7.0,0.0,5724.0,7.0,164.0,30.0,0.0,0.0,0.0
3,PL1331LAGGBEBH,2015-01-04,4000787030016,0.0,0.0,7.0,0.0,5749.0,7.0,164.0,30.0,0.0,0.0,0.0
4,PL1331LAGGBEBH,2015-01-05,4000787030016,0.0,0.0,7.0,0.0,5772.0,7.0,164.0,30.0,0.0,0.0,0.0


In [66]:
# week = ['W-SUN', 'W-MON', 'W-TUE', 'W-WED', 'W-THU', 'W-FRI', 'W-SAT']
df2 = df.copy()
df2['date'] += datetime.timedelta(days=1)

day = 'W-MON'
model_group = df.groupby(['serial_number', pd.Grouper(key='date', freq=day)])
y = model_group['smart_5_raw'].last().to_frame().rename({'smart_5_raw': 'y'}, axis='columns')

model_group2 = df2.groupby(['serial_number', pd.Grouper(key='date', freq=day)])
input1 = model_group2[[i for i in attributes if i in df2.columns]].last()

training_set = y.join(other=input1, how='inner')
training_set['y'] = np.where(training_set['y'] > training_set['smart_5_raw'], 1, 0)
input2_features = ['smart_4_raw', 'smart_5_raw', 'smart_7_raw', 'smart_9_raw', 'smart_12_raw', 'smart_187_raw', 'smart_193_raw', 'smart_197_raw', 'smart_199_raw']
input2 = training_set.groupby(level=0)[[i for i in input2_features if i in training_set.columns]].transform(pd.DataFrame.diff).dropna(how='any')
training_set = training_set.join(other=input2, how='inner', rsuffix='_increase')
    
training_set.to_csv('./preprocess/' + model_name + '_training_set.csv')
training_set.shape

(160979, 19)

In [67]:
training_set = pd.read_csv('./preprocess/' + model_name + '_training_set.csv').drop(['serial_number', 'date'], axis='columns')
training_set.shape

(160979, 19)

In [68]:
training_set.columns

Index(['y', 'smart_1_raw', 'smart_4_raw', 'smart_5_raw', 'smart_7_raw',
       'smart_9_raw', 'smart_12_raw', 'smart_193_raw', 'smart_194_raw',
       'smart_197_raw', 'smart_199_raw', 'smart_4_raw_increase',
       'smart_5_raw_increase', 'smart_7_raw_increase', 'smart_9_raw_increase',
       'smart_12_raw_increase', 'smart_193_raw_increase',
       'smart_197_raw_increase', 'smart_199_raw_increase'],
      dtype='object')

In [70]:
# down sampling
X_health_raw = training_set.loc[training_set['y'] == 0].drop(['y'], axis='columns').values
X_fail = training_set.loc[training_set['y'] == 1].drop(['y'], axis='columns').values
print(X_fail.shape)
n_clusters = X_fail.shape[0] // 5
# kmeans = KMeans(n_clusters=n_clusters, random_state=0, n_jobs=-1).fit(X_health_raw)
kmeans = MiniBatchKMeans(n_clusters=n_clusters, random_state=0, batch_size=50000).fit(X_health_raw)
X_health = np.concatenate([X_health_raw[np.argsort(kmeans.transform(X_health_raw)[:, j])[::-1][:15]] for j in range(0, n_clusters)], axis=0)
X_train_raw = np.concatenate((X_health, X_fail), axis=0)

scaler = MinMaxScaler()
X_train = scaler.fit_transform(X_train_raw)
y_train = np.concatenate((np.zeros(X_health.shape[0]), np.ones(X_fail.shape[0])), axis=0) 
print(X_train.shape)
print(y_train.shape)

(1, 18)


  n_local_trials = 2 + int(np.log(n_clusters))


OverflowError: cannot convert float infinity to integer

In [None]:
# cart, rf (20-100 trees), svm, neural networks (3 layers 100 nodes), logistic regression
X_transformed = PCA(n_components=0.95).fit_transform(X_train)
X_transformed.shape

In [None]:
# tune LR
lr_model = LogisticRegressionCV(Cs=100, fit_intercept=True, cv=5, 
                                        dual=False, penalty='l2', scoring='f1', 
                                        solver='newton-cg',  max_iter=1000, class_weight='balanced',
                                        n_jobs=-1, refit=True, multi_class='ovr', random_state=0, verbose=1)
lr_model.fit(X_transformed, y_train)

statics.loc['LR'] = model_stat(lr_model, X_transformed, y_train)
statics.loc['LR']

In [None]:
# tune SVM
search_grid = {
    'C': [15], 
    'kernel': ['rbf'],
    'gamma': [0.38],
    'class_weight': ['balanced'],
    'max_iter': [-1],
    'random_state': [0]
}
svm_model = SVC()
svm_grid = GridSearchCV(estimator=svm_model, param_grid=search_grid, 
    cv=5, scoring='f1', n_jobs=-1, verbose=2)
svm_grid.fit(X_transformed, y_train)

print(svm_grid.best_score_)
print(svm_grid.best_params_)
statics.loc['SVM'] = model_stat(svm_grid.best_estimator_, X_transformed, y_train)
statics.loc['SVM']

In [None]:
# tune RF
# tune RF
n_estimators = range(1, 20) # tuned
max_features = ['auto', 'log2', 'sqrt', None] # tuned
criterion = ['entropy']  # tuned
max_depth = [5] # tuned
min_samples_split = [2]
min_samples_leaf = [1]
min_weight_fraction_leaf = [0]
max_leaf_nodes = [None]

bootstrap = [False]
random_state = [0]
class_weight = ['balanced']
search_grid = {
    'n_estimators': n_estimators,
    'criterion': criterion,
    'max_features': max_features,
    'max_depth': max_depth,
    'min_samples_split': min_samples_split,
    'min_samples_leaf': min_samples_leaf,
    'min_weight_fraction_leaf': min_weight_fraction_leaf,
    'max_leaf_nodes': max_leaf_nodes,
    'bootstrap': bootstrap,
    'random_state': random_state,
    'class_weight': class_weight
    }

rf_model = RandomForestClassifier()
rf_grid = GridSearchCV(estimator=rf_model, param_grid=search_grid, 
    cv=5, scoring='f1', n_jobs=-1, verbose=2)

rf_grid.fit(X_transformed, y_train)
print(rf_grid.best_score_)
print(rf_grid.best_params_)

statics.loc['RF'] = model_stat(rf_grid.best_estimator_, X_transformed, y_train)
print(statics.loc['RF'])

# Start
forest = rf_grid.best_estimator_
forest.fit(X_train, y_train)
importances = forest.feature_importances_
std = np.std([tree.feature_importances_ for tree in forest.estimators_],
             axis=0)
indices = np.argsort(importances)[::-1]

# Print the feature ranking
print("Feature ranking:")

for f in range(X_train.shape[1]):
    print("%d. feature %d (%f)" % (f + 1, indices[f], importances[indices[f]]))

# Plot the feature importances of the forest
plt.figure()
plt.title("Feature importances")
plt.bar(range(X_train.shape[1]), importances[indices],
       color="r", yerr=std[indices], align="center")
# plt.xticks(df.columns.values, indices)
plt.xticks(range(X_train.shape[1]), indices)
plt.xlim([-1, X_train.shape[1]])
plt.show()

# End

In [None]:
# tune DT (TBI)
search_grid = {
    'criterion': ['entropy', 'gini'],
    'min_samples_split': [2],
    'min_samples_leaf': [2],
    'max_depth': [2,3,4,8,11],
    'max_features':[None],
    'class_weight': ['balanced'],
    'random_state': [0]
}
dt_model = DecisionTreeClassifier()
dt_grid = GridSearchCV(estimator= dt_model, param_grid=search_grid, 
    cv=5, scoring='f1', n_jobs=3, verbose=2)
dt_grid.fit(X_transformed, y_train)
print(dt_grid.best_score_)
print(dt_grid.best_params_)

statics.loc['CART'] = model_stat(dt_grid.best_estimator_, X_transformed, y_train)
statics.loc['CART']

In [None]:
# tune NN
nn_model = MLPClassifier(hidden_layer_sizes=(100, 100, 100))

statics.loc['NN'] = model_stat(nn_model, X_transformed, y_train)
statics.loc['NN']

In [None]:
statics.T.to_csv('./preprocess/' + model_name + '_finalstatics.csv')
statics.T 