In [2]:
import numpy as np
import pandas as pd
import os, datetime
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.svm import SVC
from sklearn.model_selection import KFold, cross_val_score, RandomizedSearchCV, GridSearchCV
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.preprocessing import MinMaxScaler
from sklearn.cluster import KMeans, MiniBatchKMeans
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier

In [None]:
# --- Notebook configuration -------------------------------------------------
# Drive model analysed by the single-model cells further down.
model_name = 'ST3000DM001'
# Directories scanned for the raw daily CSV files.
data_dirs = ['../data-2015/']

# SMART columns used as model inputs.
attributes = [
    'smart_1_raw',
    'smart_4_raw',
    'smart_5_raw',
    'smart_7_raw',
    'smart_9_raw',
    'smart_12_raw',
    'smart_187_raw',
    'smart_193_raw',
    'smart_194_raw',
    'smart_197_raw',
    'smart_199_raw',
]

# Drive models extracted into ./preprocess/<model>.csv.
drive_models = [
    'ST4000DM000',
    'ST3000DM001',
    'Hitachi HDS5C3030ALA630',
    'Hitachi HDS722020ALA330',
    'Hitachi HDS5C4040ALE630',
    'HGST HMS5C4040ALE640',
    'HGST HMS5C4040BLE640',
]

# Columns kept from the raw files: three identifying columns first, then the
# SMART readings.
features = [
    'serial_number',
    'date',
    'capacity_bytes',
    'smart_5_raw',
    'smart_1_raw',
    'smart_4_raw',
    'smart_7_raw',
    'smart_9_raw',
    'smart_12_raw',
    'smart_187_raw',
    'smart_193_raw',
    'smart_194_raw',
    'smart_197_raw',
    'smart_199_raw',
    'smart_196_raw',
]

# Every column after the first three is parsed as float32.
# NOTE(review): float32 cannot represent large integer counters exactly -
# confirm that precision is sufficient for the raw SMART values.
type_dict = dict.fromkeys(features[3:], np.float32)

def model_stat(model, X, y):
    """Cross-validate ``model`` and summarise precision, recall and F1.

    A single 5-fold cross-validation scores all three metrics on the same
    fitted estimators. Each fold is therefore fitted once instead of three
    times, and for estimators with random initialisation P, R and F describe
    the same fits rather than three independent ones.

    Parameters
    ----------
    model : estimator
        Unfitted scikit-learn classifier; it is cloned and fitted per fold.
    X : array-like
        Feature matrix.
    y : array-like
        Binary labels.

    Returns
    -------
    pandas.Series
        ``'P'``, ``'R'`` and ``'F'`` hold the mean precision, recall and F1
        across folds; ``'Sd'`` holds the standard deviation of the F1 scores.
    """
    # Local import: the notebook's import cell does not bring in cross_validate.
    from sklearn.model_selection import cross_validate

    scores = cross_validate(model, X, y, cv=5, scoring=('precision', 'recall', 'f1'), n_jobs=-1)
    f_score = scores['test_f1']
    return pd.Series({
        'P': np.mean(scores['test_precision']),
        'R': np.mean(scores['test_recall']),
        'F': np.mean(f_score),
        'Sd': np.std(f_score),
    })

# Result table: one row per classifier, one column per summary metric
# (filled in by the tuning cells below).
classifier_rows = ['CART', 'SVM', 'NN', 'LR', 'RF']
metric_columns = ['P', 'R', 'F', 'Sd']
statics = pd.DataFrame(columns=metric_columns, index=classifier_rows)
statics

Unnamed: 0,P,R,F,Sd
CART,,,,
SVM,,,,
NN,,,,
LR,,,,
RF,,,,


In [None]:
# Extract the daily records of every drive model of interest from the raw
# CSV files and cache them as one sorted CSV per model.
os.makedirs('./preprocess', exist_ok=True)

# Only *.csv entries are read, so stray files in a data directory
# (README, .DS_Store, ...) are not passed to the CSV parser.
raw_csv_paths = [
    os.path.join(data_dir, filename)
    for data_dir in data_dirs
    for filename in sorted(os.listdir(data_dir))
    if filename.endswith('.csv')
]

# A dedicated loop variable is used on purpose: looping over `model_name`
# would overwrite the configured model and make every later cell silently
# work on the last entry of `drive_models`.
for drive_model_name in drive_models:
    daily_frames = []
    for csv_path in raw_csv_paths:
        daily = pd.read_csv(csv_path, dtype=type_dict)
        daily_frames.append(daily.loc[daily['model'] == drive_model_name, features])
    drive_model = pd.concat(daily_frames).sort_values(by=['serial_number', 'date'])
    # The default index column is written as well; the cell that reloads a
    # single model drops that first column again with `.iloc[:, 1:]`.
    drive_model.to_csv('./preprocess/' + drive_model_name + '.csv')

In [None]:
# Per-model overview: for each SMART column in `res_vars`, the fraction of
# drives whose summed readings are positive, plus capacity and drive count.
res_vars = ['smart_5_raw', 'smart_187_raw', 'smart_196_raw', 'smart_197_raw']
overview = pd.DataFrame(index=drive_models, columns=res_vars+['Capacity (TB)', '# Drives'])
# NOTE: this rebinds the notebook-level `type_dict`; cells executed after this
# one read the float64 mapping for `res_vars`.
type_dict = {key: np.float64 for key in res_vars}

# A dedicated loop variable keeps the configured `model_name` intact for the
# single-model cells that follow.
for drive_model_name in drive_models:
    model_records = pd.read_csv('./preprocess/' + drive_model_name + '.csv', dtype=type_dict)
    per_drive_totals = model_records.groupby('serial_number')[res_vars].sum()
    # Fraction of drives with a positive total, one value per column.
    percentage = (per_drive_totals > 0).mean()
    percentage['Capacity (TB)'] = model_records.iloc[0]['capacity_bytes'] // 1000**4
    percentage['# Drives'] = model_records['serial_number'].unique().size
    overview.loc[drive_model_name] = percentage

overview.to_csv('./preprocess/overview.csv')
overview

In [None]:
# Reload the cached CSV of the model named by `model_name`; `.iloc[:, 1:]`
# drops the first column, i.e. the index column that `to_csv` wrote.
# NOTE(review): `model_name` and `type_dict` are whatever the previously
# executed cells last assigned - confirm they hold the intended values.
drive_model = pd.read_csv('./preprocess/' + model_name + '.csv', dtype=type_dict).iloc[:, 1:]

In [None]:
# Order the records per drive and by day, then turn the date strings into
# datetimes so they can be grouped by calendar frequency later on.
df = (
    drive_model
    .sort_values(by=['serial_number', 'date'])
    .assign(date=lambda frame: pd.to_datetime(frame['date']))
)
df.head()

In [None]:
# Build the weekly training set for the current model and cache it as CSV.
# Candidate weekly anchors for `day` below:
# week = ['W-SUN', 'W-MON', 'W-TUE', 'W-WED', 'W-THU', 'W-FRI', 'W-SAT']
# df2 is a copy of df in which every record's date is moved one day later.
df2 = df.copy()
df2['date'] += datetime.timedelta(days=1)

# Weekly bins anchored on Monday; records are grouped per drive and per bin.
day = 'W-MON'
model_group = df.groupby(['serial_number', pd.Grouper(key='date', freq=day)])
# Last smart_5_raw reading of each (drive, week) group, renamed to 'y'.
y = model_group['smart_5_raw'].last().to_frame().rename({'smart_5_raw': 'y'}, axis='columns')

# Same grouping on the shifted copy; the last reading of every input attribute
# per group becomes the feature row.
# NOTE(review): presumably the one-day shift pairs each feature row with the
# smart_5_raw value observed one day later - confirm this alignment is intended.
model_group2 = df2.groupby(['serial_number', pd.Grouper(key='date', freq=day)])
input1 = model_group2[attributes].last()

# Keep only (drive, week) pairs present in both frames.
training_set = y.join(other=input1, how='inner')
# Label is 1 when the unshifted smart_5_raw exceeds the shifted frame's value, else 0.
training_set['y'] = np.where(training_set['y'] > training_set['smart_5_raw'], 1, 0)
input2_features = ['smart_4_raw', 'smart_5_raw', 'smart_7_raw', 'smart_9_raw', 'smart_12_raw', 'smart_187_raw', 'smart_193_raw', 'smart_197_raw', 'smart_199_raw']
# Row-to-row differences within each drive (index level 0); rows containing any
# NaN difference are dropped.
input2 = training_set.groupby(level=0)[input2_features].transform(pd.DataFrame.diff).dropna(how='any')
# The difference columns get the '_increase' suffix; the inner join keeps only
# rows that still have differences.
training_set = training_set.join(other=input2, how='inner', rsuffix='_increase')
    
# The file name depends on the current value of `model_name`.
training_set.to_csv('./preprocess/' + model_name + '_training_set.csv')
training_set.shape

In [None]:
# Reload the cached training set without its two identifying columns, leaving
# only the label and the numeric inputs.
training_set_path = './preprocess/' + model_name + '_training_set.csv'
training_set = pd.read_csv(training_set_path).drop(columns=['serial_number', 'date'])
training_set.head()

In [None]:
# Display the column labels of the reloaded training set (label plus inputs).
training_set.columns

In [None]:
from sklearn.tree import _tree

def tree_to_code(tree, feature_names):
    """Print a fitted decision tree as nested Python-style if/else pseudo-code.

    ``tree`` must expose a ``tree_`` attribute with ``feature``, ``threshold``,
    ``children_left``, ``children_right`` and ``value`` arrays.
    ``feature_names`` is indexed by feature id and joined into the printed
    ``def tree(...)`` header. Nothing is returned; all output goes to stdout.
    """
    fitted = tree.tree_

    # Name of the split feature for every node; leaves carry the undefined marker.
    split_names = []
    for feature_idx in fitted.feature:
        if feature_idx == _tree.TREE_UNDEFINED:
            split_names.append("undefined!")
        else:
            split_names.append(feature_names[feature_idx])

    print("def tree({}):".format(", ".join(feature_names)))

    def emit(node_id, level):
        pad = "  " * level
        # Leaf node: print the stored value array and stop descending.
        if fitted.feature[node_id] == _tree.TREE_UNDEFINED:
            print("{}return {}".format(pad, fitted.value[node_id]))
            return

        split_name = split_names[node_id]
        cutoff = fitted.threshold[node_id]
        print("{}if {} <= {}:".format(pad, split_name, cutoff))
        emit(fitted.children_left[node_id], level + 1)
        print("{}else:  # if {} > {}".format(pad, split_name, cutoff))
        emit(fitted.children_right[node_id], level + 1)

    # The root is node 0; its body is indented one level below the header.
    emit(0, 1)


In [None]:
# tune DT
# Fit a single decision tree and print its rules as pseudo-code.
# NOTE(review): X_train / y_train are assigned by the "down sampling" cell
# further down; on a fresh kernel that cell has to run before this one.
# A fixed seed makes the printed tree reproducible: scikit-learn permutes the
# candidate features at every split, so ties can otherwise resolve differently
# between runs.
dt_model = DecisionTreeClassifier(random_state=0)
dt_model.fit(X_train, y_train)
tree_to_code(dt_model, training_set.drop(['y'], axis='columns').columns)

In [None]:
# Display the input column labels, i.e. every column except the label 'y'.
training_set.drop(['y'], axis='columns').columns

In [None]:
# down sampling
# Reduce the healthy (y == 0) rows to a few samples per k-means cluster and
# combine them with all failure (y == 1) rows into the training matrix.
X_health_raw = training_set.loc[training_set['y'] == 0].drop(['y'], axis='columns').values
X_fail = training_set.loc[training_set['y'] == 1].drop(['y'], axis='columns').values

# One cluster per five failure rows.
n_clusters = X_fail.shape[0] // 5
if n_clusters < 1:
    raise ValueError(
        'down sampling needs at least 5 failure rows to form a cluster, got {}'.format(X_fail.shape[0])
    )

# kmeans = KMeans(n_clusters=n_clusters, random_state=0, n_jobs=-1).fit(X_health_raw)
kmeans = MiniBatchKMeans(n_clusters=n_clusters, random_state=0, batch_size=50000).fit(X_health_raw)

# Distance of every healthy row to every cluster centre, computed once instead
# of once per cluster.
center_distances = kmeans.transform(X_health_raw)
# For each centre j, take the 10 rows with the LARGEST distance to it
# (argsort ascending, reversed).
# NOTE(review): this selects the farthest rows rather than the nearest ones and
# can pick the same row for several centres - confirm this is intended.
X_health = np.concatenate(
    [X_health_raw[np.argsort(center_distances[:, j])[::-1][:10]] for j in range(n_clusters)],
    axis=0,
)
X_train_raw = np.concatenate((X_health, X_fail), axis=0)

# Scale every feature to [0, 1]; labels follow the concatenation order
# (healthy rows first, then failures).
scaler = MinMaxScaler()
X_train = scaler.fit_transform(X_train_raw)
y_train = np.concatenate((np.zeros(X_health.shape[0]), np.ones(X_fail.shape[0])), axis=0)

print(X_fail.shape)
print(X_train.shape)
print(y_train.shape)

In [None]:
# Models compared below: cart, rf (20-100 trees), svm, neural networks
# (3 layers 100 nodes), logistic regression.
# Project the scaled training matrix onto the principal components that
# together explain 95% of the variance.
pca_95 = PCA(n_components=0.95)
X_transformed = pca_95.fit_transform(X_train)
X_transformed.shape

In [None]:
# tune LR
# Logistic regression with built-in search over 100 values of C, fitted and
# cross-validated on the scaled (non-PCA) features.
lr_settings = dict(
    Cs=100,
    fit_intercept=True,
    cv=5,
    dual=False,
    penalty='l2',
    scoring='f1',
    solver='newton-cg',
    max_iter=1000,
    class_weight='balanced',
    n_jobs=-1,
    refit=True,
    multi_class='ovr',
    random_state=0,
    verbose=1,
)
lr_model = LogisticRegressionCV(**lr_settings)
lr_model.fit(X_train, y_train)
statics.loc['LR'] = model_stat(lr_model, X_train, y_train)
statics.loc['LR']

In [None]:
# tune LR
# Same logistic-regression setup, this time on the PCA-transformed features.
# The 'LR' row written here replaces any value stored in it before.
lr_model = LogisticRegressionCV(
    Cs=100,
    fit_intercept=True,
    cv=5,
    dual=False,
    penalty='l2',
    scoring='f1',
    solver='newton-cg',
    max_iter=1000,
    class_weight='balanced',
    n_jobs=-1,
    refit=True,
    multi_class='ovr',
    random_state=0,
    verbose=1,
)
lr_model.fit(X_transformed, y_train)
lr_summary = model_stat(lr_model, X_transformed, y_train)
statics.loc['LR'] = lr_summary
statics.loc['LR']

In [None]:
# print(np.sum(y))
# print(np.sum(y_train))
# training_set.loc[training_set['y'] == 1].drop(['y'], axis='columns').head()
# Fit the random forest on the scaled (non-PCA) features and show the
# per-feature importances.
# NOTE(review): `rf_model` is only created in the "tune RF" cell further down;
# on a fresh kernel this cell raises NameError unless that cell ran first.
rf_model.fit(X_train, y_train)
rf_model.feature_importances_

In [None]:
# tune SVM
# RBF-kernel SVM with balanced class weights, cross-validated on the
# PCA-transformed features; the summary is stored in the 'SVM' row.
svm_model = SVC(
    C=1.05,
    kernel='rbf',
    gamma=0.05,
    class_weight='balanced',
    max_iter=-1,
    random_state=0,
)
svm_summary = model_stat(svm_model, X_transformed, y_train)
statics.loc['SVM'] = svm_summary
statics.loc['SVM']

In [None]:
# tune RF
# Random forest cross-validated on the PCA-transformed features.
# The seed is fixed (as for the LR, SVM and k-means steps in this notebook) so
# that the reported scores are reproducible between runs.
rf_model = RandomForestClassifier(random_state=0)
statics.loc['RF'] = model_stat(rf_model, X_transformed, y_train)
statics.loc['RF']

In [None]:
# # rf_model.fit(X_, y_train)
# print(rf_model.feature_importances_)
# training_set.loc[training_set['y'] == 0].drop(['y'], axis='columns').columns

In [None]:
# tune DT
# Single decision tree (CART) cross-validated on the PCA-transformed features.
# The seed is fixed so that ties between equally good splits resolve the same
# way on every run and the reported scores are reproducible.
dt_model = DecisionTreeClassifier(random_state=0)
statics.loc['CART'] = model_stat(dt_model, X_transformed, y_train)
statics.loc['CART']

In [None]:
# tune NN
# Multi-layer perceptron with three hidden layers of 100 units, cross-validated
# on the PCA-transformed features.
# The seed fixes weight initialisation and batch sampling so that the reported
# scores are reproducible between runs.
nn_model = MLPClassifier(hidden_layer_sizes=(100, 100, 100), random_state=0)
statics.loc['NN'] = model_stat(nn_model, X_transformed, y_train)
statics.loc['NN']

In [None]:
# Persist the result table with metrics as rows and classifiers as columns,
# then display the same transposed view.
final_statics_path = './preprocess/' + model_name + '_finalstatics.csv'
statics.transpose().to_csv(final_statics_path)
statics.transpose()