In [21]:
import numpy as np
import pandas as pd
import os, datetime
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.svm import SVC
from sklearn.model_selection import KFold, cross_val_score, RandomizedSearchCV, GridSearchCV, cross_validate, cross_val_predict
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.preprocessing import MinMaxScaler
from sklearn.cluster import KMeans, MiniBatchKMeans
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import make_scorer
import pandas as pd
import six
# Drive model used to build './preprocess/<model>_finalstatics.csv' paths in later cells.
# NOTE: the `for model_name in models` loops further down rebind this name, so its
# value depends on which cells have been executed.
model_name = 'ST3000DM001'

In [7]:
# Display the overview CSV with its 'Unnamed: 0' column relabelled as 'Models'.
pd.read_csv('./preprocess/overview.csv').rename(columns={'Unnamed: 0': 'Models'})

Unnamed: 0,Models,smart_5_raw,smart_187_raw,smart_196_raw,smart_197_raw,Capacity (TB),# Drives
0,ST4000DM000,0.007381,0.01547,0.0,0.015066,4.0,29670.0
1,ST3000DM001,0.120719,0.177226,0.0,0.070205,3.0,1168.0
2,Hitachi HDS5C3030ALA630,0.032349,0.0,0.032132,0.012375,3.0,4606.0
3,Hitachi HDS722020ALA330,0.133675,0.0,0.133675,0.036515,2.0,4683.0
4,Hitachi HDS5C4040ALE630,0.015038,0.0,0.015414,0.007519,4.0,2660.0
5,HGST HMS5C4040ALE640,0.006032,0.0,0.006032,0.002665,4.0,7129.0
6,HGST HMS5C4040BLE640,0.000967,0.0,0.000967,0.002578,4.0,3103.0


In [8]:
# Display the saved statistics file for `model_name`, relabelling its 'Unnamed: 0'
# column as 'Metrics' (same label the aggregation cell below uses; was misspelled 'Matrics').
pd.read_csv('./preprocess/' + model_name + '_finalstatics.csv').rename({'Unnamed: 0': 'Metrics'}, axis='columns')

Unnamed: 0,Matrics,CART,SVM,NN,LR,RF
0,P,0.990476,0.834255,0.992,0.860375,0.992
1,R,0.871014,0.757971,0.888043,0.886957,0.90471
2,F,0.926405,0.754298,0.959316,0.849859,0.963351
3,Sd,0.030712,0.115433,0.027444,0.080583,0.032637


In [5]:
# Load the overview table and show its first column (the drive model names).
overview = pd.read_csv('./preprocess/overview.csv')
overview[overview.columns[0]]

0                ST4000DM000
1                ST3000DM001
2    Hitachi HDS5C3030ALA630
3    Hitachi HDS722020ALA330
4    Hitachi HDS5C4040ALE630
5       HGST HMS5C4040ALE640
6       HGST HMS5C4040BLE640
Name: Unnamed: 0, dtype: object

In [23]:
def render_mpl_table(data, col_width=3.0, row_height=0.625, font_size=14,
                     header_color='#40466e', row_colors=['#f1f1f2', 'w'], edge_color='w',
                     bbox=[0, 0, 1, 1], header_columns=0,
                     ax=None, **kwargs):
    """Draw a DataFrame as a styled matplotlib table.

    Parameters
    ----------
    data : pandas.DataFrame
        Table to render; ``data.values`` become the cell text and
        ``data.columns`` the header row.
    col_width, row_height : float
        Per-column / per-row size used to compute the figure size when a new
        figure is created (ignored when ``ax`` is given).
    font_size : int
        Fixed font size applied to every cell.
    header_color : color
        Face colour of header cells (row 0 and the first ``header_columns``
        columns); their text is drawn bold and white.
    row_colors : sequence of colors
        Face colours cycled over the remaining cells by row index.
        The default list is never mutated.
    edge_color : color
        Edge colour applied to every cell.
    bbox : sequence
        Forwarded to ``ax.table`` as ``bbox``. The default list is never mutated.
    header_columns : int
        Number of leading columns styled like the header row.
    ax : Axes-like, optional
        Axes to draw on. When ``None`` a new figure/axes is created with its
        axis decorations turned off.
    **kwargs
        Forwarded unchanged to ``ax.table``.

    Returns
    -------
    The axes the table was drawn on.
    """
    if ax is None:
        # Imported locally so the function works on a fresh kernel even though
        # the notebook's import cell does not bring `plt` into scope.
        import matplotlib.pyplot as plt

        # (n_cols, n_rows + 1 header row) scaled by the per-cell dimensions.
        size = (np.array(data.shape[::-1]) + np.array([0, 1])) * np.array([col_width, row_height])
        _, ax = plt.subplots(figsize=size)
        ax.axis('off')

    mpl_table = ax.table(cellText=data.values, bbox=bbox, colLabels=data.columns, **kwargs)

    # Disable auto-sizing so the explicit font size is respected.
    mpl_table.auto_set_font_size(False)
    mpl_table.set_fontsize(font_size)

    # get_celld() is the public accessor for the {(row, col): cell} mapping.
    for (row, col), cell in mpl_table.get_celld().items():
        cell.set_edgecolor(edge_color)
        if row == 0 or col < header_columns:
            cell.set_text_props(weight='bold', color='w')
            cell.set_facecolor(header_color)
        else:
            cell.set_facecolor(row_colors[row % len(row_colors)])
    return ax
# Stack every model's '<model>_finalstatics.csv' into one table indexed by (model, Metrics).
models = overview.iloc[:, 0]
model_list = []
# NOTE: the loop variable rebinds the notebook-level `model_name`.
for model_name in models:
    stats_path = './preprocess/' + model_name + '_finalstatics.csv'
    try:
        stats = pd.read_csv(stats_path).rename({'Unnamed: 0': 'Metrics'}, axis='columns')
    except (IOError, OSError, pd.errors.EmptyDataError, pd.errors.ParserError) as err:
        # Best-effort: a model without a readable statistics file is skipped,
        # but the skip is reported and unrelated errors are no longer hidden.
        print('skipping {}: {}'.format(model_name, err))
        continue
    stats['model'] = model_name
    model_list.append(stats)
df = pd.concat(model_list, axis=0)
df.set_index(['model', 'Metrics']).round(2)

Unnamed: 0_level_0,Unnamed: 1_level_0,CART,SVM,NN,LR,RF
model,Metrics,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
ST4000DM000,P,1.0,1.0,1.0,1.0,1.0
ST4000DM000,R,1.0,1.0,1.0,1.0,1.0
ST4000DM000,F,1.0,1.0,1.0,1.0,1.0
ST4000DM000,Sd,0.0,0.0,0.0,0.0,0.0
ST3000DM001,P,0.97,0.99,0.99,0.9,0.98
ST3000DM001,R,0.87,0.91,0.93,0.94,0.97
ST3000DM001,F,0.91,0.95,0.94,0.92,0.98
ST3000DM001,Sd,0.09,0.02,0.05,0.05,0.01
Hitachi HDS5C3030ALA630,P,1.0,1.0,1.0,1.0,1.0
Hitachi HDS5C3030ALA630,R,0.97,0.97,0.94,0.94,1.0


In [16]:
# Summary table with one row per drive model; filled in by the loop below.
df000 = pd.DataFrame(columns=['# total', '# failure', '# failure percentage'], index=models)
data_dirs = ['../data-2015/']
attributes = ['smart_1_raw', 'smart_4_raw', 'smart_5_raw', 'smart_7_raw', 'smart_9_raw', 'smart_12_raw', 'smart_187_raw', 'smart_193_raw', 'smart_194_raw', 'smart_197_raw', 'smart_199_raw']
drive_models = ['ST4000DM000', 'ST3000DM001', 'Hitachi HDS5C3030ALA630', 'Hitachi HDS722020ALA330', 'Hitachi HDS5C4040ALE630', 'HGST HMS5C4040ALE640']
features = ['serial_number', 'date', 'capacity_bytes', 'smart_5_raw', 'smart_1_raw', 'smart_4_raw', 'smart_7_raw', 'smart_9_raw', 'smart_12_raw', 'smart_187_raw', 'smart_193_raw', 'smart_194_raw', 'smart_197_raw', 'smart_199_raw', 'smart_196_raw']
# Read every SMART column (everything after the first three entries of `features`) as float32.
type_dict = {feature: np.float32 for feature in features[3:]}
# NOTE: the loop variable rebinds the notebook-level `model_name`, and `df` is overwritten.
for model_name in models:
    # Drop the first CSV column, order readings per drive by date, and discard all-NaN columns.
    drive_model = pd.read_csv('./preprocess/' + model_name + '.csv', dtype=type_dict).iloc[:, 1:]
    df = drive_model.sort_values(by=['serial_number', 'date']).dropna(how='all', axis='columns')
    df['date'] = pd.to_datetime(df['date'])
    # df2 carries the same readings with every date moved one day later, so its
    # weekly bins are offset by one day relative to df's.
    df2 = df.copy()
    df2['date'] += datetime.timedelta(days=1)
    day = 'W-MON'
    # Target source: last smart_5_raw reading per (drive, week) from the unshifted data.
    model_group = df.groupby(['serial_number', pd.Grouper(key='date', freq=day)])
    y = model_group['smart_5_raw'].last().to_frame().rename({'smart_5_raw': 'y'}, axis='columns')
    # Inputs: last reading of each available attribute per (drive, week) from the shifted data.
    model_group2 = df2.groupby(['serial_number', pd.Grouper(key='date', freq=day)])
    input1 = model_group2[[i for i in attributes if i in df2.columns]].last()

    training_set = y.join(other=input1, how='inner')
    # Label is 1 when the unshifted smart_5_raw exceeds the shifted-input smart_5_raw.
    training_set['y'] = np.where(training_set['y'] > training_set['smart_5_raw'], 1, 0)
    input2_features = ['smart_4_raw', 'smart_5_raw', 'smart_7_raw', 'smart_9_raw', 'smart_12_raw', 'smart_187_raw', 'smart_193_raw', 'smart_197_raw', 'smart_199_raw']
    # Row-to-row differences within each drive; rows with any NaN difference are dropped,
    # and the inner join below therefore drops them from the training set too.
    input2 = training_set.groupby(level=0)[[i for i in input2_features if i in training_set.columns]].transform(pd.DataFrame.diff).dropna(how='any')
    training_set = training_set.join(other=input2, how='inner', rsuffix='_increase')

    X_fail = training_set.loc[training_set['y'] == 1].drop(['y'], axis='columns').values

    n_total = training_set.shape[0]
    n_fail = X_fail.shape[0]
    # Single .loc[row, col] assignment: chained `df000.loc[row][col] = ...` writes to a
    # temporary object under pandas Copy-on-Write and would leave df000 unfilled.
    df000.loc[model_name, '# total'] = n_total
    df000.loc[model_name, '# failure'] = n_fail
    # Stored as a fraction rounded to 2 decimals; NaN when the model produced no samples.
    df000.loc[model_name, '# failure percentage'] = np.round(n_fail / n_total, decimals=2) if n_total else np.nan
df000

Unnamed: 0_level_0,# total,# failure,# failure percentage
Unnamed: 0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
ST4000DM000,970627,215,0.0
ST3000DM001,17637,118,0.01
Hitachi HDS5C3030ALA630,238029,107,0.0
Hitachi HDS722020ALA330,239624,145,0.0
Hitachi HDS5C4040ALE630,137497,27,0.0
HGST HMS5C4040ALE640,368313,123,0.0
HGST HMS5C4040BLE640,160979,1,0.0


In [18]:
# Show only the '# failure' counts by dropping the other two summary columns.
df000.drop(columns=['# failure percentage', '# total'])

Unnamed: 0_level_0,# failure
Unnamed: 0,Unnamed: 1_level_1
ST4000DM000,215
ST3000DM001,118
Hitachi HDS5C3030ALA630,107
Hitachi HDS722020ALA330,145
Hitachi HDS5C4040ALE630,27
HGST HMS5C4040ALE640,123
HGST HMS5C4040BLE640,1
