In [1]:
import numpy as np
import pandas as pd
import os
from importlib import reload
import find_cpt
from rgf.sklearn import RGFClassifier
from sklearn.cluster import KMeans
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.svm import SVC
from sklearn.model_selection import KFold, cross_val_score, RandomizedSearchCV, GridSearchCV
from sklearn.naive_bayes import GaussianNB, MultinomialNB, BernoulliNB
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import precision_score, recall_score, f1_score
data_dir = "./data"

In [65]:
def model_stat(model, X, y):
    f_score = np.average(cross_val_score(model, X, y, cv=5, scoring='f1', n_jobs=-1))
    r_score = np.average(cross_val_score(model, X, y, cv=5, scoring='recall', n_jobs=-1))
    p_score = np.average(cross_val_score(model, X, y, cv=5, scoring='precision', n_jobs=-1))
    print(f_score)
    print(r_score)
    print(p_score)

In [4]:
summarize = pd.read_csv('./preprocess/summarize.csv')

In [2]:
# Transfer learning!
test = pd.read_csv('./data/2017-01-01.csv')
SgtB_features = list(test.loc[test['model'] == 'ST8000DM002'].dropna(axis=1, how='all').columns.values)
def process_SgtB(df, name):
    df = df.loc[df['model'] == name]
    return df[SgtB_features]
type_dict = {feature: np.float32 for feature in SgtB_features[5:]}
SgtB = pd.concat([process_SgtB(pd.read_csv(os.path.join('./data/', filename), dtype=type_dict), 'ST8000DM002') for filename in os.listdir(data_dir)])
print(SgtB.shape)

# get fail name in B
fail_names_SgtB = SgtB.loc[SgtB['failure'] == 1]['serial_number'].unique()
print(fail_names_SgtB.size)
print(fail_names_SgtB.size / SgtB['serial_number'].unique().size)

(4412267, 53)
114
0.011396581025692293


In [16]:
print(SgtB_features)

['date', 'serial_number', 'model', 'capacity_bytes', 'failure', 'smart_1_normalized', 'smart_1_raw', 'smart_3_normalized', 'smart_3_raw', 'smart_4_normalized', 'smart_4_raw', 'smart_5_normalized', 'smart_5_raw', 'smart_7_normalized', 'smart_7_raw', 'smart_9_normalized', 'smart_9_raw', 'smart_10_normalized', 'smart_10_raw', 'smart_12_normalized', 'smart_12_raw', 'smart_184_normalized', 'smart_184_raw', 'smart_187_normalized', 'smart_187_raw', 'smart_188_normalized', 'smart_188_raw', 'smart_189_normalized', 'smart_189_raw', 'smart_190_normalized', 'smart_190_raw', 'smart_191_normalized', 'smart_191_raw', 'smart_192_normalized', 'smart_192_raw', 'smart_193_normalized', 'smart_193_raw', 'smart_194_normalized', 'smart_194_raw', 'smart_195_normalized', 'smart_195_raw', 'smart_197_normalized', 'smart_197_raw', 'smart_198_normalized', 'smart_198_raw', 'smart_199_normalized', 'smart_199_raw', 'smart_240_normalized', 'smart_240_raw', 'smart_241_normalized', 'smart_241_raw', 'smart_242_normalized

In [42]:
test = pd.read_csv('./data/2017-01-01.csv')
SgtA_features = list(test.loc[test['model'] == 'ST4000DM000'].dropna(axis=1, how='all').columns.values)
def process_SgtA(df, name):
    df = df.loc[df['model'] == name]
    return df[SgtA_features]
type_dict = {feature: np.float32 for feature in SgtA_features[5:]}
SgtA = pd.concat([process_SgtA(pd.read_csv(os.path.join(data_dir, filename), dtype=type_dict), 'ST4000DM000') for filename in os.listdir(data_dir)])
fail_names_SgtA = SgtA.loc[SgtA['failure'] == 1]['serial_number'].unique()
print(fail_names_SgtA.size)

1236


In [18]:
# prepare features
features_percent = summarize.T[0].iloc[1:]
selected_features = features_percent[features_percent > 0.01].sort_values(ascending=False)
# print(selected_features)

TR_selected_features = [i for i in selected_features.index.values if i in SgtB_features]
print(TR_selected_features)

31


In [37]:
compacted_info_SgtA = pd.read_csv('./preprocess/compacted.csv')
compacted_SgtA = compacted_info_SgtA[TR_selected_features]
compacted_SgtA['serial_number'] = compacted_info_SgtA['serial_number']
compacted_SgtA.to_csv('./preprocess/TR_compacted_SgtA.csv')
print(compacted_SgtA.shape)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until


(35209, 32)


In [28]:
# Compact info
def get_cmpt_info_SgtB(data):
    return pd.ewma(data.values, span=np.round(summarize.iloc[1][data.name]))[-1]
functions_group2 = {n: get_cmpt_info_SgtB for n in TR_selected_features}
# print(functions_group1)
compacted_SgtB = SgtB.groupby('serial_number', as_index=False).agg(functions_group2)
compacted_SgtB.to_csv('./preprocess/TR_compacted_SgtB.csv')
print(compacted_SgtB.shape)

  This is separate from the ipykernel package so we can avoid doing imports until


NameError: name 'compacted_SgtB' is not defined

In [38]:
print(compacted_SgtB.shape)
print(compacted_SgtA.shape)
print(len(TR_selected_features))

(10003, 32)
(35209, 32)
31


In [52]:
# prepare model
TR_X_health = compacted_SgtA.loc[~compacted_SgtA['serial_number'].isin(fail_names_SgtA)].drop('serial_number', axis=1).values
kmeans = KMeans(n_clusters=150, random_state=0, n_jobs=-1).fit(TR_X_health)

In [63]:
TR_X_health_transformed = np.concatenate([TR_X_health[np.argsort(kmeans.transform(TR_X_health)[:, j])[::-1][:10]] for j in range(0, 150)], axis=0)
TR_X_failed = compacted_SgtA.loc[compacted_SgtA['serial_number'].isin(fail_names_SgtA)].drop('serial_number', axis=1).values
TR_X_failed.shape
TR_X_train = np.concatenate((TR_X_health_transformed, TR_X_failed), axis=0)
TR_y_train = np.concatenate((np.zeros(TR_X_health_transformed.shape[0]), np.ones(TR_X_failed.shape[0])), axis=0) 
print(TR_X_train.shape)
print(TR_y_train.shape)

(2736, 31)
(2736,)


In [66]:
# tune RGF
search_grid = {
    'max_leaf': [1000],
    'algorithm': ['RGF_Sib'],
    'test_interval': [100],
    'loss': ['Log']
}

TR_rgf_model = RGFClassifier()
TR_rgf_grid = GridSearchCV(estimator= TR_rgf_model, param_grid=search_grid, 
    cv=5, scoring='f1', n_jobs=-1, verbose=2)
TR_rgf_grid.fit(TR_X_train, TR_y_train)
model_stat(TR_rgf_model, TR_X_train, TR_y_train)

Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:    3.4s remaining:    5.1s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    4.2s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    4.2s finished


0.9888837173463948
0.9910963823951938
0.9871212121212121


In [70]:
# test
TR_X_test = compacted_SgtB.drop('serial_number', axis=1).values
TR_y_test = np.array([1 if sn in fail_names_SgtB else 0 for sn in compacted_SgtB['serial_number'].values])
print(TR_X_test.shape)
print(TR_y_test.shape)
print(np.sum(TR_y_test))

(10003, 31)
(10003,)
114


In [78]:
TR_rgf_model = RGFClassifier(max_leaf=1000, algorithm='RGF_Sib', test_interval=100, loss='Log')
TR_rgf_model.fit(TR_X_train, TR_y_train)
TR_y_pred = TR_rgf_model.predict(TR_X_test)
print(f1_score(TR_y_test, TR_y_pred))
print(recall_score(TR_y_test, TR_y_pred))
print(precision_score(TR_y_test, TR_y_pred))

0.022347473548897462
0.9912280701754386
0.011301130113011301


In [85]:
print(np.sum(TR_y_pred))

9999.0
