In [1]:
import numpy as np
import pandas as pd
import os
from importlib import reload
import find_cpt
from rgf.sklearn import RGFClassifier
from sklearn.cluster import KMeans
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.svm import SVC
from sklearn.model_selection import KFold, cross_val_score, RandomizedSearchCV, GridSearchCV
from sklearn.naive_bayes import GaussianNB, MultinomialNB, BernoulliNB
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import precision_score, recall_score, f1_score
data_dir = "./data-2015"

In [2]:
def model_stat(model, X, y):
    f_score = np.average(cross_val_score(model, X, y, cv=5, scoring='f1', n_jobs=-1))
    r_score = np.average(cross_val_score(model, X, y, cv=5, scoring='recall', n_jobs=-1))
    p_score = np.average(cross_val_score(model, X, y, cv=5, scoring='precision', n_jobs=-1))
    print('f1 score:' + str(f_score))
    print('recall: ' + str(r_score))
    print('precision: ' + str(p_score))

In [3]:
# Transfer learning!
test = pd.read_csv('./data-2015/2015-01-01.csv')
SgtB_features = list(test.loc[test['model'] == 'ST31500541AS'].dropna(axis=1, how='all').columns.values)
def process_SgtB(df, name):
    df = df.loc[df['model'] == name]
    return df[SgtB_features]
type_dict = {feature: np.float32 for feature in SgtB_features[5:]}
SgtB = pd.concat([process_SgtB(pd.read_csv(os.path.join('./data-2015/', filename), dtype=type_dict), 'ST31500541AS') for filename in os.listdir(data_dir)])
print(SgtB.shape)

# get fail name in B
fail_names_SgtB = SgtB.loc[SgtB['failure'] == 1]['serial_number'].unique()
print(fail_names_SgtB.size)
print(fail_names_SgtB.size / SgtB['serial_number'].unique().size)

(366831, 49)
109
0.06438275251033668


In [4]:
# prepare features
test = pd.read_csv('./data-2015/2015-01-01.csv')
SgtB_features = list(test.loc[test['model'] == 'ST31500541AS'].dropna(axis=1, how='all').columns.values)
summarize = pd.read_csv('./preprocess/summarize.csv')
features_percent = summarize.T[0].iloc[1:]
selected_features = features_percent[features_percent > 0.01].sort_values(ascending=False)
# print(selected_features)
# irrelevent_features = ['smart_9_raw', 'smart_9_normalized', 'smart_4_raw', 'smart_4_normalized', 'smart_12_raw', 'smart_12_normalized']
# TR_selected_features = [i for i in selected_features.index.values if i in SgtB_features and i not in irrelevent_features]
TR_selected_features = [i for i in selected_features.index.values if i in SgtB_features]
print(len(TR_selected_features))

32


In [5]:
compacted_info_SgtA = pd.read_csv('./preprocess/compacted.csv')
fail_names_SgtA = compacted_info_SgtA.loc[compacted_info_SgtA['failure'] == 1]['serial_number'].unique()
compacted_SgtA = compacted_info_SgtA[TR_selected_features]
compacted_SgtA[['serial_number', 'failure']] = compacted_info_SgtA[['serial_number', 'failure']]
compacted_SgtA.to_csv('./preprocess/TR_compacted_SgtA.csv')
print(compacted_SgtA.shape)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self[k1] = value[k2]


(29670, 34)


In [6]:
# Compact info
def get_cmpt_info_SgtB(data):
    return pd.ewma(data.values, span=np.round(summarize.iloc[1][data.name]))[-1]
functions_group2 = {n: get_cmpt_info_SgtB for n in TR_selected_features}
# print(functions_group1)
compacted_SgtB = SgtB.groupby('serial_number', as_index=False).agg(functions_group2)
compacted_SgtB['failure'] = compacted_SgtB.apply(lambda row: 1 if row['serial_number'] in fail_names_SgtB else 0, axis=1)
compacted_SgtB.to_csv('./preprocess/TR_compacted_SgtB.csv')
print(compacted_SgtB.shape)

  This is separate from the ipykernel package so we can avoid doing imports until


(1693, 34)


In [29]:
compacted_SgtA = pd.read_csv('./preprocess/TR_compacted_SgtA.csv').drop(['Unnamed: 0'], axis=1)
compacted_SgtB = pd.read_csv('./preprocess/TR_compacted_SgtB.csv').drop(['Unnamed: 0'], axis=1)
compacted_SgtA = compacted_SgtA[TR_selected_features + ['serial_number', 'failure']]
compacted_SgtB = compacted_SgtB[TR_selected_features + ['serial_number', 'failure']]

fail_names_SgtA = compacted_SgtA.loc[compacted_SgtA['failure'] == 1]['serial_number'].unique()
fail_names_SgtB = compacted_SgtB.loc[compacted_SgtB['failure'] == 1]['serial_number'].unique()

# TR_selected_features = compacted_SgtB.columns.drop(['serial_number', 'failure'])

In [26]:
TR_selected_features =['smart_1_normalized',
 'smart_1_raw',
 'smart_3_normalized',
 'smart_4_raw',
 'smart_5_raw',
 'smart_7_normalized',
 'smart_7_raw',
 'smart_9_normalized',
 'smart_9_raw',
 'smart_12_raw',
 'smart_183_normalized',
 'smart_183_raw',
 'smart_187_normalized',
 'smart_187_raw',
 'smart_190_normalized',
 'smart_190_raw',
 'smart_194_normalized',
 'smart_194_raw',
 'smart_197_raw',
 'smart_198_raw',
 'smart_240_raw',
 'smart_241_raw']

In [30]:
print(compacted_SgtB.shape)
print(compacted_SgtA.shape)
print(len(TR_selected_features))

(1693, 24)
(29670, 24)
22


In [31]:
# prepare model
TR_X_health = compacted_SgtA.loc[~compacted_SgtA['serial_number'].isin(fail_names_SgtA)].drop(['serial_number', 'failure'], axis=1).values
kmeans = KMeans(n_clusters=150, random_state=0, n_jobs=-1).fit(TR_X_health)

In [32]:
TR_X_health_transformed = np.concatenate([TR_X_health[np.argsort(kmeans.transform(TR_X_health)[:, j])[::-1][:10]] for j in range(0, 100)], axis=0)
TR_X_failed = compacted_SgtA.loc[compacted_SgtA['serial_number'].isin(fail_names_SgtA)].drop(['serial_number', 'failure'], axis=1).values
TR_X_train = np.concatenate((TR_X_health_transformed, TR_X_failed), axis=0)
TR_y_train = np.concatenate((np.zeros(TR_X_health_transformed.shape[0]), np.ones(TR_X_failed.shape[0])), axis=0) 
print(TR_X_train.shape)
print(TR_y_train.shape)

(1586, 22)
(1586,)


In [33]:
TR_X_failed.shape

(586, 22)

In [34]:
# tune RGF
search_grid = {
    'max_leaf': [1000],
    'algorithm': ['RGF_Sib'],
    'test_interval': [100],
    'loss': ['Log']
}

TR_rgf_model = RGFClassifier()
TR_rgf_grid = GridSearchCV(estimator= TR_rgf_model, param_grid=search_grid, 
    cv=5, scoring='f1', n_jobs=-1, verbose=2)
TR_rgf_grid.fit(TR_X_train, TR_y_train)
model_stat(TR_rgf_model, TR_X_train, TR_y_train)

Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:    2.3s remaining:    3.5s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    3.3s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    3.3s finished


f1 score:0.9948348265944833
recall: 0.9897725626539187
precision: 1.0


In [35]:
# test
TR_X_test = compacted_SgtB.drop(['serial_number', 'failure'], axis=1).values
TR_y_test = np.array([1 if sn in fail_names_SgtB else 0 for sn in compacted_SgtB['serial_number'].values])
print(TR_X_test.shape)
print(TR_y_test.shape)
print(np.sum(TR_y_test))

(1693, 22)
(1693,)
109


In [36]:
TR_rgf_model = RGFClassifier(max_leaf=1000, algorithm='RGF_Sib', test_interval=100, loss='Log')
TR_rgf_model.fit(TR_X_train, TR_y_train)
TR_y_pred = TR_rgf_model.predict(TR_X_test)
print('f1: ' + str(f1_score(TR_y_test, TR_y_pred)))
print('recall: ' + str(recall_score(TR_y_test, TR_y_pred)))
print('precision: ' + str(precision_score(TR_y_test, TR_y_pred)))

f1: 0.11678004535147393
recall: 0.944954128440367
precision: 0.062235649546827795


In [37]:
# tune SVM
search_grid = {
    'C': [1.05], 
    'kernel': ['rbf'],
    'gamma': [0.05],
    'class_weight': ['balanced'],
    'max_iter': [-1],
    'random_state': [0]
}
svm_model = SVC()
svm_grid = GridSearchCV(estimator=svm_model, param_grid=search_grid, 
    cv=5, scoring='f1', n_jobs=-1, verbose=2)
svm_grid.fit(TR_X_train, TR_y_train)
model_stat(svm_model, TR_X_train, TR_y_train)
print(svm_grid.best_params_)

Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:    1.4s remaining:    2.1s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    2.1s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    2.1s finished


f1 score:0.99915611814346
recall: 1.0
precision: 0.9983193277310924
{'C': 1.05, 'class_weight': 'balanced', 'gamma': 0.05, 'kernel': 'rbf', 'max_iter': -1, 'random_state': 0}


In [38]:
TR_svm_model = svm_model = SVC(C=1.05, class_weight='balanced', gamma=0.05, max_iter=-1, random_state=0)
TR_svm_model.fit(TR_X_train, TR_y_train)
TR_y_pred = TR_svm_model.predict(TR_X_test)
print('f1: ' + str(f1_score(TR_y_test, TR_y_pred)))
print('recall: ' + str(recall_score(TR_y_test, TR_y_pred)))
print('precision: ' + str(precision_score(TR_y_test, TR_y_pred)))

f1: 0.12097669256381798
recall: 1.0
precision: 0.06438275251033668


In [47]:
# do transfer learning
TR_X_health = compacted_SgtA.loc[~compacted_SgtA['serial_number'].isin(fail_names_SgtA)].drop(['serial_number', 'failure'], axis=1).values

X_all = np.concatenate([TR_X_health, TR_X_test], axis=0)
y_all = np.concatenate([np.zeros(TR_X_health.shape[0]), np.ones(TR_X_test.shape[0])], axis=0)
print(X_all.shape)
print(y_all.shape)

(30777, 22)
(30777,)


In [None]:
TR_svm_model = SVC(probability=True)
model_stat(TR_svm_model, X_all, y_all)

In [None]:
TR_svm_model.fit(X_all, y_all)
prob = TR_svm_model.predict_proba(TR_X_health_transformed)[:, 1]
print(prob)
# X_train = TR_X_train[np.argsort(prob)[:500]]
# y_train = TR_y_train[np.argsort(prob)[:500]]
# sum(y_train)
# TR_X_health_transformed = np.concatenate([TR_X_health[np.argsort(kmeans.transform(TR_X_health)[:, j])[::-1][:10]] for j in range(0, 100)], axis=0)