In [1]:
# Install package
!pip install pydrive

from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive 
from google.colab import auth 
from oauth2client.client import GoogleCredentials

# auth
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)

# Retrieve disk.csv from Google Drive link
file_id = '1mpo6HJ-VZwrrnNIxcNuT_GfiKRZgbEco'
downloaded = drive.CreateFile({'id': file_id})
downloaded.GetContentFile('disk.csv')



# Outline
1. Import Packages
2. Preprocessing
3. Sample Prepration
4. Build Model

# 1. Import Packages

In [2]:
import pandas as pd
import numpy as np

# Preprocessing
from sklearn.utils import shuffle
from sklearn.preprocessing import StandardScaler

# Techniques
from sklearn.model_selection import KFold, cross_validate, GridSearchCV

# Classifiers
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import LinearSVC

# Metrics
from sklearn.metrics import accuracy_score

seed = 42
n_splits = 3

# 2. Preprocessing

### Data Cleaning
1. Retrieve data from model "ST4000DM000"
2. Drop 'normalized' columns and fill in zeros to numerical columns
3. Size of final dataframe: (31414293, 49)

### Data Prepration
1. Calculate number of failed and not failed disks
2. Remove some non-failed disks to reduce dataset size(broken disks constitute only ~7% of all disks)
    - Num of disks: 36700
    - Num of failed disks: 2587
    - Percentage of broken disks: 7.05%
3. Ensure again that number of “not failed” samples corresponds roughly to number of “failed” samples
    - Num of failed disks: 2587
    - Num of OK disks: 2587
    - Size of final dataframe: (3579161, 49) --> Dataset size reduction: 88%
4. Set sample_days=5, predict_failure_days=5
    - sample_days: how many days we want to have in each sample
    - predict_failure_days: how many days before the failure day we treat as “fails soon “days
5. Remove sequences of data that are shorter than sample_days
6. Select failed samples and non-failed samples(both classes should be balanced)
    - failed samples: for which we know they finally failed
    - non-failed samples(random selection to ensure better variability in data)
        - samples related to disks that never failed
        - samples related to disks that will fail but "later"

# 3. Sample Prepration

### 3.1 Read CSV

In [3]:
df = pd.read_pickle('disk.csv')
print ('Total size of dataframe:', df.shape)

Total size of dataframe: (127280, 55)


In [4]:
df.head()

Unnamed: 0,date,weekday,serial_number,capacity_bytes,failure,fails_soon,seq_id,work_day,max_work_day,final_failure,smart_1_raw,smart_2_raw,smart_3_raw,smart_4_raw,smart_5_raw,smart_7_raw,smart_8_raw,smart_9_raw,smart_10_raw,smart_11_raw,smart_12_raw,smart_13_raw,smart_15_raw,smart_22_raw,smart_183_raw,smart_184_raw,smart_187_raw,smart_188_raw,smart_189_raw,smart_190_raw,smart_191_raw,smart_192_raw,smart_193_raw,smart_194_raw,smart_195_raw,smart_196_raw,smart_197_raw,smart_198_raw,smart_199_raw,smart_200_raw,smart_201_raw,smart_220_raw,smart_222_raw,smart_223_raw,smart_224_raw,smart_225_raw,smart_226_raw,smart_240_raw,smart_241_raw,smart_242_raw,smart_250_raw,smart_251_raw,smart_252_raw,smart_254_raw,smart_255_raw
0,2017-10-17,1,Z306MW5B,4000787030016,0,0,5212,419,427,1,166497128,0,0,4,0,339701174,0,10137,0,0,4,0,0,0,0,0,0,0,0,22,0,0,1738,22,0,0,0,0,0,0,0,0,0,0,0,0,0,10130,9856527033,13187289939,0,0,0,0,0
1,2017-10-18,2,Z306MW5B,4000787030016,0,0,5212,420,427,1,51905616,0,0,4,0,340455664,0,10160,0,0,4,0,0,0,0,0,0,0,0,22,0,0,1738,22,0,0,0,0,0,0,0,0,0,0,0,0,0,10154,9858620465,13220534904,0,0,0,0,0
2,2017-10-19,3,Z306MW5B,4000787030016,0,0,5212,421,427,1,147568504,0,0,4,0,341490395,0,10190,0,0,4,0,0,0,0,0,0,0,0,22,0,0,1738,22,0,0,0,0,0,0,0,0,0,0,0,0,0,10184,9861493167,13264240575,0,0,0,0,0
3,2017-10-20,4,Z306MW5B,4000787030016,0,0,5212,422,427,1,4599640,0,0,4,0,342234422,0,10214,0,0,4,0,0,0,0,0,0,0,0,22,0,0,1738,22,0,0,0,0,0,0,0,0,0,0,0,0,0,10207,9863800547,13292665872,0,0,0,0,0
4,2017-10-21,5,Z306MW5B,4000787030016,0,1,5212,423,427,1,95941328,0,0,4,0,342967290,0,10238,0,0,4,0,0,0,0,0,0,0,0,22,0,0,1738,22,0,0,0,0,0,0,0,0,0,0,0,0,0,10231,9865893335,13320760608,0,0,0,0,0


### 3.2 Data Splitting

In [5]:
# Retrieve unique "serial_number"
disks_list = shuffle(df['serial_number'].unique())

# Split Disks into train/test 
test_split = 0.2
split_position = int((1-test_split)*len(disks_list))
train_disks_list = disks_list[:split_position]
test_disks_list = disks_list[split_position:]
print ('Num of train disks:', len(train_disks_list))
print ('Num of test disks:', len(test_disks_list))
print ('Intersection between train/test:', set(train_disks_list).intersection(set(test_disks_list)))

Num of train disks: 4105
Num of test disks: 1027
Intersection between train/test: set()


In [6]:
df_train = df[df['serial_number'].isin(train_disks_list)]
df_test = df[df['serial_number'].isin(test_disks_list)]
print ('Total size of train dataframe:', df_train.shape)
print ('Total size of test dataframe:', df_test.shape)

Total size of train dataframe: (102000, 55)
Total size of test dataframe: (25280, 55)


### 3.3 Retrieve Indices Every "sample_days"

In [7]:
# "sample_days" defines how many days we want to have in each sample
sample_days = 5

# Train index
IDX_train = df_train.iloc[:,[0,2]].values
IDX_train = IDX_train[::sample_days]

# Test index
IDX_test = df_test.iloc[:,[0,2]].values
IDX_test = IDX_test[::sample_days]

### 3.4 Feature Engineering and Data Normalization
### 3.4.1 Drop redundant columns

In [8]:
drop_columns_list = ['date', 'serial_number', 'failure', 'fails_soon', 'seq_id', 'max_work_day', 'final_failure']
X_train = df_train.drop(columns=drop_columns_list)
X_test = df_test.drop(columns=drop_columns_list)

In [9]:
X_train.head()

Unnamed: 0,weekday,capacity_bytes,work_day,smart_1_raw,smart_2_raw,smart_3_raw,smart_4_raw,smart_5_raw,smart_7_raw,smart_8_raw,smart_9_raw,smart_10_raw,smart_11_raw,smart_12_raw,smart_13_raw,smart_15_raw,smart_22_raw,smart_183_raw,smart_184_raw,smart_187_raw,smart_188_raw,smart_189_raw,smart_190_raw,smart_191_raw,smart_192_raw,smart_193_raw,smart_194_raw,smart_195_raw,smart_196_raw,smart_197_raw,smart_198_raw,smart_199_raw,smart_200_raw,smart_201_raw,smart_220_raw,smart_222_raw,smart_223_raw,smart_224_raw,smart_225_raw,smart_226_raw,smart_240_raw,smart_241_raw,smart_242_raw,smart_250_raw,smart_251_raw,smart_252_raw,smart_254_raw,smart_255_raw
25,1,4000787030016,4,85166192,0,0,10,0,459924,0,90,0,0,10,0,0,0,0,0,0,0,0,25,0,3,799,25,0,0,0,0,0,0,0,0,0,0,0,0,0,80,9784146708,21216517,0,0,0,0,0
26,2,4000787030016,5,5087080,0,0,10,0,488781,0,115,0,0,10,0,0,0,0,0,0,0,0,27,0,3,1863,27,0,0,0,0,0,0,0,0,0,0,0,0,0,97,9784187973,1156500519,0,0,0,0,0
27,3,4000787030016,6,105506648,0,0,10,0,560882,0,139,0,0,10,0,0,0,0,0,0,0,0,27,0,3,1863,27,0,0,0,0,0,0,0,0,0,0,0,0,0,121,9784341495,5400088535,0,0,0,0,0
28,4,4000787030016,7,105684584,0,0,10,0,602250,0,163,0,0,10,0,0,0,0,0,0,0,0,25,0,3,2463,25,0,0,0,0,0,0,0,0,0,0,0,0,0,140,9784429668,7837950271,0,0,0,0,0
29,5,4000787030016,8,109906456,0,0,10,0,602326,0,187,0,0,10,0,0,0,0,0,0,0,0,25,0,3,3932,25,0,0,0,0,0,0,0,0,0,0,0,0,0,154,9784429740,7842079063,0,0,0,0,0


### 3.4.2 Specify Features and Labels

In [10]:
Y_train = df_train.iloc[:,5]
Y_test = df_test.iloc[:,5]   # Retrieve column "fails_soon" as label

In [11]:
Y_train.head()

25    0
26    0
27    0
28    0
29    1
Name: fails_soon, dtype: int64

### 3.4.3 Data Standardization

In [12]:
# Cast X train to float64, data norm, and reshape
X_train = X_train.values
X_train = X_train.astype(np.float64)
standard_scaler = StandardScaler().fit(X_train)
X_train = standard_scaler.transform(X_train)
X_train = X_train.reshape(int(X_train.shape[0]/sample_days), sample_days, X_train.shape[1])
print ('Shape of train X:', X_train.shape)

# Cast X test to float64, data norm, and reshape
X_test = X_test.values
X_test = X_test.astype(np.float64)
X_test = standard_scaler.transform(X_test)
X_test = X_test.reshape(int(X_test.shape[0]/sample_days), sample_days, X_test.shape[1])
print ('Shape of test X:', X_test.shape)

# Retrieve Y train every sample_days
Y_train = Y_train.values
Y_train = Y_train[sample_days-1::sample_days]
print ('Shape of train Y:', Y_train.shape)

# Retrieve Y test every sample_days
Y_test = Y_test.values
Y_test = Y_test[sample_days-1::sample_days]
print ('Shape of test Y:', Y_test.shape)

print ('Failed disks in train:', (Y_train == 1).sum())
print ('Non-failed disks in train:', (Y_train == 0).sum())
print ('Failed disks in test:', (Y_test == 1).sum())
print ('Non-failed disks in test:', (Y_test == 0).sum())

Shape of train X: (20400, 5, 48)
Shape of test X: (5056, 5, 48)
Shape of train Y: (20400,)
Shape of test Y: (5056,)
Failed disks in train: 10187
Non-failed disks in train: 10213
Failed disks in test: 2541
Non-failed disks in test: 2515


### 3.5 Shuffle Data

In [13]:
IDX_train, X_train, Y_train = shuffle(IDX_train, X_train, Y_train)
IDX_test, X_test, Y_test = shuffle(IDX_test, X_test, Y_test)

# 4. Build Model

### 4.1 Transfrom Sequential Data to Non-Sequential Handcrafted Features

In [14]:
# Concat mean, min, max, and std as features
X_train_flatten, X_test_flatten = [], []

for row in X_train:
    out = np.concatenate((row.mean(0), row.min(0), row.max(0), row.std(0)), axis=0).reshape(4*row.shape[1])
    X_train_flatten.append(out)  

for row in X_test:
    out = np.concatenate((row.mean(0), row.min(0), row.max(0), row.std(0)), axis=0).reshape(4*row.shape[1])
    X_test_flatten.append(out)  

X_train_flatten = np.stack(X_train_flatten)
X_test_flatten = np.stack(X_test_flatten)
print ('Shape of flattened train X:', X_train_flatten.shape)
print ('Shape of flattened test X:', X_test_flatten.shape)

Shape of flattened train X: (20400, 192)
Shape of flattened test X: (5056, 192)


### 4.2 Try to write your classifier

In [15]:
### Write K-Fold cross validation
### Ref: https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.KFold.html
# kfold = KFold(...)

### Define your classifier(such as random forest)
### Ref: https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html
# model = RandomForestClassifier(...)

### Use K-Fold cross validation to prevent model from overfitting
### Ref: https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.cross_validate.html
# results = cross_validate(...)
# print ('Best accuracy on fold {}: {:.5f}'.format(results['test_score'].argmax(), results['test_score'].max()))

### Fit model using the entire training set, including features and labels
### See "Methods" parts of ref

### Test on testing data
### See "Methods" parts of ref

### Show accuracy, precision, recall, ..., whatever you want to measure
### Ref: https://scikit-learn.org/stable/modules/model_evaluation.html
# print (accuracy_score(...), precision_score(...), ...)

### 4.3 Explore Hyperparameters using Grid Search

In [16]:
# K-Fold cross validation
kfold = KFold(n_splits=n_splits, shuffle=False, random_state=seed)

# Define classifier
model = RandomForestClassifier(random_state=seed)

# Search hparams in this grid
forest_params = {
    'max_features': ['auto'],
    'n_estimators': [100, 200],
    'criterion': ['entropy'],
    'min_samples_leaf': [1],
}

# Find the best hparams based on the given grid
forest_model = GridSearchCV(estimator=model, param_grid=forest_params, cv=kfold, scoring='accuracy')

# Use the best hparams to fit on the training set
forest_model.fit(X_train_flatten, Y_train)
print ('Score of the best model:', forest_model.best_score_)  
print ('Params of the best model:', forest_model.best_params_) 
# print ('Params of the best model:\n', forest_model.best_estimator_)
# print ('Results of cross validation:\n', forest_model.cv_results_)

# Run on testing data
print ('Accuracy on testing set:', forest_model.score(X_test_flatten, Y_test))



Score of the best model: 0.9259803921568627
Params of the best model: {'criterion': 'entropy', 'max_features': 'auto', 'min_samples_leaf': 1, 'n_estimators': 200}
Accuracy on testing set: 0.7828322784810127
