<font size = 6 color = green><b> Predictive Maintenance / 智能性维护实例 </b></font>
# Menu A-a: Load Data / 读取数据 

# Libraries / 工具库

In [1]:
import os
import re
import numpy as np 
from pathlib import Path 
import zipfile
import pandas as pd
import warnings

# Silence every warning for the rest of the session so cell outputs stay clean.
# NOTE(review): this also hides deprecation warnings from pandas/numpy.
warnings.filterwarnings('ignore')
warnings.simplefilter('ignore')

## Pre-requisite / 准备工作
* Download / 下载压缩数据 from https://onedrive.live.com/?cid=7CAD6DA55D313592&id=7CAD6DA55D313592%21159&parId=7CAD6DA55D313592%21158&o=OneUp 
* Save CMAPSS zipfile to  C:/pdm/zipraw / 把下载的压缩文件存放在 C:/pdm/zipraw    
  （ or / 或 d:/pdm/zipraw）
* Unzipped data will be stored in raw data folder 


## Prepare folders / 准备文件夹

In [2]:
def get_file_paths(data_parent_folder = None):
    """Build the dictionary of folder locations used by this notebook.

    Args:
        data_parent_folder: Base folder that holds the data sub-folders. Any
            falsy value (``None``, empty string) selects the parent directory
            of the current working directory.

    Returns:
        dict with the keys ``"parent_folder"``, ``"raw_data_path"``,
        ``"zip_data_path"`` and ``"unzip_to_path"``. The last three are the
        base folder with ``/raw_data``, ``/zipraw`` and ``/raw_data`` appended,
        so archives are unzipped straight into the raw-data folder.
    """
    base = data_parent_folder or os.path.dirname(os.getcwd())
    sub_folders = (
        ("raw_data_path", "/raw_data"),
        ("zip_data_path", "/zipraw"),
        ("unzip_to_path", "/raw_data"),
    )
    file_paths = {"parent_folder": base}
    for key, suffix in sub_folders:
        file_paths[key] = base + suffix
    return file_paths

### execute for this notebook/执行

In [3]:

# Resolve the folder layout once, using the default base folder (the parent of
# the current working directory), and print it for a quick visual check.
FILE_PATHS = get_file_paths()
print(FILE_PATHS)

{'parent_folder': 'c:\\classes\\pdm', 'raw_data_path': 'c:\\classes\\pdm/raw_data', 'zip_data_path': 'c:\\classes\\pdm/zipraw', 'unzip_to_path': 'c:\\classes\\pdm/raw_data'}


## Unzip Utility / 解压
* Use zipfile library to unzip / 用 zipfile 工具包解压

In [4]:
def unzip_files(zip_file_name = None,  remove_zipped = False):
    """Extract a zip archive into the raw-data folder and list that folder.

    Args:
        zip_file_name: Path of the archive to extract. A falsy value selects
            ``CMAPSS.zip`` inside ``FILE_PATHS["zip_data_path"]``.
        remove_zipped: When true, delete the archive after extraction.

    Returns:
        list of '/'-joined paths, one per entry found in
        ``FILE_PATHS["unzip_to_path"]`` after extraction. The folder is listed
        as a whole, so entries that were already there are included too.

    Raises:
        FileNotFoundError: if ``zip_file_name`` is a path that does not point
            to an existing file.
    """
    zip_folder = FILE_PATHS["zip_data_path"]
    unzip_folder = FILE_PATHS["unzip_to_path"]

    if not zip_file_name:
        zip_file_name = f'{zip_folder}/CMAPSS.zip'

    # Create the download folder so the user has somewhere to drop the archive.
    if not os.path.exists(zip_folder):
        os.makedirs(zip_folder)

    # Fail early with an actionable message instead of the bare ZipFile error.
    # Only path-like arguments are checked; file objects go straight to ZipFile.
    if isinstance(zip_file_name, (str, os.PathLike)) and not os.path.isfile(zip_file_name):
        raise FileNotFoundError(
            f"Zip archive not found: {zip_file_name}. "
            f"Download it and save it in {zip_folder} first."
        )

    with zipfile.ZipFile(zip_file_name, 'r') as zip_ref:
        zip_ref.extractall(unzip_folder)

    if remove_zipped:
        os.remove(zip_file_name)

    return [unzip_folder + "/" + file for file in os.listdir(unzip_folder)]


## File System Manipulations / 文件处理
* Use regex / 用Regex

In [5]:
def list_data_files():
    """Return the '/'-joined path of every entry in ``FILE_PATHS["raw_data_path"]``.

    Entries are returned in the order ``os.listdir`` yields them; no filtering
    is applied.
    """
    raw_folder = FILE_PATHS["raw_data_path"]
    paths = []
    for entry in os.listdir(raw_folder):
        paths.append(raw_folder + "/" + entry)
    return paths

In [6]:
def get_files_regex(file_name_str = "test"):
    """Select the raw data files whose file name contains a given pattern.

    A file is selected when its base name contains ``file_name_str`` followed,
    after at least one more character, by ``gz``. Only the base name is tested,
    so a directory such as ``.../training/raw_data`` cannot make unrelated
    files match.

    Args:
        file_name_str: Text inserted verbatim into the regular expression, so
            regex metacharacters keep their special meaning.

    Returns:
        list of matching paths, in the order ``list_data_files`` returns them.
    """
    # ".*" (not ".+") because the base name may start with the searched text,
    # e.g. "test_FD001.txt.gz".
    pattern = re.compile(f".*{file_name_str}.+gz")
    return [
        path
        for path in list_data_files()
        if pattern.match(os.path.basename(path))
    ]

## Load to dataframe / 读取到 pandas DataFrames

In [7]:
def read_data_files(file_name_str = "train", use_pd = True, sep = " ", columns = None):
    """Load every matching gzip data file and stack them into one DataFrame.

    Args:
        file_name_str: Pattern passed to ``get_files_regex`` to pick the files.
        use_pd: When true, parse with ``pd.read_csv``; otherwise with
            ``np.loadtxt``.
        sep: Column delimiter handed to ``pd.read_csv`` (unused on the
            ``np.loadtxt`` path).
        columns: Column names. A falsy value selects the default 26 names:
            ``id``, ``cycle``, ``op1``-``op3`` and ``sensor1``-``sensor21``.

    Returns:
        DataFrame with the requested columns plus a ``"Flag"`` column holding
        the ``FD###`` tag found in each file's path. Rows keep their per-file
        index (the index is not reset). An empty DataFrame is returned when no
        file matches.

    Raises:
        IndexError: if a matching path contains no ``FD###`` tag.
    """
    if not columns:
        columns = ["id", "cycle", "op1", "op2", "op3"] + [f"sensor{i}" for i in range(1, 22)]

    frames = []
    for f in get_files_regex(file_name_str = file_name_str):
        if use_pd:
            # index_col=False keeps the first column as data even if a row
            # carries extra trailing fields.
            df_ = pd.read_csv(f, compression='gzip', index_col = False, names = columns, sep = sep)
        else:
            df_ = pd.DataFrame(np.loadtxt(f), columns = columns)
        df_[["id", "cycle"]] = df_[["id", "cycle"]].astype(int)

        # Tag each row with its source sub-dataset, taken from the file path.
        df_["Flag"] = re.findall(r"FD\d{3}", str(f))[0]
        frames.append(df_)

    if not frames:
        return pd.DataFrame()
    # Concatenate once instead of re-copying the accumulated frame per file.
    return pd.concat(frames, axis = 0)



In [8]:
def read_result(file_name_str = "RUL_FD", use_pd = True, sep = " ", columns = None):
    """Load every matching gzip result file and stack them into one DataFrame.

    Args:
        file_name_str: Pattern passed to ``get_files_regex`` to pick the files.
        use_pd: When true, parse with ``pd.read_csv``; otherwise with
            ``np.loadtxt``.
        sep: Column delimiter handed to ``pd.read_csv``.
        columns: Column names. A falsy value selects ``["rul"]``.

    Returns:
        DataFrame with the requested columns plus a ``"Flag"`` column holding
        the ``FD###`` tag found in each file's path. Rows keep their per-file
        index. An empty DataFrame is returned when no file matches.
    """
    matched_files = get_files_regex(file_name_str = file_name_str)
    result_columns = columns if columns else ["rul"]

    combined = pd.DataFrame()
    for path in matched_files:
        if use_pd:
            frame = pd.read_csv(path, compression='gzip', index_col = False, names = result_columns, sep = sep)
        else:
            frame = pd.DataFrame(np.loadtxt(path), columns = result_columns)

        # Tag each row with its source sub-dataset, taken from the file path.
        frame["Flag"] = re.findall(r"FD\d{3}", str(path))[0]

        combined = frame.copy() if combined.empty else pd.concat([combined, frame], axis = 0)
    return combined
    

## Sum Up: prepare train, test and result file / 主程序

In [9]:
def prepare_dfs(use_pd = True, sep = " "):
    """Load the train, test and result files and derive remaining-life columns.

    Args:
        use_pd: Forwarded to the loaders; selects ``pd.read_csv`` over
            ``np.loadtxt``.
        sep: Column delimiter forwarded to all three loaders.

    Returns:
        Tuple ``(df_train, df_test, df_result)``:

        * ``df_train`` - the training rows as loaded.
        * ``df_test`` - the test rows plus ``rul_failed`` and
          ``remaining_rul`` (``rul_failed`` minus the row's ``cycle``).
        * ``df_result`` - one row per ``(Flag, id)`` with ``rul``, the last
          observed ``cycle`` and ``rul_failed`` (their sum).

        Both merges are inner joins, so ``(Flag, id)`` pairs missing on either
        side are dropped.
    """
    columns = ["id", "cycle", "op1", "op2", "op3"] + [f"sensor{i}" for i in range(1, 22)]

    # read_data_files already casts "id" and "cycle" to int, so no re-cast here.
    df_train = read_data_files(file_name_str = "train", use_pd = use_pd, sep = sep, columns = columns)
    df_test = read_data_files(file_name_str = "test", use_pd = use_pd, sep = sep, columns = columns)

    result_columns = ["rul"]
    df_result = read_result(file_name_str = "RUL_FD",
                            use_pd = use_pd, sep = sep, columns = result_columns)

    # Last observed cycle of every unit in the test data.
    df_max = df_test.groupby(["Flag", "id"])["cycle"].max().reset_index()

    # Number the result rows 1..n within each Flag, using the per-file row index.
    # NOTE(review): assumes the n-th row of a result file belongs to unit id n - confirm.
    df_result = df_result.reset_index()
    df_result["id"] = df_result.groupby("Flag")["index"].rank("first", ascending = True).astype(int)
    df_result = df_result.drop(columns = ["index"])

    df_result = df_result.merge(df_max, on = ["Flag", "id"], how = "inner")

    # Cycle at which the unit is expected to fail: last observed cycle + rul.
    df_result["rul_failed"] = df_result["rul"] + df_result["cycle"]

    df_test = df_test.merge(df_result[["rul_failed", "Flag", "id"]], on = ["Flag", "id"], how = "inner")
    df_test["remaining_rul"] = df_test["rul_failed"] - df_test["cycle"]

    return df_train, df_test, df_result


## Call  prepare_dfs（）  / 调用主程序

In [10]:
# Build the three DataFrames, parsing the files with pandas.
df_train, df_test, df_result = prepare_dfs(use_pd=True)

In [13]:
# Show 10 randomly chosen training rows as a sanity check.
df_train.sample(10)

Unnamed: 0,id,cycle,op1,op2,op3,sensor1,sensor2,sensor3,sensor4,sensor5,...,sensor13,sensor14,sensor15,sensor16,sensor17,sensor18,sensor19,sensor20,sensor21,Flag
40904,166,124,10.0023,0.25,100.0,489.05,604.52,1502.61,1304.45,10.52,...,2388.2,8140.13,8.6553,0.03,369,2319,100.0,28.58,17.1559,FD004
6478,24,165,0.002,0.0005,100.0,518.67,641.49,1582.16,1399.97,14.62,...,2387.93,8132.55,8.3799,0.03,390,2388,100.0,38.94,23.2968,FD003
20933,104,69,0.0029,0.0007,100.0,518.67,642.73,1578.86,1401.95,14.62,...,2387.96,8145.07,8.4065,0.03,391,2388,100.0,39.14,23.3784,FD002
10416,39,70,-0.0025,-0.0005,100.0,518.67,641.91,1579.88,1398.64,14.62,...,2387.91,8134.95,8.3981,0.03,390,2388,100.0,39.06,23.3708,FD003
19601,79,136,-0.0006,0.0001,100.0,518.67,643.57,1595.64,1409.36,14.62,...,2388.12,8138.04,8.4491,0.03,396,2388,100.0,38.8,23.1941,FD003
29007,118,327,35.0076,0.8418,100.0,449.44,555.19,1366.26,1130.68,5.48,...,2388.17,8075.49,9.1887,0.02,332,2223,100.0,15.05,8.9119,FD004
16161,82,89,42.0065,0.84,100.0,445.0,549.6,1353.67,1124.35,3.91,...,2387.95,8094.24,9.3551,0.02,332,2212,100.0,10.63,6.2977,FD002
51169,207,214,25.0055,0.62,60.0,462.54,536.59,1264.52,1037.98,7.05,...,2028.32,7877.46,10.8178,0.02,306,1915,84.93,14.32,8.7495,FD004
14646,75,85,20.002,0.7017,100.0,491.19,607.4,1488.67,1258.54,9.35,...,2388.11,8061.79,9.2053,0.02,365,2324,100.0,24.68,14.6908,FD002
44683,180,105,42.0043,0.84,100.0,445.0,548.82,1347.89,1119.74,3.91,...,2388.06,8083.47,9.3099,0.02,329,2212,100.0,10.62,6.4925,FD004


In [14]:
# Show 20 randomly chosen test rows, including the derived columns.
df_test.sample(20)

Unnamed: 0,id,cycle,op1,op2,op3,sensor1,sensor2,sensor3,sensor4,sensor5,...,sensor15,sensor16,sensor17,sensor18,sensor19,sensor20,sensor21,Flag,rul_failed,remaining_rul
79163,87,118,42.0039,0.8415,100.0,445.0,549.1,1336.5,1119.76,3.91,...,9.301,0.02,329,2212,100.0,10.82,6.3943,FD004,261,143
14037,7,167,19.999,0.7004,100.0,491.19,607.76,1489.71,1263.41,9.35,...,9.261,0.03,368,2324,100.0,24.33,14.5092,FD002,190,23
35682,168,51,19.9982,0.7,100.0,491.19,607.86,1480.07,1243.75,9.35,...,9.2173,0.02,366,2324,100.0,24.54,14.7378,FD002,189,138
27931,110,60,42.0043,0.84,100.0,445.0,549.28,1341.81,1115.51,3.91,...,9.3681,0.02,330,2212,100.0,10.65,6.2927,FD002,299,239
1433,13,46,-0.0007,0.0001,100.0,518.67,642.36,1582.3,1399.98,14.62,...,8.4033,0.03,392,2388,100.0,39.02,23.4522,FD001,290,244
100883,222,147,42.0009,0.84,100.0,445.0,549.23,1344.51,1120.67,3.91,...,9.324,0.02,329,2212,100.0,10.74,6.4559,FD004,217,70
68658,29,193,42.0038,0.84,100.0,445.0,548.46,1348.41,1125.06,3.91,...,9.34,0.02,329,2212,100.0,10.66,6.3195,FD004,280,87
42646,225,146,0.002,0.0013,100.0,518.67,643.37,1593.57,1416.66,14.62,...,8.4798,0.03,395,2388,100.0,38.58,23.2513,FD002,177,31
53045,36,4,0.0013,-0.0005,100.0,518.67,642.13,1575.5,1390.12,14.62,...,8.4205,0.03,392,2388,100.0,39.04,23.4412,FD003,227,223
18840,44,74,34.9995,0.84,100.0,449.44,556.03,1359.91,1130.69,5.48,...,9.2906,0.02,332,2223,100.0,14.98,8.9765,FD002,170,96


In [15]:
# Show the first two rows of the merged result table.
df_result.head(2)

Unnamed: 0,rul,Flag,id,cycle,rul_failed
0,112,FD001,1,31,143
1,98,FD001,2,49,147
