<font size = 6 color = green><b> Predictive Maintenance / 智能性维护实例 </b></font>
# Step One: Load Data / 第一步： 读取数据 

# Libraries / 工具库

In [1]:
import os
import re
import numpy as np 
from pathlib import Path 
import zipfile
import pandas as pd
import warnings

# Suppress every warning from here on; both calls install a blanket
# 'ignore' filter, so no warning text reaches the notebook output.
warnings.filterwarnings('ignore')
warnings.simplefilter('ignore')

## Pre-requisite / 准备工作
* Download / 下载压缩数据 from https://onedrive.live.com/?cid=7CAD6DA55D313592&id=7CAD6DA55D313592%21159&parId=7CAD6DA55D313592%21158&o=OneUp 
* Save CMAPSS zipfile to  C:/pdm/zipraw / 把下载的压缩文件存放在 C:/pdm/zipraw    
  （ or / 或 d:/pdm/zipraw）
* Unzipped data will be stored in the `raw_data` folder / 解压后的数据存放在 `raw_data` 文件夹


## Unzip Utility / 解压
* Use zipfile library to unzip / 用 zipfile 工具包解压

In [12]:
def unzip_files(zip_file_name = None, unzip_to_folder = None, remove_zipped = False):
    """Extract a zip archive and list the contents of the destination folder.

    Args:
        zip_file_name: Path of the archive to extract. Defaults to
            ``zipraw/CMAPSS.zip`` under the parent of the current working
            directory.
        unzip_to_folder: Destination folder (``str`` or ``os.PathLike``); it is
            created when missing. Defaults to ``raw_data`` under the parent of
            the current working directory.
        remove_zipped: When true, delete the archive after a successful
            extraction.

    Returns:
        A list of ``"<unzip_to_folder>/<entry>"`` strings, one per entry that
        ``os.listdir`` reports in the destination folder. Entries that were
        already in the folder before extraction are included as well.
    """
    # Both defaults hang off the parent folder, so resolve it unconditionally.
    # Binding it only when zip_file_name was missing left it undefined for
    # callers that pass an archive path but no destination.
    parent_folder = Path(os.getcwd()).parent.absolute()
    if not zip_file_name:
        zip_file_name = Path(parent_folder, "zipraw/CMAPSS.zip")
    if not unzip_to_folder:
        # NOTE(review): this default lives under the *parent* of cwd, while the
        # reader functions in this file default to cwd + "/raw_data" - confirm
        # which location is intended.
        unzip_to_folder = str(parent_folder) + "/raw_data"

    # Normalise to str so a pathlib.Path destination works with the string
    # concatenation used to build the return value.
    unzip_to_folder = os.fspath(unzip_to_folder)

    os.makedirs(unzip_to_folder, exist_ok=True)

    with zipfile.ZipFile(zip_file_name, 'r') as zip_ref:
        zip_ref.extractall(unzip_to_folder)

    if remove_zipped:
        os.remove(zip_file_name)

    return [unzip_to_folder + "/" + file for file in os.listdir(unzip_to_folder)]


## File System Manipulations / 文件处理
* Use regex / 用Regex

In [42]:
def list_files(raw_file_folder = None):
    """Return ``"<folder>/<entry>"`` for every entry in a folder.

    Args:
        raw_file_folder: Folder to list. A falsy value selects
            ``raw_data`` under the current working directory.

    Returns:
        A list of strings in the order ``os.listdir`` yields the entries.
    """
    folder = raw_file_folder or os.getcwd() + "/raw_data"
    paths = []
    for entry in os.listdir(folder):
        paths.append(folder + "/" + entry)
    return paths

In [78]:
def get_files_regex(raw_file_folder = None, file_name_str = "test"):
    """Select the gz files in a folder whose *file name* contains a keyword.

    Args:
        raw_file_folder: Folder to scan. A falsy value selects ``raw_data``
            under the current working directory.
        file_name_str: Keyword to look for. It is interpolated into a regular
            expression unescaped, so regex syntax in it is honoured.

    Returns:
        A sorted list of ``"<raw_file_folder>/<name>"`` strings for every
        entry whose name contains ``file_name_str`` followed, after at least
        one character, by ``gz``.
    """
    if not raw_file_folder:
        raw_file_folder = os.getcwd() + "/raw_data"

    # Test the bare file name, not the full path: matching the path let a
    # directory such as ".../latest/raw_data" satisfy the "test" keyword for
    # every file in it, mixing train and test data.
    pattern = re.compile(f"{file_name_str}.+gz")

    # os.listdir order is arbitrary; sort so downstream frames are assembled
    # in the same order on every run and platform.
    return sorted(
        raw_file_folder + "/" + name
        for name in os.listdir(raw_file_folder)
        if pattern.search(name)
    )

## Load to dataframe / 读取到 pandas DataFrames

In [340]:
def read_data_files(raw_file_folder = None, file_name_str = "train", use_pd = True, sep = " ", columns = None):
    """Load every matching gz data file into one DataFrame tagged by sub-dataset.

    Args:
        raw_file_folder: Folder holding the gz files; forwarded to
            ``get_files_regex`` (which applies its own default when falsy).
        file_name_str: Keyword selecting the files, e.g. ``"train"`` or
            ``"test"``.
        use_pd: Read with ``pandas.read_csv`` when true, otherwise with
            ``numpy.loadtxt``.
        sep: Field separator handed to ``pandas.read_csv``. Unused on the
            ``numpy.loadtxt`` path.
        columns: Column names. Defaults to ``id``, ``cycle``, ``op1``-``op3``
            and ``sensor1``-``sensor21``. Must contain ``id`` and ``cycle``.

    Returns:
        The row-wise concatenation of all files with an extra ``Flag`` column
        holding the ``FDnnn`` tag of the source file. Each file keeps its own
        row index. An empty DataFrame is returned when no file matches.

    Raises:
        IndexError: If a selected path carries no ``FDnnn`` tag.
    """
    if not columns:
        columns = ["id", "cycle", "op1", "op2", "op3"] + [f"sensor{i}" for i in range(1, 22)]

    raw_data_files = get_files_regex(raw_file_folder = raw_file_folder, file_name_str = file_name_str)

    flag_pattern = re.compile(r"FD\d{3}")
    frames = []
    for f in raw_data_files:
        if use_pd:
            # Honour the caller's separator; it used to be hard-coded to ' '
            # here even though the parameter exists.
            df_ = pd.read_csv(f, compression='gzip', index_col = False, names = columns, sep = sep)
        else:
            df_ = pd.DataFrame(np.loadtxt(f), columns = columns)
        df_[["id", "cycle"]] = df_[["id", "cycle"]].astype(int)

        # Prefer the file name so an "FDnnn"-looking directory cannot mislabel
        # the data; fall back to the whole path for layouts that only tag the
        # directory.
        path_text = str(f)
        tags = flag_pattern.findall(os.path.basename(path_text)) or flag_pattern.findall(path_text)
        if not tags:
            raise IndexError(f"no FDnnn sub-dataset tag found in {path_text!r}")
        df_["Flag"] = tags[0]
        frames.append(df_)

    if not frames:
        return pd.DataFrame()
    # One concat at the end instead of re-copying the running total per file.
    return pd.concat(frames, axis = 0)



In [341]:
def read_result(raw_file_folder = None, file_name_str = "RUL_FD", use_pd = True, sep = " ", columns = None):
    """Load every matching gz result file into one DataFrame tagged by sub-dataset.

    Args:
        raw_file_folder: Folder holding the gz files; forwarded to
            ``get_files_regex`` (which applies its own default when falsy).
        file_name_str: Keyword selecting the result files.
        use_pd: Read with ``pandas.read_csv`` when true, otherwise with
            ``numpy.loadtxt``.
        sep: Field separator handed to ``pandas.read_csv``. Unused on the
            ``numpy.loadtxt`` path.
        columns: Column names; defaults to ``["rul"]``.

    Returns:
        The row-wise concatenation of all files with an extra ``Flag`` column
        holding the ``FDnnn`` tag of the source file. Each file keeps its own
        row index. An empty DataFrame is returned when no file matches.

    Raises:
        IndexError: If a selected path carries no ``FDnnn`` tag.
    """
    raw_data_files = get_files_regex(raw_file_folder = raw_file_folder, file_name_str = file_name_str)
    if not columns:
        columns = ["rul"]

    flag_pattern = re.compile(r"FD\d{3}")
    frames = []
    for f in raw_data_files:
        if use_pd:
            df_ = pd.read_csv(f, compression='gzip', index_col = False, names = columns, sep = sep)
        else:
            df_ = pd.DataFrame(np.loadtxt(f), columns = columns)

        # Prefer the file name so an "FDnnn"-looking directory cannot mislabel
        # the data; fall back to the whole path for layouts that only tag the
        # directory.
        path_text = str(f)
        tags = flag_pattern.findall(os.path.basename(path_text)) or flag_pattern.findall(path_text)
        if not tags:
            raise IndexError(f"no FDnnn sub-dataset tag found in {path_text!r}")
        df_["Flag"] = tags[0]
        frames.append(df_)

    if not frames:
        return pd.DataFrame()
    # One concat at the end instead of re-copying the running total per file.
    return pd.concat(frames, axis = 0)
    

## Sum Up: prepare train, test and result file / 主程序

In [357]:
def prepare_dfs(raw_file_folder = None, use_pd = True, sep = " "):
    """Build the train, test and result DataFrames from the raw gz files.

    Args:
        raw_file_folder: Folder holding the gz files. A falsy value selects
            ``raw_data`` under the current working directory.
        use_pd: Forwarded to the readers; selects ``pandas.read_csv`` over
            ``numpy.loadtxt``.
        sep: Field separator forwarded to all three reader calls.

    Returns:
        A tuple ``(df_train, df_test, df_result)``:

        * ``df_train`` - the training rows as read, plus ``Flag``.
        * ``df_test`` - the test rows plus ``rul_failed`` and
          ``remaining_rul`` (``rul_failed - cycle``).
        * ``df_result`` - one row per ``(Flag, id)`` with ``rul``, the last
          observed ``cycle`` and ``rul_failed`` (``rul + cycle``).
    """
    if not raw_file_folder:
        raw_file_folder = os.getcwd() + "/raw_data"

    columns = ["id", "cycle", "op1", "op2", "op3"] + [f"sensor{i}" for i in range(1, 22)]

    # Pass the caller's separator to every reader; train/test used to get a
    # hard-coded " " while only the result reader received `sep`.
    df_train = read_data_files(raw_file_folder = raw_file_folder,
        file_name_str = "train", use_pd = use_pd, sep = sep, columns = columns)
    df_test = read_data_files(raw_file_folder = raw_file_folder,
        file_name_str = "test", use_pd = use_pd, sep = sep, columns = columns)
    df_result = read_result(raw_file_folder = raw_file_folder, file_name_str = "RUL_FD",
        use_pd = use_pd, sep = sep, columns = ["rul"])

    # The first two columns are "id" and "cycle" (see `columns` above).
    df_train.iloc[:, [0,1]] = df_train.iloc[:, [0,1]].astype(int)
    df_test.iloc[:, [0,1]] = df_test.iloc[:, [0,1]].astype(int)

    # Last observed cycle of every unit in every sub-dataset.
    df_max = df_test.groupby(["Flag","id"])["cycle"].max().reset_index()

    # The result rows carry no unit id. Rank the row index within each Flag to
    # derive one. NOTE(review): this assumes each Flag comes from a single
    # result file whose rows keep their own 0..n-1 index and are listed in
    # unit order - confirm against the data.
    df_result = df_result.reset_index()
    df_result["id"] = df_result.groupby("Flag")["index"].rank("first", ascending = True).astype(int)
    df_result.drop(columns = ["index"], inplace = True)

    df_result = df_result.merge(df_max, on = ["Flag", "id"], how = "inner")

    # Cycle at which the unit fails = last observed cycle + RUL at that point.
    df_result["rul_failed"] = df_result["rul"] + df_result["cycle"]

    df_test = df_test.merge(df_result[["rul_failed", "Flag", "id"]], on = ["Flag", "id"], how = "inner")
    df_test["remaining_rul"] = df_test["rul_failed"] - df_test["cycle"]

    return df_train, df_test, df_result


## Call prepare_dfs() / 调用主程序

In [361]:
# Build the three DataFrames from the default raw_data folder (no folder
# argument is passed), reading the files with pandas.
df_train, df_test, df_result = prepare_dfs(use_pd=True) 