In [1]:
import pandas as pd
import numpy as np

# Load the training labels (the first spreadsheet row is skipped) and give the
# three leading, unnamed columns readable names.
labels_df = (
    pd.read_excel("ME_F321_dataset/train/labels.xlsx", skiprows=1)
    .rename(columns={'Unnamed: 0': 'Case#', 'Unnamed: 1': 'Spacecraft#', 'Unnamed: 2': "Condition"})
)

labels_df.head(5)

Unnamed: 0,Case#,Spacecraft#,Condition,SV1,SV2,SV3,SV4,BP1,BP2,BP3,BP4,BP5,BP6,BP7,BV1
0,1,1,Normal,100,100,100,100,No,No,No,No,No,No,No,No
1,2,1,Normal,100,100,100,100,No,No,No,No,No,No,No,No
2,3,1,Normal,100,100,100,100,No,No,No,No,No,No,No,No
3,4,1,Normal,100,100,100,100,No,No,No,No,No,No,No,No
4,5,1,Normal,100,100,100,100,No,No,No,No,No,No,No,No


In [None]:
# We use FFT and time segmentation.
# Time segmentation is done by dividing the 1.2 s window into 3 parts,
# essentially extracting the local pressure maximum (signifying water hammer pressure) in each
# and the corresponding standard deviation, which takes into account the pressure fluctuation within that part.
# This is because we know that the valve opens and closes thrice in the 1.2 s time window.

def get_features(case_file):
    """Summarise one case's pressure recordings into a flat feature dict.

    Parameters
    ----------
    case_file : pandas.DataFrame
        Contents of a single case file (e.g. Case001.csv). Must provide the
        columns ``P1`` ... ``P7``. The cycle boundaries used below are fixed
        at samples 0-400, 400-800 and 800-1200, i.e. they assume 1200 samples
        per sensor (presumably 1.2 s sampled at 1 kHz -- TODO confirm).

    Returns
    -------
    dict
        15 entries per sensor ``P{i}`` (105 in total), inserted in this order:
        ``mean, std, min, max, skew, kurtosis``, then ``{cycle}_max`` and
        ``{cycle}_std`` for ``cycle1``..``cycle3``, then ``fft_max``,
        ``fft_low`` and ``fft_high``.
    """
    # Sample ranges of the three cycles; loop-invariant, so built only once.
    segments = (('cycle1', 0, 400), ('cycle2', 400, 800), ('cycle3', 800, 1200))

    features = {}
    for i in range(1, 8):
        p_sensor = case_file[f"P{i}"]

        # Whole-signal summary statistics.
        features[f'P{i}_mean'] = p_sensor.mean()
        features[f'P{i}_std'] = p_sensor.std()
        features[f'P{i}_min'] = p_sensor.min()
        features[f'P{i}_max'] = p_sensor.max()
        features[f'P{i}_skew'] = p_sensor.skew()
        features[f'P{i}_kurtosis'] = p_sensor.kurtosis()

        # Time segmentation: peak and spread inside each cycle.
        for seg_name, start, end in segments:
            seg = p_sensor.iloc[start:end]
            features[f'P{i}_{seg_name}_max'] = seg.max()
            features[f'P{i}_{seg_name}_std'] = seg.std()

        # FFT features. The spectrum of a real signal is mirrored, so only the
        # first half is kept. The length is derived from the signal instead of
        # a hard-coded 600: for 1200 samples this is still 600 bins, while a
        # shorter signal no longer has its mirrored bins counted twice.
        n_half = (len(p_sensor) + 1) // 2
        fft_vals = np.abs(np.fft.fft(p_sensor))[:n_half]
        # Bin 0 is the sum of all samples (the DC term) and is included here.
        features[f'P{i}_fft_max'] = np.max(fft_vals)
        # The split is by bin index, not by Hz.
        # NOTE(review): if the window is 1.2 s long, each bin is 1/1.2 ~ 0.83 Hz
        # wide, so bin 50 sits near 41.7 Hz rather than 50 Hz -- confirm the
        # intended cut-off.
        features[f'P{i}_fft_low'] = np.sum(fft_vals[0:50])   # bins 0-49
        features[f'P{i}_fft_high'] = np.sum(fft_vals[50:])   # bins 50 and above
    return features

In [21]:
# Loop through the training CSV files, passing each one to get_features to summarise it.

case_features = []

for case_no in labels_df["Case#"]:
    # File names carry the case number zero-padded to three digits
    # (e.g. 7 -> Case007.csv, 42 -> Case042.csv, 123 -> Case123.csv).
    case_str = f"{int(case_no):03d}"

    case_file_name = "Case" + case_str + ".csv"
    case_file_path = f"ME_F321_dataset/train/data/{case_file_name}"
    case_file = pd.read_csv(case_file_path)

    # Put the case identifier first so it becomes the leading column of the
    # feature table and can serve as the merge key with the labels.
    case_features_dict = {"Case#": case_no, **get_features(case_file)}
    case_features.append(case_features_dict)

In [22]:
# Tabulate the collected records: one row per case, one column per feature.
features_df = pd.DataFrame(data=case_features)
features_df.head(5)

Unnamed: 0,Case#,P1_mean,P1_std,P1_min,P1_max,P1_skew,P1_kurtosis,P1_cycle1_max,P1_cycle1_std,P1_cycle2_max,...,P7_kurtosis,P7_cycle1_max,P7_cycle1_std,P7_cycle2_max,P7_cycle2_std,P7_cycle3_max,P7_cycle3_std,P7_fft_max,P7_fft_low,P7_fft_high
0,1,1.984867,0.308519,0.436248,4.228663,1.639029,18.385586,4.226256,0.308483,4.228431,...,7.043883,5.013518,0.580924,5.016796,0.57134,5.017115,0.571395,2368.061353,3304.42457,3826.862829
1,2,1.984931,0.308624,0.436435,4.256737,1.669368,18.490675,4.254314,0.308591,4.256499,...,6.562989,4.99483,0.58199,4.999298,0.580181,4.99933,0.57811,2368.407235,3303.790732,3844.101502
2,3,1.984999,0.308539,0.436793,4.295627,1.682806,18.656679,4.29319,0.308512,4.295373,...,6.151939,4.997008,0.59936,5.000936,0.593205,5.000849,0.593719,2370.797915,3314.309668,3952.136443
3,4,1.984965,0.308392,0.43705,4.295622,1.661384,18.59538,4.293183,0.308366,4.295375,...,7.253035,5.003661,0.573397,5.007759,0.5647,5.008294,0.564926,2367.754936,3307.698526,3786.613952
4,5,1.984987,0.308432,0.437011,4.281213,1.648009,18.44861,4.278777,0.308409,4.280965,...,7.118807,4.993891,0.576191,4.997836,0.567114,4.998044,0.567083,2368.789805,3308.627834,3815.410489


In [25]:
# Join the engineered features with the labels through their shared 'Case#' key.

training_df = features_df.merge(labels_df, on='Case#')

training_df

Unnamed: 0,Case#,P1_mean,P1_std,P1_min,P1_max,P1_skew,P1_kurtosis,P1_cycle1_max,P1_cycle1_std,P1_cycle2_max,...,SV3,SV4,BP1,BP2,BP3,BP4,BP5,BP6,BP7,BV1
0,1,1.984867,0.308519,0.436248,4.228663,1.639029,18.385586,4.226256,0.308483,4.228431,...,100,100,No,No,No,No,No,No,No,No
1,2,1.984931,0.308624,0.436435,4.256737,1.669368,18.490675,4.254314,0.308591,4.256499,...,100,100,No,No,No,No,No,No,No,No
2,3,1.984999,0.308539,0.436793,4.295627,1.682806,18.656679,4.293190,0.308512,4.295373,...,100,100,No,No,No,No,No,No,No,No
3,4,1.984965,0.308392,0.437050,4.295622,1.661384,18.595380,4.293183,0.308366,4.295375,...,100,100,No,No,No,No,No,No,No,No
4,5,1.984987,0.308432,0.437011,4.281213,1.648009,18.448610,4.278777,0.308409,4.280965,...,100,100,No,No,No,No,No,No,No,No
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
172,173,1.984983,0.278327,0.543228,4.016765,1.869240,15.538411,4.014407,0.278784,4.016529,...,100,100,No,No,No,Yes,No,No,No,No
173,174,1.984349,0.278177,0.820108,4.063794,2.193004,18.755654,4.061469,0.278336,4.063541,...,100,100,No,No,No,No,Yes,No,No,No
174,175,1.984209,0.274191,0.522027,4.182883,1.751404,21.326472,4.180296,0.274008,4.182612,...,100,100,No,No,No,No,No,Yes,No,No
175,176,1.985246,0.271765,0.579673,4.103252,1.993451,21.082582,4.100687,0.271008,4.102971,...,100,100,No,No,No,No,No,No,Yes,No


In [24]:
# Check for missing values: count the NaN/None entries in every column.
# (`training_df == np.nan` cannot be used for this: NaN never compares equal
# to anything, itself included, so that comparison is False everywhere and
# always reports 0.)

training_df.isna().sum()

# A count of 0 in every column means the dataset has no missing values.



Case#      0
P1_mean    0
P1_std     0
P1_min     0
P1_max     0
          ..
BP4        0
BP5        0
BP6        0
BP7        0
BV1        0
Length: 120, dtype: int64

In [29]:
# Load the test data: one feature record per test case file.

case_features = []

for case_no in range(178, 224):
    # Test case numbers already have three digits, so no padding is applied.
    case_str = str(int(case_no))
    case_file_name = "Case" + case_str + ".csv"
    case_file_path = f"ME_F321_dataset/test/data/{case_file_name}"
    case_file = pd.read_csv(case_file_path)

    # Start from the case identifier so it stays the first key, then add
    # the engineered features for this file.
    name_dict = {"Case#": case_no}
    case_features_dict = dict(name_dict)
    case_features_dict.update(get_features(case_file))
    case_features.append(case_features_dict)

In [35]:
# Read the spacecraft labels of the test cases (the first spreadsheet row is skipped).
test_labels_df = pd.read_excel(io="ME_F321_dataset/test/labels_spacecraft.xlsx", skiprows=1)
test_labels_df.head(5)

Unnamed: 0,Case#,Spacecraft#
0,178,1
1,179,1
2,180,1
3,181,1
4,182,1


In [39]:

# Tabulate the test feature records and attach the spacecraft labels on 'Case#'.
test_features_df = pd.DataFrame(data=case_features)
test_df = test_features_df.merge(test_labels_df, on='Case#')

test_df

Unnamed: 0,Case#,P1_mean,P1_std,P1_min,P1_max,P1_skew,P1_kurtosis,P1_cycle1_max,P1_cycle1_std,P1_cycle2_max,...,P7_cycle1_max,P7_cycle1_std,P7_cycle2_max,P7_cycle2_std,P7_cycle3_max,P7_cycle3_std,P7_fft_max,P7_fft_low,P7_fft_high,Spacecraft#
0,178,1.984422,0.300675,0.561754,4.411926,1.955759,19.904115,4.409431,0.300305,4.411653,...,4.957685,0.562416,4.95983,0.544475,4.960276,0.544206,2368.250239,3343.796289,3826.472739,1
1,179,1.984932,0.305923,0.453799,4.103312,1.546036,17.500193,4.100935,0.305777,4.103057,...,5.082534,0.582644,5.085373,0.570766,5.085864,0.570734,2368.53429,3301.727662,3763.237779,1
2,180,1.984972,0.308424,0.436712,4.26276,1.6612,18.461282,4.260331,0.308394,4.262515,...,4.997548,0.573276,5.001972,0.584786,5.000872,0.575337,2361.863225,3330.577159,4100.950659,1
3,181,1.985002,0.308406,0.442751,4.279393,1.703869,18.633265,4.276943,0.308373,4.279134,...,4.990073,0.573094,4.993812,0.58229,4.994317,0.583452,2368.896272,3307.432559,3749.581176,1
4,182,1.984954,0.308387,0.43706,4.270896,1.645871,18.41779,4.268466,0.308359,4.270658,...,4.995626,0.583251,4.998082,0.570672,4.998432,0.568732,2368.837234,3304.822395,3765.596832,1
5,183,1.98489,0.308375,0.436635,4.238379,1.621332,18.314742,4.235972,0.308345,4.238146,...,5.013583,0.577871,5.016722,0.56641,5.017334,0.566291,2367.32172,3305.668812,3791.540989,1
6,184,2.976179,0.325922,1.411117,5.319466,1.017982,16.73355,5.315434,0.325189,5.319051,...,5.721555,0.568949,5.726516,0.57419,5.727131,0.574302,3559.135926,5076.851847,2199.719358,1
7,185,1.984882,0.308438,0.436636,4.233074,1.626207,18.297275,4.230665,0.308406,4.232831,...,5.008386,0.581632,5.010837,0.569793,5.011322,0.56944,2368.818641,3303.993883,3870.942233,1
8,186,1.984192,0.28148,0.525377,4.156134,1.828446,20.282872,4.154228,0.281304,4.155922,...,5.656658,0.600185,5.666543,0.610994,5.667324,0.610785,2365.340871,3383.569228,4868.995948,1
9,187,1.984911,0.308471,0.436629,4.267636,1.655919,18.536173,4.265194,0.30844,4.267377,...,5.010933,0.572459,5.01512,0.563769,5.015618,0.564056,2366.56325,3304.94461,3768.839666,1
