# APS Failure Prediction


The task is to device a prediction model for judging whether or not a vehicle faces imminent failure of APS system .

The dataset consists of data collected from heavy Scania trucks in everyday usage. The Air Pressure System (APS) is a type of function used in heavy vehicles to assist braking and gear changing. The APS failure dataset consists of the daily operational sensor data from failed Scania trucks. The dataset is crucial to the manufacturer as it allows to isolate components which caused the failure.

In [31]:
# import pandas and numpy lib packages for data manipulation
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns


# import Sci-kit machine learning packages
# for data splitting and scaling
from sklearn.model_selection import train_test_split

# for model evaluation
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import roc_curve
from sklearn.metrics import roc_auc_score
from sklearn import metrics

# machine learning methods
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC

In [32]:
aps_train_df=pd.read_csv('aps_failure_training_set.csv',header='infer',skiprows=20)
aps_test_df=pd.read_csv('aps_failure_test_set.csv',header='infer',skiprows=20)

In [33]:
aps_train_df.shape,aps_test_df.shape

((60000, 171), (16000, 171))

In [34]:
data=pd.concat([aps_train_df,aps_test_df])
data.head()

In [35]:
data.shape

(76000, 171)

In [36]:
data.replace('na',np.nan,inplace=True)

In [37]:
# replace 'neg' with 0 and 'pos' with 1 in 'Class column'
data['class'].replace( {'neg': 0, 'pos': 1}, inplace=True)
data.head()

Unnamed: 0,class,aa_000,ab_000,ac_000,ad_000,ae_000,af_000,ag_000,ag_001,ag_002,...,ee_002,ee_003,ee_004,ee_005,ee_006,ee_007,ee_008,ee_009,ef_000,eg_000
0,0,76698,,2130706438,280.0,0,0,0,0,0,...,1240520,493384,721044,469792,339156,157956,73224,0,0,0
1,0,33058,,0,,0,0,0,0,0,...,421400,178064,293306,245416,133654,81140,97576,1500,0,0
2,0,41040,,228,100.0,0,0,0,0,0,...,277378,159812,423992,409564,320746,158022,95128,514,0,0
3,0,12,0.0,70,66.0,0,10,0,0,0,...,240,46,58,44,10,0,0,0,4,32
4,0,60874,,1368,458.0,0,0,0,0,0,...,622012,229790,405298,347188,286954,311560,433954,1218,0,0


In [38]:
# Calculating the percentage of values missing in each column of our dataset
percent_missing= (data.isnull().sum() * 100) / len(data)
idx= np.argsort(-percent_missing)     #sorting the percent values in descending order
columns= data.columns
sorted(percent_missing)[:158:-1]

[82.09605263157894,
 81.18815789473685,
 79.55394736842105,
 77.24868421052632,
 77.22631578947369,
 77.22631578947369,
 73.31842105263158,
 65.91447368421052,
 45.39868421052632,
 38.32631578947368,
 24.792105263157893,
 24.792105263157893]

In [39]:
col = np.take(columns, idx[0:8])
print(col)

Index(['br_000', 'bq_000', 'bp_000', 'bo_000', 'cr_000', 'ab_000', 'bn_000',
       'bm_000'],
      dtype='object')


In [40]:
data = data.drop(col, axis=1)           # dropping the columns which have missing percent more than 50%

In [41]:
data.head()

Unnamed: 0,class,aa_000,ac_000,ad_000,ae_000,af_000,ag_000,ag_001,ag_002,ag_003,...,ee_002,ee_003,ee_004,ee_005,ee_006,ee_007,ee_008,ee_009,ef_000,eg_000
0,0,76698,2130706438,280.0,0,0,0,0,0,0,...,1240520,493384,721044,469792,339156,157956,73224,0,0,0
1,0,33058,0,,0,0,0,0,0,0,...,421400,178064,293306,245416,133654,81140,97576,1500,0,0
2,0,41040,228,100.0,0,0,0,0,0,0,...,277378,159812,423992,409564,320746,158022,95128,514,0,0
3,0,12,70,66.0,0,10,0,0,0,318,...,240,46,58,44,10,0,0,0,4,32
4,0,60874,1368,458.0,0,0,0,0,0,0,...,622012,229790,405298,347188,286954,311560,433954,1218,0,0


In [42]:
columns=data.columns

In [43]:
data = data.fillna(0)

In [44]:
data.head()

Unnamed: 0,class,aa_000,ac_000,ad_000,ae_000,af_000,ag_000,ag_001,ag_002,ag_003,...,ee_002,ee_003,ee_004,ee_005,ee_006,ee_007,ee_008,ee_009,ef_000,eg_000
0,0,76698,2130706438,280,0,0,0,0,0,0,...,1240520,493384,721044,469792,339156,157956,73224,0,0,0
1,0,33058,0,0,0,0,0,0,0,0,...,421400,178064,293306,245416,133654,81140,97576,1500,0,0
2,0,41040,228,100,0,0,0,0,0,0,...,277378,159812,423992,409564,320746,158022,95128,514,0,0
3,0,12,70,66,0,10,0,0,0,318,...,240,46,58,44,10,0,0,0,4,32
4,0,60874,1368,458,0,0,0,0,0,0,...,622012,229790,405298,347188,286954,311560,433954,1218,0,0


In [45]:
from sklearn.impute import SimpleImputer
imputer = SimpleImputer(missing_values=np.nan, strategy='mean') #using mean imputations

In [46]:
imputer = imputer.fit(data)                                   #fitting the imputer on the train data
data= imputer.transform(data)                        #transforming on the train data and test data

In [47]:
df = pd.DataFrame(data, columns=columns)     #preparing new dataframe with imputations and undropped attributes

In [48]:
df.head()

Unnamed: 0,class,aa_000,ac_000,ad_000,ae_000,af_000,ag_000,ag_001,ag_002,ag_003,...,ee_002,ee_003,ee_004,ee_005,ee_006,ee_007,ee_008,ee_009,ef_000,eg_000
0,0.0,76698.0,2130706000.0,280.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1240520.0,493384.0,721044.0,469792.0,339156.0,157956.0,73224.0,0.0,0.0,0.0
1,0.0,33058.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,421400.0,178064.0,293306.0,245416.0,133654.0,81140.0,97576.0,1500.0,0.0,0.0
2,0.0,41040.0,228.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,...,277378.0,159812.0,423992.0,409564.0,320746.0,158022.0,95128.0,514.0,0.0,0.0
3,0.0,12.0,70.0,66.0,0.0,10.0,0.0,0.0,0.0,318.0,...,240.0,46.0,58.0,44.0,10.0,0.0,0.0,0.0,4.0,32.0
4,0.0,60874.0,1368.0,458.0,0.0,0.0,0.0,0.0,0.0,0.0,...,622012.0,229790.0,405298.0,347188.0,286954.0,311560.0,433954.0,1218.0,0.0,0.0


In [49]:
df['class'].unique()

array([0., 1.])

In [50]:
df["class"].value_counts()

0.0    74625
1.0     1375
Name: class, dtype: int64

In [51]:
X=df.drop('class',axis=1)
y=df['class']

splitting the data into training and testing sets

In [52]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.70, random_state=42)

# 

In [53]:
from sklearn.naive_bayes import GaussianNB 
model = GaussianNB()

In [54]:
model.fit(X_train,y_train)

GaussianNB()

In [55]:
y_pred = model.predict(X_test)

In [56]:
from sklearn.metrics import classification_report #defining the final classification report 

print(classification_report(y_pred,y_test))

              precision    recall  f1-score   support

         0.0       0.97      1.00      0.98     50787
         1.0       0.89      0.36      0.51      2413

    accuracy                           0.97     53200
   macro avg       0.93      0.68      0.75     53200
weighted avg       0.97      0.97      0.96     53200



In [57]:
from sklearn.svm import SVC
model =SVC()

In [58]:
model.fit(X_train,y_train)

SVC()

In [59]:
y_pred = model.predict(X_test)

In [60]:
print(classification_report(y_pred,y_test))

              precision    recall  f1-score   support

         0.0       1.00      0.98      0.99     53187
         1.0       0.01      0.69      0.02        13

    accuracy                           0.98     53200
   macro avg       0.50      0.84      0.50     53200
weighted avg       1.00      0.98      0.99     53200

