In [1]:
# Libraries to be used 
import pandas as pd
import numpy as np
import seaborn as sns #visualisation
import matplotlib.pyplot as plt #visualisation
%matplotlib inline
sns.set(color_codes=True)



In [2]:
# Import the file into a dataframe 

try:
    aps_failure_set=pd.read_csv(r"\\sedna\jokin.ormazabal$\$Profile\Desktop\Master\Data Prep\aps_failure_set.csv")
    print("APS Failure Set successfully imported into a dataframe")
except:
    print("Something went wrong importing the APS Failure Set")

#As comma is the defaulted delimiter, we dont need to specify it

#try to do it trough a url



APS Failure Set successfully imported into a dataframe


In [3]:
#Based on the explanation received, we know the data divides between the errors relating APS and not, 
#so we divide the main df into 2 different ones.

In [4]:
aps_failures=aps_failure_set.loc[aps_failure_set['class']=='pos']
not_aps_failures=aps_failure_set.loc[aps_failure_set['class']=='neg']

In [5]:
# EXPLANATORY DATA ANALYSIS FOR APS RELATED FAILURES

In [6]:
 # Check for null and missing values

na_values_aps_failures = aps_failures.isna().sum()
na_aps = na_values_aps_failures.to_numpy()

if np.sum(na_aps) == 0:
    print('The APS failures dataset has NO na values')
else:
    print('The APS failures dataset has na values')



The APS failures dataset has NO na values


In [7]:
# We know by looking at the data that it actually has na values but the function is not catching them. 
# This happens because the na is a string and for python the value is actually not null. 
# It is important to always manually review the data and check that the output from the code make sense.

In [8]:
# We will now look for values equal to 'na'

na_values_aps_failures=aps_failures.isin(['na']) #creates a df with True for values=='na' & false for values !='na'

if (na_values_aps_failures.any().any())==True: 
    print('The APS failures dataset has values equal to na')
else:
     print('The APS failures dataset has NO values equal to na')

The APS failures dataset has values equal to na


In [9]:
# As we suspected, we actually have null values. 
# These values will need to be excluded from any statistical analysis and python has ways to do so. 
#The best method will be to convert those na values into propper NaN's so that the code recognises them latter.

In [10]:
aps_failures=aps_failures.replace("na",np.nan) 
aps_failures=aps_failures.replace("",np.nan) # In case there are any blank values, we convert them as NaN as well

na_aps = na_values_aps_failures.to_numpy()

if np.sum(na_aps) == 0:
    print('The APS failures dataset has NO na values')
else:
    print('The APS failures dataset has na values')



The APS failures dataset has na values


In [11]:
# Now that we identified the existance of NaN values we will contextualize them and decide if we can drop them. 

In [12]:
aps_failures.shape 

(1000, 171)

In [13]:
# We know that we have 1000 rows and 171 columns. So now It will interesting to see which columns have a great % of nulls 

In [14]:
null_percentages=(aps_failures.isnull().sum()/1000*100)
null_percentages = aps_failures.isnull().mean() * 100 # we can also do it with the mean attribute

In [68]:
High_null_aps_failures = pd.DataFrame(null_percentages, columns=['High_Null']) #Data frame with the % of null values by column
print (High_null_aps_failures)

        High_Null
class         0.0
aa_000        0.0
ab_000       77.1
ac_000       46.2
ad_000       64.5
...           ...
ee_007        0.5
ee_008        0.5
ee_009        0.5
ef_000       37.7
eg_000       37.7

[171 rows x 1 columns]


In [25]:
High_null_aps_failures.reset_index(drop=False, inplace=True) #I need to add an index as the sensor names where not counting as a propper column

In [None]:
# print (High_null_aps_failures)

In [26]:
High_null_aps_failures=High_null_aps_failures[High_null_aps_failures.iloc[:,1]>20] #Select the sensors with greater than 20% null values

High_null_aps_failures.shape

(60, 2)

In [None]:
# We see that there are 60 sensors with more than 20% of null values. I consider that number too high to analyse them

# I'll drop all those columns to focus only on the most reliable data.

In [27]:
faulty_sensors=High_null_aps_failures.iloc[:,0].tolist() #get the name of the faulty sensors on a list 

In [28]:
aps_failures=aps_failures.drop(faulty_sensors,axis=1)
aps_failures.shape

(1000, 111)

In [None]:
# We droped what I considered faulty sensors by the high null values, but still we have nulls on the other sensors.
# The existance of this nulls makes the sample incosistant as we have more records for some sensors. 
# We will drop the rows with null values to make the data set equal for every sensor type.


In [33]:
aps_failures = aps_failures.dropna()
aps_failures.shape

(861, 110)

In [30]:
#We have cleaned our dataset from nulls and "faulty" data, but still we have a high number of different sensors to analyse

#To make a more in depth analysis, I will select the top 5 sensors that their failures are related to an APS failure.

In [34]:
aps_failures.set_index(aps_failures.columns[0],inplace=True) # To calculate the mean I need to exclude the first column, so I'll make it the index

aps_failures=aps_failures.astype(float) # Convert all the df to float type. It got converted to int with so many cleaning

In [54]:
Failures_mean=aps_failures.mean()
Top_aps_failures=pd.DataFrame(Failures_mean,columns=['Mean'])

In [55]:
greatest_values = Top_aps_failures['Mean'].nlargest(20) #Select the top 20 largest
print(greatest_values)

bb_000    5.313761e+07
bv_000    5.313761e+07
cq_000    5.313761e+07
bu_000    5.313761e+07
bx_000    4.548410e+07
cc_000    4.274224e+07
ci_000    4.026532e+07
an_000    3.721372e+07
ao_000    3.218488e+07
az_005    2.435005e+07
cs_005    2.206840e+07
ah_000    1.992681e+07
bg_000    1.992681e+07
ba_000    1.438220e+07
ap_000    1.418936e+07
cn_004    1.394432e+07
ay_008    1.384981e+07
ay_007    1.372521e+07
az_004    1.336726e+07
ag_006    1.331767e+07
Name: Mean, dtype: float64


In [64]:
df_melted = aps_failures.melt(var_name='Test', value_name='Number of Errors')


In [69]:
print(df_melted)

         Test  Number of Errors
0      ag_002             222.0
1      ag_002          184552.0
2      ag_002               0.0
3      ag_002          921256.0
4      ag_002              30.0
...       ...               ...
92122  ee_009           10790.0
92123  ee_009            2458.0
92124  ee_009               0.0
92125  ee_009               0.0
92126  ee_009               0.0

[92127 rows x 2 columns]


In [None]:
greatest_values = df_melted['Number of Errors'].nlargest(20)

max_value = greatest_values.max() #Get the maximum number of errors
print(max_value)

min_value = greatest_values.min() #Get the minimum number of errors

print(min_value)

In [102]:
df_melted.Test.value_counts().nlargest(20).plot(kind='bar', figsize=(20,5))
plt.title("Number of errors by sensor type")
plt.ylabel("Number of errors")
plt.xlabel("Sensor type")

plt.yticks(range(152457314, 192871534, 1000))

plt.show()

KeyboardInterrupt: 