In [19]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
import seaborn as sb

In [20]:
# Relative locations of the APS failure CSV files (the "processed_8bit"
# variants, per the file names) for the training and test splits.
Train_path = "../datasets/ApsFailure/aps_failure_training_set_processed_8bit.csv"
Test_path = "../datasets/ApsFailure/aps_failure_test_set_processed_8bit.csv"

# Read both splits with pandas' default CSV settings, training split first.
train_df, test_df = (pd.read_csv(csv_path) for csv_path in (Train_path, Test_path))

In [21]:
# (rows, columns) of each split, one per line: training first, then test.
print(train_df.shape, test_df.shape, sep="\n")

(60000, 171)
(16000, 171)


In [22]:
# Preview the first five rows of the training split.
train_df.head(n=5)

Unnamed: 0,class,aa_000,ab_000,ac_000,ad_000,ae_000,af_000,ag_000,ag_001,ag_002,...,ee_002,ee_003,ee_004,ee_005,ee_006,ee_007,ee_008,ee_009,ef_000,eg_000
0,-0.992188,0.117188,-0.289062,0.992188,-0.007812,-0.046875,-0.054688,-0.007812,-0.03125,-0.054688,...,0.6875,0.515625,0.234375,0.070312,0.007812,-0.109375,-0.140625,-0.171875,-0.023438,-0.023438
1,-0.992188,-0.179688,-0.289062,-0.46875,-0.007812,-0.046875,-0.054688,-0.007812,-0.03125,-0.054688,...,-0.023438,-0.0625,-0.132812,-0.132812,-0.1875,-0.148438,-0.085938,-0.140625,-0.023438,-0.023438
2,-0.992188,-0.125,-0.289062,-0.46875,-0.007812,-0.046875,-0.054688,-0.007812,-0.03125,-0.054688,...,-0.140625,-0.09375,-0.015625,0.015625,-0.007812,-0.109375,-0.09375,-0.164062,-0.023438,-0.023438
3,-0.992188,-0.40625,-0.289062,-0.46875,-0.007812,-0.046875,-0.007812,-0.007812,-0.03125,-0.054688,...,-0.382812,-0.382812,-0.375,-0.351562,-0.3125,-0.195312,-0.304688,-0.171875,0.890625,0.992188
4,-0.992188,0.007812,-0.289062,-0.46875,-0.007812,-0.046875,-0.054688,-0.007812,-0.03125,-0.054688,...,0.15625,0.03125,-0.03125,-0.039062,-0.046875,-0.015625,0.65625,-0.148438,-0.023438,-0.023438


In [23]:
# Preview the first five rows of the test split.
test_df.head(n=5)

Unnamed: 0,class,aa_000,ab_000,ac_000,ad_000,ae_000,af_000,ag_000,ag_001,ag_002,...,ee_002,ee_003,ee_004,ee_005,ee_006,ee_007,ee_008,ee_009,ef_000,eg_000
0,-0.992188,-0.40625,-0.289062,-0.46875,-0.007812,-0.046875,-0.054688,-0.007812,-0.03125,-0.054688,...,-0.382812,-0.382812,-0.375,-0.351562,-0.3125,-0.195312,-0.304688,-0.171875,-0.023438,-0.023438
1,-0.992188,-0.40625,-0.289062,-0.46875,-0.007812,-0.046875,-0.054688,-0.007812,-0.03125,-0.054688,...,-0.382812,-0.382812,-0.375,-0.351562,-0.3125,-0.195312,-0.304688,-0.171875,-0.023438,-0.023438
2,-0.992188,0.046875,0.554688,-0.46875,-0.007812,-0.046875,-0.054688,-0.007812,-0.03125,-0.054688,...,0.046875,0.3125,-0.0,-0.109375,0.914062,-0.109375,-0.304688,-0.171875,-0.023438,-0.023438
3,-0.992188,0.0,-0.289062,-0.46875,-0.007812,-0.046875,-0.054688,-0.007812,-0.03125,-0.054688,...,0.085938,0.0625,0.03125,0.085938,0.09375,-0.078125,0.320312,-0.109375,-0.023438,-0.023438
4,-0.992188,-0.390625,-0.289062,-0.46875,-0.007812,-0.046875,-0.054688,-0.007812,-0.03125,-0.054688,...,-0.375,-0.375,-0.359375,-0.304688,-0.304688,-0.195312,-0.304688,-0.171875,-0.023438,-0.023438


In [24]:
# Total number of missing cells in the training split, computed two ways.
# NOTE: DataFrame.isnull() is an alias of DataFrame.isna(), so both totals
# are always equal; the second value is a sanity duplicate, not a new check.
null_total = train_df.isnull().sum().sum()
na_total = train_df.isna().sum().sum()
print(null_total, na_total)

0 0


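The training data has no missing values: both counts are 0. (`isnull()` and `isna()` are aliases in pandas, so the two numbers are always identical. The test set is not checked here.)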

In [25]:
# List the column labels of the training split (pandas abbreviates long indexes).
column_index = train_df.columns
print(column_index)

Index(['class', 'aa_000', 'ab_000', 'ac_000', 'ad_000', 'ae_000', 'af_000',
       'ag_000', 'ag_001', 'ag_002',
       ...
       'ee_002', 'ee_003', 'ee_004', 'ee_005', 'ee_006', 'ee_007', 'ee_008',
       'ee_009', 'ef_000', 'eg_000'],
      dtype='object', length=171)


In [32]:
# One dtype per column, in column order.
list_dtype = list(train_df.dtypes)

# Histogram of dtypes: maps each distinct dtype to the number of columns using it.
{dtype_: list_dtype.count(dtype_) for dtype_ in list_dtype}

{dtype('float64'): 171}

All 171 columns are `float64`, so there are no categorical (string) features and one-hot encoding is not required.

In [12]:
# Frequency of each distinct value in the target column 'class'.
class_counts = train_df['class'].value_counts()
print(class_counts)

-0.992188    59000
 0.992188     1000
Name: class, dtype: int64


This shows a severe class imbalance: 59,000 samples have class -0.992188 and only 1,000 have class 0.992188 (a 59:1 ratio).

In [33]:
# For every column with fewer than `max_distinct` distinct values, print the
# column name followed by the number of rows for each (column value, class)
# pair. The 'class' column itself satisfies the filter, so it is reported too.
max_distinct = 5

for col in train_df.columns:
    distinct_values = train_df[col].unique()
    if len(distinct_values) >= max_distinct:
        # Too many distinct values for a per-value breakdown to be readable.
        continue

    per_value_class_counts = train_df.groupby([col, 'class'])['class'].count()
    print(col)
    print(per_value_class_counts)

class
class      class    
-0.992188  -0.992188    59000
 0.992188   0.992188     1000
Name: class, dtype: int64
ad_000
ad_000     class    
-0.007812  -0.992188    58646
            0.992188      903
-0.000000  -0.992188      352
            0.992188       97
 0.015625  -0.992188        1
 0.992188  -0.992188        1
Name: class, dtype: int64
cf_000
cf_000     class    
-0.007812  -0.992188    58826
            0.992188      901
-0.000000  -0.992188      173
            0.992188       99
 0.992188  -0.992188        1
Name: class, dtype: int64
ch_000
ch_000     class    
-0.015625  -0.992188    58988
            0.992188      998
 0.992188  -0.992188       12
            0.992188        2
Name: class, dtype: int64
co_000
co_000     class    
-0.007812  -0.992188    58401
            0.992188      892
-0.000000  -0.992188      598
            0.992188      107
 0.007812   0.992188        1
 0.992188  -0.992188        1
Name: class, dtype: int64
