#### Import some important libraries

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

#### Import my dataset and look at it

In [3]:
file_path = 'data/aps_failure_set.csv'
data_data_df = pd.read_csv(file_path)
data_data_df.head()


Unnamed: 0,class,aa_000,ab_000,ac_000,ad_000,ae_000,af_000,ag_000,ag_001,ag_002,...,ee_002,ee_003,ee_004,ee_005,ee_006,ee_007,ee_008,ee_009,ef_000,eg_000
0,neg,76698,na,2130706438,280,0,0,0,0,0,...,1240520,493384,721044,469792,339156,157956,73224,0,0,0
1,neg,33058,na,0,na,0,0,0,0,0,...,421400,178064,293306,245416,133654,81140,97576,1500,0,0
2,neg,41040,na,228,100,0,0,0,0,0,...,277378,159812,423992,409564,320746,158022,95128,514,0,0
3,neg,12,0,70,66,0,10,0,0,0,...,240,46,58,44,10,0,0,0,4,32
4,neg,60874,na,1368,458,0,0,0,0,0,...,622012,229790,405298,347188,286954,311560,433954,1218,0,0


#### Check for missing values

In [24]:
corrupt_values = ["NA", "N/A", "-", "NaN", "missing", "na", "?"]
data_data_df = pd.read_csv('data/aps_failure_set.csv', na_values=corrupt_values)
missing_values = data_data_data_data_df.isnull().sum()
print(missing_values)

class         0
aa_000        0
ab_000    46329
ac_000     3335
ad_000    14861
          ...  
ee_007      671
ee_008      671
ee_009      671
ef_000     2724
eg_000     2723
Length: 171, dtype: int64


#### Import library useful in order to decide how to clean 

In [18]:
from ydata_profiling import ProfileReport

#### Generate  "my_report.html" useful to decide how to clean 

In [28]:
slice_data_df = data_data_df.iloc[:, :10]
report = ProfileReport(data_data_df, title='My Data', minimal=True)
report.to_file("my_report.html")

Summarize dataset: 100%|██████████| 176/176 [00:06<00:00, 28.05it/s, Completed]               
Generate report structure: 100%|██████████| 1/1 [02:14<00:00, 134.51s/it]
Render HTML: 100%|██████████| 1/1 [00:08<00:00,  8.78s/it]
Export report to file: 100%|██████████| 1/1 [00:00<00:00, 20.72it/s]


#### Remove Constant Columns

In [26]:
if data_data_df['cd_000'].nunique() == 1:
    data_data_df.drop(columns=['cd_000'], inplace=True)

#### Handle Missing Values
###### I apply the same data preprocessing steps to multiple features in my dataset that share similar characteristics of cn_002 (e.g., a mix of missing values 1.1% and zero values 50.9%), I create a function to automate the process

In [30]:
def preprocess_feature(data_df, feature_name):
    # Impute missing values with the median of non-zero values
    median_nonzero_values = data_df[feature_name][data_df[feature_name] != 0].median()
    data_df[feature_name].fillna(median_nonzero_values, inplace=True)

    # Create a binary indicator for missing values
    data_df[f'{feature_name}_missing'] = data_df[feature_name].isnull().astype(int)

    # Create a binary indicator for zero values
    data_df[f'{feature_name}_zero'] = (data_df[feature_name] == 0).astype(int)

    # Feature engineering: you can create additional features or aggregations
    # For example, you can calculate the square root of the feature
    data_df[f'{feature_name}_sqrt'] = np.sqrt(data_df[feature_name])

    # Drop the original feature column if it's no longer needed
    data_df.drop(columns=[feature_name], inplace=True)

# Assuming you've already loaded your dataset into a DataFrame named 'data_data_df'
# and you have a list of feature names to process, e.g., feature_list
for feature_name in feature_list:
    preprocess_feature(data_data_df, feature_name)

# Now, the specified preprocessing steps have been applied to all features in 'feature_list'


NameError: name 'feature_list' is not defined

## Address Skewed Data
###### For highly skewed numeric features, consider applying data transformations to make the distribution more normal. Common transformations include log transformations or box-cox transformations. Another approach is to use robust models that are less sensitive to skewed data, like tree-based algorithms