In [89]:
# Load the Kedro IPython extension (presumably what injects the `catalog` object used below — confirm)
%load_ext kedro.ipython

The kedro.ipython extension is already loaded. To reload it, use:
  %reload_ext kedro.ipython


In [90]:
import pandas as pd
import numpy as np

In [91]:
# Load the "cancer" dataset through the catalog object.
# NOTE(review): `catalog` is not defined in this notebook; presumably provided by the kedro.ipython extension — confirm
df = catalog.load("cancer")

In [92]:
# Show every column when a DataFrame is displayed (None removes the column limit)
pd.set_option('display.max_columns', None)

In [93]:
# Display the loaded DataFrame
df

Unnamed: 0,mean_radius,mean_texture,mean_perimeter,mean_area,mean_smoothness,diagnosis
0,17.99,10.38,122.80,1001.0,0.11840,0
1,20.57,17.77,132.90,1326.0,0.08474,0
2,19.69,21.25,130.00,1203.0,0.10960,0
3,11.42,20.38,77.58,386.1,0.14250,0
4,20.29,14.34,135.10,1297.0,0.10030,0
...,...,...,...,...,...,...
564,21.56,22.39,142.00,1479.0,0.11100,0
565,20.13,28.25,131.20,1261.0,0.09780,0
566,16.60,28.08,108.30,858.1,0.08455,0
567,20.60,29.33,140.10,1265.0,0.11780,0


See basic features of the dataset

In [94]:
# Column names, non-null counts, dtypes and memory usage
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 569 entries, 0 to 568
Data columns (total 6 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   mean_radius      569 non-null    float64
 1   mean_texture     569 non-null    float64
 2   mean_perimeter   569 non-null    float64
 3   mean_area        569 non-null    float64
 4   mean_smoothness  569 non-null    float64
 5   diagnosis        569 non-null    int64  
dtypes: float64(5), int64(1)
memory usage: 26.8 KB


In [95]:
# Summary statistics (count, mean, std, min, quartiles, max) per column
df.describe()

Unnamed: 0,mean_radius,mean_texture,mean_perimeter,mean_area,mean_smoothness,diagnosis
count,569.0,569.0,569.0,569.0,569.0,569.0
mean,14.127292,19.289649,91.969033,654.889104,0.09636,0.627417
std,3.524049,4.301036,24.298981,351.914129,0.014064,0.483918
min,6.981,9.71,43.79,143.5,0.05263,0.0
25%,11.7,16.17,75.17,420.3,0.08637,0.0
50%,13.37,18.84,86.24,551.1,0.09587,1.0
75%,15.78,21.8,104.1,782.7,0.1053,1.0
max,28.11,39.28,188.5,2501.0,0.1634,1.0


See null values

In [96]:
# Number of null values in each column
df.isnull().sum()


mean_radius        0
mean_texture       0
mean_perimeter     0
mean_area          0
mean_smoothness    0
diagnosis          0
dtype: int64

In [97]:
from copy import deepcopy

In [98]:
# Work on an independent deep copy so the loaded `df` is not modified by the steps below
df_final = deepcopy(df)

1. Create a function that engineers time-based features from the Timestamp column

In [99]:
def time_feature_eng(df: pd.DataFrame) -> pd.DataFrame:
    """
    Derive clock and calendar features from the ``Timestamp`` column.

    The input DataFrame is not modified; the features are added to a new
    DataFrame, so re-running the cell always starts from the same input.

    Args:
        df: pd.DataFrame - the input DataFrame; must contain a ``Timestamp``
            column that ``pd.to_datetime`` can parse

    Returns:
        pd.DataFrame - a new DataFrame in which ``Timestamp`` is converted to
        datetime and the columns ``Hour``, ``Minute``, ``Seconds``, ``Month``,
        ``Day`` and ``Weekday`` (Monday=0 ... Sunday=6) are appended

    Raises:
        KeyError - if ``df`` has no ``Timestamp`` column
    """
    # Parse once and reuse the parsed series for every derived feature.
    timestamps = pd.to_datetime(df['Timestamp'])

    # `assign` returns a new frame, leaving the caller's `df` untouched.
    return df.assign(
        Timestamp=timestamps,
        Hour=timestamps.dt.hour,
        Minute=timestamps.dt.minute,
        Seconds=timestamps.dt.second,
        Month=timestamps.dt.month,
        Day=timestamps.dt.day,
        Weekday=timestamps.dt.weekday,
    )

In [100]:
# Left disabled: `df` has no Timestamp column (see the df.info() output), so this call would raise a KeyError.
# df_final = time_feature_eng(df_final)

2. Create a function that fills all null values

In [101]:
def process_nulls(df: pd.DataFrame) -> pd.DataFrame:
    """
    Replace every null value in the DataFrame with 0.

    Args:
        df: pd.DataFrame - the input DataFrame

    Returns:
        pd.DataFrame - a new DataFrame with nulls replaced by 0 when ``df``
        contains at least one null; otherwise ``df`` itself, unchanged
    """
    # Check the frame that was passed in, not a notebook-level global, so the
    # function gives the right answer for any DataFrame and on a fresh kernel.
    if df.isnull().values.any():
        return df.fillna(0)
    return df

In [102]:
# df.isnull().sum() above reported 0 nulls in every column, so this is currently a no-op
df_final = process_nulls(df_final)

3. Hypothesis test between target and feature variables

In [103]:
from scipy import stats

def anova_test_filtering(df: pd.DataFrame, target_col: str, alpha: float = 0.05) -> pd.DataFrame:
    """
    Keep the numeric features that differ significantly across target classes.

    For every numeric column other than the target, the rows are grouped by
    ``target_col`` and a one-way ANOVA (``scipy.stats.f_oneway``) is run on the
    groups. A feature is kept when its p-value is below ``alpha``.

    Args:
        df: pd.DataFrame - the input DataFrame
        target_col: str - the target column used to form the groups
        alpha: float - significance threshold for the p-value (default 0.05)

    Returns:
        pd.DataFrame - a new DataFrame with the selected features plus
        ``target_col``, in their original column order. Non-numeric feature
        columns are not tested and are dropped.

    Raises:
        KeyError - if ``target_col`` is not a column of ``df``
    """
    # Get the numeric columns of the frame that was passed in (not of a
    # notebook-level global).
    numeric_columns = df.select_dtypes(include=['float', 'int']).columns

    grouped = df.groupby(target_col)
    significant = set()

    for col in numeric_columns:
        # Testing the target against itself is degenerate (every group is
        # constant); the target is kept explicitly below instead.
        if col == target_col:
            continue

        samples = [values.to_numpy() for _, values in grouped[col]]
        _, p_value = stats.f_oneway(*samples)
        # A NaN p-value compares False here, so such a column is dropped.
        if p_value < alpha:
            significant.add(col)

    selected = [col for col in df.columns if col == target_col or col in significant]
    return df[selected]

In [104]:
# Filter the columns with a one-way ANOVA against the `diagnosis` target
df_final = anova_test_filtering(df=df_final, target_col='diagnosis')

4. Label-encode categorical features

In [105]:
from sklearn.preprocessing import LabelEncoder

In [106]:
def encode_categoricals(df: pd.DataFrame, target_col: str) -> pd.DataFrame:
    """
    Label-encode the object-dtype feature columns of the DataFrame.

    Each object-dtype column other than ``target_col`` is replaced by integer
    codes from its own ``LabelEncoder``. The input DataFrame is not modified.

    Args:
        df: pd.DataFrame - the input DataFrame
        target_col: str - the name of the target column, which is never encoded

    Returns:
        pd.DataFrame - a copy of ``df`` with the categorical feature columns
        encoded; an unchanged copy when there is nothing to encode
    """
    # Inspect the frame that was passed in, not a notebook-level global.
    categorical_columns = [
        col
        for col in df.select_dtypes(include=['object']).columns
        if col != target_col
    ]

    encoded = df.copy()
    for col in categorical_columns:
        # LabelEncoder works on a 1-D array of labels, so pass the Series
        # (`encoded[col]`) rather than a one-column DataFrame.
        encoded[col] = LabelEncoder().fit_transform(encoded[col])
    return encoded

In [107]:
# Every column of this dataset is numeric (see the df.info() output), so this call currently changes nothing
df_final = encode_categoricals(df_final, target_col='diagnosis')

5. Standardize features (zero mean, unit variance)

In [108]:
from sklearn.preprocessing import StandardScaler

In [109]:
def scale_columns(df: pd.DataFrame, target_col: str) -> pd.DataFrame:
    """
    Standardize the numeric feature columns of the DataFrame.

    Each float/int column other than ``target_col`` is passed through its own
    ``StandardScaler``. The input DataFrame is not modified.

    Args:
        df: pd.DataFrame - the input DataFrame
        target_col: str - the target column to avoid scaling

    Returns:
        pd.DataFrame - a copy of ``df`` with the numeric feature columns
        scaled; an unchanged copy when there is nothing to scale
    """
    # Inspect the frame that was passed in, not a notebook-level global.
    numerical_columns = [
        col
        for col in df.select_dtypes(include=['float', 'int']).columns
        if col != target_col
    ]

    scaled = df.copy()
    for col in numerical_columns:
        # StandardScaler takes 2-D input and returns an (n, 1) array;
        # flatten it back to one value per row for the column assignment.
        scaled[col] = StandardScaler().fit_transform(scaled[[col]]).ravel()
    return scaled

In [110]:
# Scale every numeric feature column; the `diagnosis` target is left unscaled
df_final = scale_columns(df_final, target_col='diagnosis')