**Understand the Data:**

In [39]:
import pandas as pd

# Load the dataset (path is relative to the notebook's working directory)
data = pd.read_csv('data/ServivalRate.csv')

# Display the first few rows to get a sense of the data
print(data.head())

# Summarise the columns, non-null counts and dtypes.
# DataFrame.info() writes its report itself and returns None, so it is called
# on its own; wrapping it in print() would additionally emit a stray "None".
data.info()

# Check the shape of the dataset (number of rows and columns)
print(data.shape)


         age  death     sex  hospdead  slos  d.time            dzgroup  \
0  85.655945      1    male         0    12      63        Lung Cancer   
1  42.258972      1  female         0     8     370       Colon Cancer   
2  43.539978      0  female         0   115    2022  ARF/MOSF w/Sepsis   
3  45.417999      1    male         0     7     827        Lung Cancer   
4  63.662994      1  female         1    14      14  ARF/MOSF w/Sepsis   

    dzclass  num.co   edu  ...      crea  sod        ph  glucose  bun  urine  \
0    Cancer       2  12.0  ...  1.000000  143  7.449219      NaN  NaN    NaN   
1    Cancer       0  11.0  ...  0.799927  139       NaN      NaN  NaN    NaN   
2  ARF/MOSF       1   NaN  ...  0.599976  134  7.399414      NaN  NaN    NaN   
3    Cancer       2   NaN  ...  1.099854  137  7.489258      NaN  NaN    NaN   
4  ARF/MOSF       0  22.0  ...  2.899902  130  7.449219      NaN  NaN    NaN   

  adlp  adls             sfdm2     adlsc  
0  NaN   7.0               NaN 

In [40]:
# Count the null entries in every column and show the per-column totals
missing_values = data.isna().sum()
print(missing_values)


age            0
death          0
sex            0
hospdead       0
slos           0
d.time         0
dzgroup        0
dzclass        0
num.co         0
edu          404
income       698
scoma          0
charges       50
totcst       210
totmcst      744
avtisst       12
race          10
meanbp         0
wblc          48
hrt            0
resp           0
temp           0
pafi         506
alb          756
bili         594
crea           6
sod            0
ph           500
glucose      940
bun          910
urine       1034
adlp        1268
adls         620
sfdm2        318
adlsc          0
dtype: int64


**Data Summaries:**

In [41]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numerical columns
numerical_summary = data.describe()
print(numerical_summary)


               age        death    hospdead         slos       d.time  \
count  2000.000000  2000.000000  2000.00000  2000.000000  2000.000000   
mean     62.473128     0.668000     0.25300    17.855000   475.697000   
std      16.110197     0.471049     0.43484    22.153089   554.660496   
min      18.041992     0.000000     0.00000     3.000000     3.000000   
25%      51.810989     0.000000     0.00000     6.000000    27.000000   
50%      64.896484     1.000000     0.00000    11.000000   256.500000   
75%      74.498215     1.000000     1.00000    20.000000   725.000000   
max     101.847961     1.000000     1.00000   241.000000  2029.000000   

          num.co          edu        scoma        charges         totcst  ...  \
count  2000.0000  1596.000000  2000.000000    1950.000000    1790.000000  ...   
mean      1.8860    11.775689    11.743000   56271.208165   30489.924247  ...   
std       1.3483     3.606133    24.311818   83333.136369   43454.208112  ...   
min       0.0000  

In [42]:

# Columns with more than this fraction of missing values are discarded (15%)
null_threshold = 0.15

# Fraction of null entries per column
null_percentages = data.isnull().mean()

# Names of the columns whose null fraction exceeds the threshold
columns_to_drop = [
    column_name
    for column_name, null_fraction in null_percentages.items()
    if null_fraction > null_threshold
]

# Rebind `data` to the frame without those columns
data = data.drop(columns=columns_to_drop)

# Show the reduced DataFrame
print(data)


            age  death     sex  hospdead  slos  d.time            dzgroup  \
0     85.655945      1    male         0    12      63        Lung Cancer   
1     42.258972      1  female         0     8     370       Colon Cancer   
2     43.539978      0  female         0   115    2022  ARF/MOSF w/Sepsis   
3     45.417999      1    male         0     7     827        Lung Cancer   
4     63.662994      1  female         1    14      14  ARF/MOSF w/Sepsis   
...         ...    ...     ...       ...   ...     ...                ...   
1995  76.727966      1    male         0     4     242        Lung Cancer   
1996  18.414993      0    male         0     9     380  ARF/MOSF w/Sepsis   
1997  74.619995      0    male         0     9     361                CHF   
1998  75.947998      0    male         0    13     359  ARF/MOSF w/Sepsis   
1999  18.041992      0  female         0    12     358       MOSF w/Malig   

                 dzclass  num.co  scoma  ...  avtisst      race  meanbp  \


In [43]:
data.columns

Index(['age', 'death', 'sex', 'hospdead', 'slos', 'd.time', 'dzgroup',
       'dzclass', 'num.co', 'scoma', 'charges', 'totcst', 'avtisst', 'race',
       'meanbp', 'wblc', 'hrt', 'resp', 'temp', 'crea', 'sod', 'adlsc'],
      dtype='object')

In [44]:
len(data.columns)

22

sex - male, female
dzgroup - ARF/MOSF w/Sepsis, CHF, Lung Cancer, MOSF w/Malig,
Coma                 120
Cirrhosis            110
Colon Cancer          98

In [49]:
# Columns stored with the `object` dtype are treated as categorical here
categorical_columns = data.select_dtypes(include=['object']).columns

# Map each categorical column name to the array of its distinct values
unique_values_dict = {name: data[name].unique() for name in categorical_columns}

# Report the distinct values column by column
for column, unique_values in unique_values_dict.items():
    print(f"Unique values for '{column}': {unique_values}")


Unique values for 'sex': ['male' 'female']
Unique values for 'dzgroup': ['Lung Cancer' 'Colon Cancer' 'ARF/MOSF w/Sepsis' 'MOSF w/Malig'
 'Cirrhosis' 'CHF' 'COPD' 'Coma']
Unique values for 'dzclass': ['Cancer' 'ARF/MOSF' 'COPD/CHF/Cirrhosis' 'Coma']
Unique values for 'race': ['black' 'hispanic' 'white' 'other' 'asian' nan]


In [50]:
# Collect the labels of every numeric-dtype column
numerical_columns = data.select_dtypes(include=['number']).columns

# Show which columns were picked up
print("Numerical columns:", numerical_columns)


Numerical columns: Index(['age', 'death', 'hospdead', 'slos', 'd.time', 'num.co', 'scoma',
       'charges', 'totcst', 'avtisst', 'meanbp', 'wblc', 'hrt', 'resp', 'temp',
       'crea', 'sod', 'adlsc'],
      dtype='object')


In [24]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn import preprocessing

In [25]:
import logging
from abc import ABC, abstractmethod
from typing import Union

In [26]:
def handle_data(data: pd.DataFrame) -> pd.DataFrame:
    """
    Preprocess the raw frame and return the cleaned result.

    Steps, in order:
      1. Drop every column whose fraction of null values exceeds 15%.
      2. Fill nulls in "race" with its most frequent value, and nulls in
         "charges", "totcst", "avtisst", "wblc" and "crea" with the column
         median. Columns that are not present are skipped.
      3. Drop the "hospdead" column if it is present.
      4. Drop any row that still contains a null.

    Args:
        data: Frame to preprocess.

    Returns:
        The preprocessed DataFrame.

    Raises:
        Exception: any error raised while preprocessing is logged through
            ``logging.error`` and then re-raised.
    """
    try:
        # Set the threshold for null values (e.g., 15%)
        null_threshold = 0.15

        # Calculate the fraction of null values for each column
        null_percentages = data.isnull().mean()

        # Columns whose null fraction is above the threshold
        columns_to_drop = null_percentages[null_percentages > null_threshold].index.tolist()

        # drop() returns a new frame, so the assignments below act on that result
        data = data.drop(columns=columns_to_drop)

        # Series.mode() returns a Series (there may be ties). Passing it straight
        # to fillna() would align on index labels instead of filling with the
        # modal value, so take the first mode as a scalar.
        if "race" in data.columns:
            race_modes = data["race"].mode()
            if not race_modes.empty:
                data["race"] = data["race"].fillna(race_modes.iloc[0])

        # Assign the filled column back instead of using inplace=True on a
        # column selection, which is not reliable under pandas copy-on-write.
        for column in ("charges", "totcst", "avtisst", "wblc", "crea"):
            if column in data.columns:
                data[column] = data[column].fillna(data[column].median())

        # errors="ignore" keeps the function usable on a frame from which
        # "hospdead" has already been removed.
        data = data.drop(columns=["hospdead"], errors="ignore")

        # Remove any rows that still contain nulls in the remaining columns
        data = data.dropna()

        # le = LabelEncoder()
        # data['sex'] = le.fit_transform(data['sex'])
        # data['dzgroup'] = le.fit_transform(data['dzgroup'])
        # data['dzclass'] = le.fit_transform(data['dzclass'])
        # data['race'] = le.fit_transform(data['race'])
        return data

    except Exception as e:
        logging.error("Error in preprocessing data: {}".format(e))
        raise e


In [27]:
data=handle_data(data)

In [28]:
data.columns

Index(['age', 'death', 'sex', 'slos', 'd.time', 'dzgroup', 'dzclass', 'num.co',
       'scoma', 'charges', 'totcst', 'avtisst', 'race', 'meanbp', 'wblc',
       'hrt', 'resp', 'temp', 'crea', 'sod', 'adlsc'],
      dtype='object')

In [31]:
# Remove the 'death' column from the frame
column_to_drop = 'death'
data = data.drop(column_to_drop, axis=1)

In [32]:
data.columns

Index(['age', 'sex', 'slos', 'd.time', 'dzgroup', 'dzclass', 'num.co', 'scoma',
       'charges', 'totcst', 'avtisst', 'race', 'meanbp', 'wblc', 'hrt', 'resp',
       'temp', 'crea', 'sod', 'adlsc'],
      dtype='object')

In [13]:
# Expected column names, written out by hand in order.
columns_for_df = [
    'age', 'sex', 'slos', 'd.time',
    'dzgroup', 'dzclass', 'num.co', 'scoma',
    'charges', 'totcst', 'avtisst', 'race',
    'meanbp', 'wblc', 'hrt', 'resp',
    'temp', 'crea', 'sod', 'adlsc',
]

# Second hand-written copy of the same names for comparison.
# NOTE: this rebinds `data` (a DataFrame in other cells) to a plain list.
data = [
    'age',
    'sex',
    'slos',
    'd.time',
    'dzgroup',
    'dzclass',
    'num.co',
    'scoma',
    'charges',
    'totcst',
    'avtisst',
    'race',
    'meanbp',
    'wblc',
    'hrt',
    'resp',
    'temp',
    'crea',
    'sod',
    'adlsc',
]

In [14]:
columns_for_df==data

True

In [33]:
# Labels of the columns that still have the `object` dtype
categorical_columns = data.select_dtypes(include='object').columns
print("Categorical Columns:", categorical_columns, sep="\n")


Categorical Columns:
Index([], dtype='object')


In [20]:
# Keep the labels of the columns in which at least one value is null
columns_with_missing_values = data.columns[data.isna().any()]

print("Columns with Missing Values:", columns_with_missing_values, sep="\n")

Columns with Missing Values:
Index(['charges', 'totcst', 'avtisst', 'race', 'wblc', 'crea'], dtype='object')


In [21]:
# Discard the 'hospdead' column
data = data.drop(columns=["hospdead"])

In [22]:
data.columns

Index(['age', 'death', 'sex', 'slos', 'd.time', 'dzgroup', 'dzclass', 'num.co',
       'scoma', 'charges', 'totcst', 'avtisst', 'race', 'meanbp', 'wblc',
       'hrt', 'resp', 'temp', 'crea', 'sod', 'adlsc'],
      dtype='object')

In [23]:
print(data)


            age  death     sex  slos  d.time            dzgroup  \
0     85.655945      1    male    12      63        Lung Cancer   
1     42.258972      1  female     8     370       Colon Cancer   
2     43.539978      0  female   115    2022  ARF/MOSF w/Sepsis   
3     45.417999      1    male     7     827        Lung Cancer   
4     63.662994      1  female    14      14  ARF/MOSF w/Sepsis   
...         ...    ...     ...   ...     ...                ...   
1995  76.727966      1    male     4     242        Lung Cancer   
1996  18.414993      0    male     9     380  ARF/MOSF w/Sepsis   
1997  74.619995      0    male     9     361                CHF   
1998  75.947998      0    male    13     359  ARF/MOSF w/Sepsis   
1999  18.041992      0  female    12     358       MOSF w/Malig   

                 dzclass  num.co  scoma   charges  ...  avtisst      race  \
0                 Cancer       2     26       NaN  ...     8.50     black   
1                 Cancer       0      0  