<a href="https://colab.research.google.com/github/saithejagonavaram/data-prep-project/blob/main/project.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [152]:
#Import necessary libraries
import numpy as np
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.stats as stats
import missingno as msno
from sklearn.ensemble import RandomForestClassifier, HistGradientBoostingClassifier
from sklearn.metrics import accuracy_score
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.preprocessing import StandardScaler, RobustScaler, MinMaxScaler, MaxAbsScaler, PowerTransformer, PolynomialFeatures
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import RFE
from sklearn.model_selection import train_test_split, cross_validate
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.neighbors import LocalOutlierFactor

from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier


# Read the cirrhosis CSV, discard the ID column and rescale Age by 1/365
# (rounded to one decimal) in a single chained expression.
# NOTE(review): Age is presumably recorded in days, making the result years — confirm.
df_original = (
    pd.read_csv('/content/drive/MyDrive/data_set/cirrhosis.csv')
    .drop(columns=["ID"])
    .assign(Age=lambda frame: (frame['Age'].values / 365).round(1))
)

# Number of missing values per column.
print(df_original.isnull().sum())

# Summary (count / unique / top / freq) of the object-typed columns.
print(df_original.describe(include="object"))

N_Days             0
Status             0
Drug             106
Age                0
Sex                0
Ascites          106
Hepatomegaly     106
Spiders          106
Edema              0
Bilirubin          0
Cholesterol      134
Albumin            0
Copper           108
Alk_Phos         106
SGOT             106
Tryglicerides    136
Platelets         11
Prothrombin        2
Stage              6
dtype: int64
       Status             Drug  Sex Ascites Hepatomegaly Spiders Edema
count     418              312  418     312          312     312   418
unique      3                2    2       2            2       2     3
top         C  D-penicillamine    F       N            Y       N     N
freq      232              158  374     288          160     222   354


Missing Values

In [153]:
def quick_missing_imp(data, num_method="median", cat_length=20, target="Stage"):
    """
    Impute missing predictor values and print missing counts before and after.

    Object-dtype columns with at most ``cat_length`` distinct non-null values
    are filled with their mode. Every other (non-object) column is filled with
    its mean or median, depending on ``num_method``. The ``target`` column is
    never imputed, so its missing values are still present in the result.
    The input dataframe is not modified; a new dataframe is returned.

    Parameters
    ----------
    data: dataframe
            Dataframe whose missing values are to be imputed.
    num_method: str, optional
            "mean" or "median"; statistic used for non-object columns.
    cat_length: int, optional
            Maximum number of distinct non-null values an object column may
            have to be mode-imputed.
    target: str, optional
            Name of the column that must be left untouched.

    Returns
    -------
    dataframe
            Copy of ``data`` with the eligible columns imputed.

    Raises
    ------
    ValueError
            If ``num_method`` is neither "mean" nor "median".
    KeyError
            If ``target`` is not a column of ``data``.
    """
    # Fail loudly instead of silently skipping numeric imputation while the
    # report below claims it was applied.
    if num_method not in ("mean", "median"):
        raise ValueError(f"num_method must be 'mean' or 'median', got {num_method!r}")
    if target not in data.columns:
        raise KeyError(target)

    # Columns that contain at least one missing value
    variables_with_na = [col for col in data.columns if data[col].isnull().any()]

    print("# BEFORE")
    print(data[variables_with_na].isnull().sum(), "\n\n")  # Missing counts before imputation

    imputed = data.copy()
    for col in variables_with_na:
        if col == target:
            continue  # the target is never imputed

        column = imputed[col]
        if column.dtype == "O":
            # NaN is not a category: count only non-null distinct values.
            if column.nunique(dropna=True) > cat_length:
                continue
            modes = column.mode(dropna=True)
            if modes.empty:
                continue  # all-NaN column: there is no mode to fill with
            fill_value = modes.iloc[0]
        elif num_method == "mean":
            fill_value = column.mean()
        else:
            fill_value = column.median()

        imputed[col] = column.fillna(fill_value)

    print("# AFTER \n Imputation method is 'MODE' for categorical variables!")
    print(" Imputation method is '" + num_method.upper() + "' for numeric variables! \n")
    print(imputed[variables_with_na].isnull().sum(), "\n\n")

    return imputed

# Impute the predictors: median for non-object columns, mode for object
# columns with at most 17 distinct values. The target ("Stage") is not imputed.
df_original = quick_missing_imp(
    data=df_original,
    num_method="median",
    cat_length=17,
    target="Stage",
)

# Discard every row that still holds a missing value (the rows without a Stage).
df_original = df_original.dropna()

print(df_original.isnull().sum())

# BEFORE
Drug             106
Ascites          106
Hepatomegaly     106
Spiders          106
Cholesterol      134
Copper           108
Alk_Phos         106
SGOT             106
Tryglicerides    136
Platelets         11
Prothrombin        2
Stage              6
dtype: int64 


# AFTER 
 Imputation method is 'MODE' for categorical variables!
 Imputation method is 'MEDIAN' for numeric variables! 

Drug             0
Ascites          0
Hepatomegaly     0
Spiders          0
Cholesterol      0
Copper           0
Alk_Phos         0
SGOT             0
Tryglicerides    0
Platelets        0
Prothrombin      0
Stage            6
dtype: int64 


N_Days           0
Status           0
Drug             0
Age              0
Sex              0
Ascites          0
Hepatomegaly     0
Spiders          0
Edema            0
Bilirubin        0
Cholesterol      0
Albumin          0
Copper           0
Alk_Phos         0
SGOT             0
Tryglicerides    0
Platelets        0
Prothrombin      0
Stage           

In [154]:
def grab_col_names(dataframe, cat_th=10, car_th=20):
    """
    Group the columns of a dataframe by kind and print a short size summary.

    Columns are first split by dtype (object vs. non-object). Non-object
    columns with fewer than ``cat_th`` distinct values are treated as
    categorical; object columns with more than ``car_th`` distinct values are
    set aside as "categorical but cardinal".

    Parameters
    ----------
    dataframe: dataframe
            Dataframe whose column names are to be grouped.
    cat_th: int, optional
            A non-object column with fewer distinct values than this is
            counted as categorical.
    car_th: int, optional
            An object column with more distinct values than this is counted
            as cardinal.

    Returns
    -------
    cat_cols: list
            Object columns (minus the cardinal ones) followed by the
            numeric-looking categorical columns.
    num_cols: list
            Non-object columns that were not reclassified as categorical.
    cat_but_car: list
            Object columns with more than ``car_th`` distinct values.

    Notes
    -------
        len(cat_cols) + len(num_cols) + len(cat_but_car) equals the number of columns.
        The numeric-but-categorical columns are only reported through ``cat_cols``.

    """

    # Single pass over the columns to separate object from non-object dtypes.
    object_cols = []
    non_object_cols = []
    for name in dataframe.columns:
        if dataframe[name].dtypes == "O":
            object_cols.append(name)
        else:
            non_object_cols.append(name)

    # Low-cardinality non-object columns behave like categories.
    num_but_cat = [name for name in non_object_cols if dataframe[name].nunique() < cat_th]
    # High-cardinality object columns are kept apart.
    cat_but_car = [name for name in object_cols if dataframe[name].nunique() > car_th]

    cat_cols = [name for name in object_cols + num_but_cat if name not in cat_but_car]
    num_cols = [name for name in non_object_cols if name not in num_but_cat]

    print(f"Observation: {dataframe.shape[0]}")
    print(f"Variables: {dataframe.shape[1]}")
    print(f"cat_cols: {len(cat_cols)}")
    print(f"num_cols: {len(num_cols)}")
    print(f"cat_but_car: {len(cat_but_car)}")
    print(f"num_but_cat: {len(num_but_cat)}")

    return cat_cols, num_cols, cat_but_car

# Classify the columns using the helper's default thresholds, spelled out explicitly.
cat_cols, num_cols, cat_but_car = grab_col_names(dataframe=df_original, cat_th=10, car_th=20)

Observation: 412
Variables: 19
cat_cols: 8
num_cols: 11
cat_but_car: 0
num_but_cat: 1


In [156]:
# The target column must not be encoded along with the categorical predictors.
cat_cols = [name for name in cat_cols if name != "Stage"]
cat_cols

['Status', 'Drug', 'Sex', 'Ascites', 'Hepatomegaly', 'Spiders', 'Edema']

Label Encoding

In [157]:
def label_encoder(dataframe, binary_col, drop_first=True):
    """
    Overwrite one column of ``dataframe`` with the output of a freshly fitted
    scikit-learn ``LabelEncoder`` and return that same dataframe.

    The dataframe is modified in place. ``drop_first`` is accepted but is not
    used by this function.
    """
    encoded_values = LabelEncoder().fit_transform(dataframe[binary_col])
    dataframe[binary_col] = encoded_values
    return dataframe
# Columns that are not integer/float typed and hold exactly two distinct values.
# The dtype "kind" code is checked instead of comparing the dtype with the
# Python ``int``/``float`` types: that comparison only equals the default-width
# dtypes, so e.g. int32, float32 or unsigned columns were treated as non-numeric.
binary_cols = [
    col
    for col in df_original.columns
    if df_original[col].dtype.kind not in ("i", "u", "f")
    and df_original[col].nunique() == 2
]
print(binary_cols)

['Drug', 'Sex', 'Ascites', 'Hepatomegaly', 'Spiders']


In [158]:
# Encode each two-valued column; label_encoder mutates the frame it is given
# and hands the same frame back, so rebinding the name changes nothing else.
for binary_col in binary_cols:
    df_original = label_encoder(df_original, binary_col)

df_original.head()


Unnamed: 0,N_Days,Status,Drug,Age,Sex,Ascites,Hepatomegaly,Spiders,Edema,Bilirubin,Cholesterol,Albumin,Copper,Alk_Phos,SGOT,Tryglicerides,Platelets,Prothrombin,Stage
0,400,D,0,58.8,0,1,1,1,Y,14.5,261.0,2.6,156.0,1718.0,137.95,172.0,190.0,12.2,4.0
1,4500,C,0,56.5,0,0,1,1,N,1.1,302.0,4.14,54.0,7394.8,113.52,88.0,221.0,10.6,3.0
2,1012,D,0,70.1,1,0,0,0,S,1.4,176.0,3.48,210.0,516.0,96.1,55.0,151.0,12.0,4.0
3,1925,D,0,54.8,0,0,1,1,S,1.8,244.0,2.54,64.0,6121.8,60.63,92.0,183.0,10.3,4.0
4,1504,CL,1,38.1,0,0,1,1,N,3.4,279.0,3.53,143.0,671.0,113.15,72.0,136.0,10.9,3.0


One Hot Encoding

In [159]:
def one_hot_encoder(dataframe, categorical_cols, drop_first=True):
    """
    Return a new dataframe in which ``categorical_cols`` are replaced by
    indicator (dummy) columns produced with ``pd.get_dummies``.

    ``drop_first`` is forwarded unchanged to ``pd.get_dummies``.
    The input dataframe itself is left as it was.
    """
    return pd.get_dummies(data=dataframe, columns=categorical_cols, drop_first=drop_first)

# Expand the categorical predictors into dummy columns, dropping the first level of each.
df_original = one_hot_encoder(dataframe=df_original, categorical_cols=cat_cols, drop_first=True)


Base Model

In [160]:
# Work on a copy so the encoded frame stays available to later cells.
data_base = df_original.copy()

target_variable = 'Stage'

# fillna returns a new frame; the result has to be assigned, otherwise the
# call has no effect on data_base.
data_base = data_base.fillna(0)

# Separate the target from the predictors.
y_data = data_base[target_variable]
x_data = data_base.drop(columns=[target_variable])

# Split the data into training and testing sets
x_train, x_test, y_train, y_test = train_test_split(x_data, y_data, test_size=0.3, random_state=42)

# Random forest with a fixed seed so the baseline is reproducible.
model = RandomForestClassifier(random_state=17)

model = model.fit(x_train, y_train)

# Predictions on the held-out 30% split.
predict = model.predict(x_test)

score = accuracy_score(y_test, predict)

print("Baseline Accuracy:", score)


Average Accuracy: 0.47580645161290325
