
# Data loading

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
import numpy.random as rand
from scipy import stats
from sklearn import preprocessing 

import re
import json

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix
from sklearn import metrics
from AutoClean import AutoClean

warnings.filterwarnings('ignore')

In [None]:
# Load the raw Airbnb dataset and print its column / dtype / non-null summary.
data=pd.read_csv('./source_data/airbnb.csv')

data.info()

# Data exploration

In [None]:
# Check distribution of target variable

# Count the classes with one vectorised comparison instead of a per-row
# Python loop; every value other than 'Y' (NaN included) is tallied as N.
unos = int((data['Rating'] == 'Y').sum())
zeros = len(data['Rating']) - unos

sns.countplot(x = data['Rating'])
print(f"N={zeros} - Y={unos}")
plt.show()

In [None]:
plt.figure(figsize=(15,8), dpi =500)
# Rating still holds 'Y'/'N' strings at this point, which Pearson correlation
# cannot use (pandas >= 2.0 raises on non-numeric columns instead of silently
# dropping them), so correlate the numeric columns only.
numeric_data = data.select_dtypes(include='number')
sns.heatmap(numeric_data.corr(method='pearson'),annot=True,fmt=".2f", linewidth=.5)
plt.show()

In [None]:
# Count missing values per column.
data.isnull().sum()

In [None]:
# Normalise the column names: strip punctuation, replace spaces with
# underscores and drop a single trailing underscore.
# The hyphen is placed last in the character class so that it is matched
# literally; written as ".-/" it formed a range covering only "." and "/".
strip_chars = re.compile(r"[()%=./-]")
renamed = {}
for column in data.columns:
    new_col = strip_chars.sub('', column).replace(' ', '_')
    # endswith() is safe for names that end up empty (new_col[-1] is not).
    if new_col.endswith('_'):
        new_col = new_col[:-1]

    #print(column + " -> " + new_col)
    renamed[column] = new_col

# Apply all renames in one pass instead of one in-place rename per column.
data.rename(columns=renamed, inplace=True)
new_columns = list(renamed.values())

# replacing values
# fit_transform refits the encoder on every call, so one instance can be
# reused; afterwards it only remembers the classes of the last column.
label_encoder = preprocessing.LabelEncoder()
data['Rating'] = label_encoder.fit_transform(data['Rating'])
data['LocationName'] = label_encoder.fit_transform(data['LocationName'])

data.info()
# NOTE(review): later cells read './source_data/airbnb_trim.csv' while this
# writes to the working directory - confirm the file is moved/copied there.
data.to_csv('./airbnb_trim.csv', index=False, sep=',')

In [None]:
# Separate the target column from the feature matrix
y = data["Rating"]  # Target variable
X = data.drop(columns="Rating")  # Features

# Starting Model

In [None]:
# Hold out 20% of the rows for testing
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [None]:
# Define the parameter grid for GridSearchCV
param_grid = {
    "n_estimators": [50, 100, 200],  # Number of trees
    "max_depth": [4, 8, 16],  # Maximum depth of each tree
    "min_samples_split": [2, 5, 10],  # Minimum number of samples to split a node
    "min_samples_leaf": [1, 2, 4],  # Minimum number of samples in each leaf
    "max_features": ["sqrt", "log2"],
}

# Create the GridSearchCV object.
# The seed is fixed on the estimator rather than searched over: choosing the
# "best" random_state only selects on cross-validation noise and triples the
# number of fits. n_jobs=-1 spreads the remaining fits over all cores.
model = GridSearchCV(
    RandomForestClassifier(random_state=0),
    param_grid,
    cv=5,
    scoring="accuracy",
    n_jobs=-1,
)

# Fit the model to the training data
model.fit(X_train, y_train)

# Get the best hyperparameters and best score
best_params = model.best_params_
best_score = model.best_score_


print("Best Hyperparameters:", best_params)
print("Best Score:", best_score)

In [None]:
# Baseline random forest on the clean data, using hard-coded hyperparameters
best_params = dict(
    max_depth=4,
    max_features='sqrt',
    min_samples_leaf=1,
    min_samples_split=2,
    n_estimators=100,
    random_state=1000,
)

# fit() returns the estimator itself, so construction and training can be chained
clf = RandomForestClassifier(**best_params).fit(X_train, y_train)

# Evaluate on the held-out rows
y_pred = clf.predict(X_test)
accuracy = metrics.accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

print("Classification Report:", report, sep="\n")

# Make data dirty

### Adding Outliers Values

In [None]:
def add_outliers(df, outlier_percent):
    """
    Replace a random fraction of cells with values drawn from a scaled copy
    of each column's range.

    Every cell is selected independently with probability ``outlier_percent``.
    A selected cell with value ``x`` becomes ``x + u`` where
    ``u ~ U(0.3 * min - x, 0.3 * max - x)``, i.e. its new value is uniform on
    ``[0.3 * min, 0.3 * max]`` of its column (up to rounding). That interval
    is the column range scaled by 0.3, so replacements are not guaranteed to
    fall outside the original range. Unselected cells keep their value.

    Args:
        df (pandas.DataFrame): The input DataFrame (numeric columns). It is
            not modified; any index is supported.
        outlier_percent (float): Probability of altering a cell (0.0 to 1.0).

    Returns:
        pandas.DataFrame: A new frame with the same labels; columns come back
        as floating point because the offsets are floats.

    Raises:
        ValueError: If ``outlier_percent`` is outside [0, 1].
    """

    if outlier_percent < 0 or outlier_percent > 1:
        raise ValueError("outlier_percent must be between 0 and 1")

    # Create a mask with True for values to change
    outlier_mask = rand.rand(df.shape[0], df.shape[1]) < outlier_percent

    # Column extrema are computed once, not once per column of the loop.
    col_min = df.min(axis=0)
    col_max = df.max(axis=0)

    offsets = df.copy()
    for col in df.columns:
        # Positional values: no assumption that the index is 0..n-1.
        current = df[col].to_numpy()
        # One vectorised draw per column; element k is drawn from
        # U(0.3*min - x_k, 0.3*max - x_k).
        offsets[col] = rand.uniform(
            col_min[col] * 0.3 - current,
            col_max[col] * 0.3 - current,
        )

    # Offsets of unselected cells are zeroed by the boolean mask.
    return df + offsets * outlier_mask

### Change distribution

In [None]:
def add_gaussian_noise(df, noise_percent):
    """
    Add zero-mean Gaussian noise to a random fraction of the cells.

    ``noise_percent`` is used twice: each cell is selected independently with
    probability ``noise_percent``, and the noise added to a selected cell has
    standard deviation ``noise_percent * std(column)``. Unselected cells keep
    their value.

    Args:
        df (pandas.DataFrame): The input DataFrame (numeric columns). It is
            not modified.
        noise_percent (float): Selection probability and noise scale
            (0.0 to 1.0).

    Returns:
        pandas.DataFrame: A new frame equal to ``df`` plus the masked noise.

    Raises:
        ValueError: If ``noise_percent`` is outside [0, 1].
    """

    if noise_percent < 0 or noise_percent > 1:
        raise ValueError("noise_percent must be between 0.0 and 1.0")

    # Create a mask with True for values to change
    outlier_mask = rand.rand(df.shape[0], df.shape[1]) < noise_percent

    # Holds only the noise term. Adding the noisy copy of the data back onto
    # df (as before) turned every selected cell into 2*x + noise.
    noise = df.copy()
    for col in df.columns:
        # Get standard deviation for noise based on percentage
        std_dev = df[col].std() * noise_percent
        # std() is NaN for a single row or an all-NaN column; use zero noise
        # there instead of turning the whole column into NaN.
        if not np.isfinite(std_dev):
            std_dev = 0.0
        noise[col] = np.random.normal(0, std_dev, df.shape[0])

    return df + noise * outlier_mask


### Adding Missing Values

In [None]:
def add_missing_values(df, missing_percent):
    """
    Blank out a random fraction of the cells of a DataFrame.

    Every cell is selected independently with probability ``missing_percent``
    and replaced by NaN; the input frame is left untouched.

    Args:
        df (pandas.DataFrame): The input DataFrame.
        missing_percent (float): Probability of a cell becoming NaN
            (0.0 to 1.0).

    Returns:
        pandas.DataFrame: A new frame with the selected cells set to NaN.

    Raises:
        ValueError: If ``missing_percent`` is outside [0, 1].
    """

    if missing_percent < 0 or missing_percent > 1:
        raise ValueError("missing_percent must be between 0 and 1")

    # Create a mask with True for missing values based on the percentage
    missing_mask = np.random.rand(df.shape[0], df.shape[1]) < missing_percent

    # mask() writes NaN wherever the mask is True. This avoids the np.NAN
    # alias, which no longer exists in NumPy 2.0.
    return df.mask(missing_mask)


### Creating dirty data

In [None]:
# Build the three (nominally 10%, 20% and 30%) dirty variants of the clean
# data by chaining outlier injection, Gaussian noise and missing values.
dirty_data_10 = add_missing_values(
    add_gaussian_noise(add_outliers(data.copy(), 0.04), 0.03), 0.04)

dirty_data_20 = add_missing_values(
    add_gaussian_noise(add_outliers(data.copy(), 0.07), 0.07), 0.08)

dirty_data_30 = add_missing_values(
    add_gaussian_noise(add_outliers(data.copy(), 0.12), 0.1), 0.12)

# Saving
for level, frame in (('10', dirty_data_10), ('20', dirty_data_20), ('30', dirty_data_30)):
    frame.to_csv(f'./dirty_data/dirty_data_{level}.csv', index=False, sep=',')

In [None]:
# Report, for each dirty variant, how many cells no longer equal the source
total_val = data.shape[0]*data.shape[1]
for label, dirty in (
    ("dirty_data_10", dirty_data_10),
    ("dirty_data_20", dirty_data_20),
    ("dirty_data_30", dirty_data_30),
):
    # non-null source cells minus the cells that still match
    differences = data.count().sum() - (data == dirty).astype(int).sum().sum()
    print(f"{label}:")
    print(f"Different values: {differences} out of {total_val}")
    print(f"Percentage: {round((differences*100)/total_val, 2)}%")
    print()

# Cleaning data

## Manual cleaning

In [None]:
# handle missing values
def remove_missing(dirty_data, MODE='mean'):
    """
    Handle missing values of ``dirty_data`` in place.

    Prints the number of columns containing NaN before and (when there was
    something to do) after the operation.

    Args:
        dirty_data (pandas.DataFrame): Frame to clean; modified in place.
        MODE (str): 'mean' fills each affected column with its mean,
            'delete' drops every row containing NaN, 'zeros' fills with 0.
            Any other value leaves the data unchanged.

    Returns:
        None
    """
    print(f"Null data before:  {dirty_data.isna().any().sum()}")

    if dirty_data.isna().any().sum() > 0:
        if MODE == 'mean':
            for column in dirty_data.columns:
                if dirty_data[column].isna().any():
                    # Assign the filled column back to the frame: an inplace
                    # fillna on dirty_data[column] is chained assignment and
                    # does not reach the frame under pandas Copy-on-Write.
                    column_mean = float(dirty_data[column].mean())
                    dirty_data[column] = dirty_data[column].fillna(column_mean)
        elif MODE == 'delete':
            dirty_data.dropna(inplace=True)
        elif MODE == 'zeros':
            dirty_data.fillna(0, inplace=True)

        print(f"Null data after:  {dirty_data.isna().any().sum()}")

In [None]:
# remove outliers
def remove_outliers(dirty_data, threshold_z=2):
    """
    Drop, in place, every row holding a z-score outlier in any column.

    A row is removed when ``|z| > threshold_z`` for at least one column,
    where z is computed per column with ``scipy.stats.zscore``. Columns whose
    z-scores are NaN (constant or NaN-containing columns) flag no rows. The
    frame's index is reset to 0..n-1 so positional access keeps working
    afterwards. Shapes before and after are printed.

    Previously the filtered frame was rebuilt for each column and then
    discarded, so the caller's data was never changed.

    Args:
        dirty_data (pandas.DataFrame): Numeric frame; modified in place.
        threshold_z (float): Absolute z-score above which a value is an
            outlier.

    Returns:
        None
    """
    print("Original Dataframe shape:", dirty_data.shape)

    # Make labels coincide with positions so positional hits can be dropped
    # by label, whatever index the caller passed in.
    dirty_data.reset_index(drop=True, inplace=True)

    is_outlier = np.zeros(len(dirty_data), dtype=bool)
    for column in dirty_data.columns:
        z = np.abs(stats.zscore(dirty_data[column].to_numpy(dtype=float)))
        # Accumulate over all columns; NaN comparisons are False.
        is_outlier |= z > threshold_z

    dirty_data.drop(index=np.flatnonzero(is_outlier), inplace=True)
    dirty_data.reset_index(drop=True, inplace=True)

    print("Dataframe shape after removing outliers:", dirty_data.shape)


In [None]:
def encode_y(data):
    """
    Label-encode the ``Rating`` column and binarise it around its mean.

    The column is first mapped to integer codes with a fresh LabelEncoder;
    codes greater than or equal to the mean code become 1, the rest 0. The
    result is written back to ``data['Rating']`` explicitly (instead of
    element by element through a column view, which needs a 0..n-1 index and
    does not reach the frame under pandas Copy-on-Write).

    Args:
        data (pandas.DataFrame): Frame with a ``Rating`` column; modified in
            place.

    Returns:
        pandas.Series: The binarised ``Rating`` column.
    """
    label_encoder = preprocessing.LabelEncoder()
    codes = pd.Series(label_encoder.fit_transform(data['Rating']), index=data.index)

    # Vectorised threshold; keeps the integer dtype of the encoded labels.
    data['Rating'] = (codes >= codes.mean()).astype(codes.dtype)

    return data['Rating']

### 10%

In [None]:
# Load the 10% dirty dataset and print its schema summary.
dirty_data = pd.read_csv('./dirty_data/dirty_data_10.csv')
dirty_data.info()

In [None]:
# Check distribution of target variable
# Rating is binarised around its mean (NaN compares False and becomes 0).
# The result is written back explicitly instead of assigning element by
# element through a column view, which is slow, needs a 0..n-1 index and
# does not reach the frame under pandas Copy-on-Write.
y_mean = dirty_data['Rating'].mean()
above_mean = dirty_data['Rating'] >= y_mean
dirty_data['Rating'] = above_mean.astype(dirty_data['Rating'].dtype)
y = dirty_data['Rating']
unos = int(above_mean.sum())
zeros = len(y) - unos


sns.countplot(x = y)
print(f"N={zeros} - Y={unos}")
plt.show()

In [None]:
# Impute missing values with the column mean (default MODE), then run the
# z-score outlier pass.
remove_missing(dirty_data)
remove_outliers(dirty_data)

In [None]:
# Features and binarised target of the manually cleaned 10% data
X = dirty_data.drop(columns="Rating")  # Features
y = encode_y(dirty_data)

# 80/20 train/test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# Same hyperparameters as the baseline model
best_params = dict(
    max_depth=4,
    max_features='sqrt',
    min_samples_leaf=1,
    min_samples_split=2,
    n_estimators=100,
    random_state=1000,
)

# fit() returns the estimator itself
clf = RandomForestClassifier(**best_params).fit(X_train, y_train)

# Evaluate on the held-out rows
y_pred = clf.predict(X_test)
accuracy = metrics.accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

print("Classification Report:", report, sep="\n")


### 20%

In [None]:
# Load the 20% dirty dataset and print its schema summary.
dirty_data = pd.read_csv('./dirty_data/dirty_data_20.csv')
dirty_data.info()

In [None]:
# Check distribution of target variable
# Rating is binarised around its mean (NaN compares False and becomes 0).
# The result is written back explicitly instead of assigning element by
# element through a column view, which is slow, needs a 0..n-1 index and
# does not reach the frame under pandas Copy-on-Write.
y_mean = dirty_data['Rating'].mean()
above_mean = dirty_data['Rating'] >= y_mean
dirty_data['Rating'] = above_mean.astype(dirty_data['Rating'].dtype)
y = dirty_data['Rating']
unos = int(above_mean.sum())
zeros = len(y) - unos


sns.countplot(x = y)
print(f"N={zeros} - Y={unos}")
plt.show()

In [None]:
# Impute missing values with the column mean (default MODE), then run the
# z-score outlier pass.
remove_missing(dirty_data)
remove_outliers(dirty_data)

In [None]:
# Features and binarised target of the manually cleaned 20% data
X = dirty_data.drop(columns="Rating")  # Features
y = encode_y(dirty_data)

# 80/20 train/test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# Same hyperparameters as the baseline model
best_params = dict(
    max_depth=4,
    max_features='sqrt',
    min_samples_leaf=1,
    min_samples_split=2,
    n_estimators=100,
    random_state=1000,
)

# fit() returns the estimator itself
clf = RandomForestClassifier(**best_params).fit(X_train, y_train)

# Evaluate on the held-out rows
y_pred = clf.predict(X_test)
accuracy = metrics.accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

print("Classification Report:", report, sep="\n")

### 30%

In [None]:
# Load the 30% dirty dataset and print its schema summary.
dirty_data = pd.read_csv('./dirty_data/dirty_data_30.csv')
dirty_data.info()

In [None]:
# Check distribution of target variable
# Rating is binarised around its mean (NaN compares False and becomes 0).
# The result is written back explicitly instead of assigning element by
# element through a column view, which is slow, needs a 0..n-1 index and
# does not reach the frame under pandas Copy-on-Write.
y_mean = dirty_data['Rating'].mean()
above_mean = dirty_data['Rating'] >= y_mean
dirty_data['Rating'] = above_mean.astype(dirty_data['Rating'].dtype)
y = dirty_data['Rating']
unos = int(above_mean.sum())
zeros = len(y) - unos


sns.countplot(x = y)
print(f"N={zeros} - Y={unos}")
plt.show()

In [None]:
# Impute missing values with the column mean (default MODE), then run the
# z-score outlier pass.
remove_missing(dirty_data)
remove_outliers(dirty_data)

In [None]:
# Features and binarised target of the manually cleaned 30% data
X = dirty_data.drop(columns="Rating")  # Features
y = encode_y(dirty_data)

# 80/20 train/test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# Same hyperparameters as the baseline model
best_params = dict(
    max_depth=4,
    max_features='sqrt',
    min_samples_leaf=1,
    min_samples_split=2,
    n_estimators=100,
    random_state=1000,
)

# fit() returns the estimator itself
clf = RandomForestClassifier(**best_params).fit(X_train, y_train)

# Evaluate on the held-out rows
y_pred = clf.predict(X_test)
accuracy = metrics.accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

print("Classification Report:", report, sep="\n")

## HoloClean
https://github.com/HoloClean/holoclean/tree/master

Conda environment: holoclean37

HoloClean stores its results as a table in PostgreSQL; it can also evaluate the repair if you give it a clean dataset.

python holoclean/examples/holoclean_reapair_airbnb.py [database list]

In [None]:
def load_data(path):
    """Read the CSV called ``path`` from the HoloClean output folder."""
    # join() keeps the str-only requirement of the original concatenation
    full_path = ''.join(['./clean_data/HoloClean/', path])
    return pd.read_csv(full_path)

def get_differences(hc_data):
    """
    Print how many cells of ``hc_data`` differ from the global ``data`` frame.

    The count is the number of non-null cells of ``data`` minus the number of
    element-wise matches; the percentage is relative to all cells of
    ``data``. Also prints the number of columns of ``hc_data`` with NaN.
    """
    non_null_cells = data.count().sum()
    matching_cells = (data == hc_data).astype(int).sum().sum()
    differences = non_null_cells - matching_cells

    n_rows, n_cols = data.shape[0], data.shape[1]
    total_val = n_rows*n_cols

    print(f"Different values: {differences} out of {total_val}")
    print(f"Percentage: {round((differences*100)/total_val, 2)}%")
    print(f"Null data:  {hc_data.isna().any().sum()}")

def manage_nan(hc_data, DELETE=False):
    """
    Remove NaN values from ``hc_data`` in place.

    Args:
        hc_data (pandas.DataFrame): Frame to clean; modified in place.
        DELETE (bool): If True, drop every row containing NaN; otherwise
            fill each affected column with its mean.

    Prints the number of columns that still contain NaN afterwards.
    """
    # Replace NaN values
    if DELETE:
        hc_data.dropna(inplace=True)

    else:
        for column in hc_data.columns:
            if hc_data[column].isna().any():
                # Assign the filled column back to the frame: an inplace
                # fillna on hc_data[column] is chained assignment and does
                # not reach the frame under pandas Copy-on-Write.
                column_mean = float(hc_data[column].mean())
                hc_data[column] = hc_data[column].fillna(column_mean)

    print(f"Null data:  {hc_data.isna().any().sum()}")

In [None]:
def make_split(hc_data, random_state=None):
    """
    Binarise the ``Rating`` target and split the frame 80/20.

    ``Rating`` is label-encoded, then codes greater than or equal to the mean
    code become 1 and the rest 0. The result is written back to
    ``hc_data['Rating']`` explicitly (instead of element by element through
    a column view, which needs a 0..n-1 index and does not reach the frame
    under pandas Copy-on-Write).

    Args:
        hc_data (pandas.DataFrame): Frame with a ``Rating`` column; modified
            in place.
        random_state: Optional seed forwarded to ``train_test_split`` for a
            reproducible split; the default None keeps the split random.

    Returns:
        list: ``[X_train, X_test, y_train, y_test]`` as returned by
        ``train_test_split``.
    """
    # Encoding rating
    label_encoder = preprocessing.LabelEncoder()
    codes = pd.Series(label_encoder.fit_transform(hc_data['Rating']), index=hc_data.index)
    hc_data['Rating'] = (codes >= codes.mean()).astype(codes.dtype)

    # Split data into training and testing sets
    X = hc_data.drop("Rating", axis=1)  # Features
    y = hc_data["Rating"]  # Target variable

    # Splitting dataset
    return train_test_split(X, y, test_size=0.2, random_state=random_state)

def train_model(X_train, X_test, y_train, y_test):
    """
    Fit a random forest and print its classification report on the test set.

    The hyperparameters come from the module-level ``best_params`` dict.
    Nothing is returned.
    """
    forest = RandomForestClassifier(**best_params)
    forest.fit(X_train, y_train)

    predictions = forest.predict(X_test)
    # Accuracy is computed as before but neither printed nor returned.
    accuracy = metrics.accuracy_score(y_test, predictions)
    report = classification_report(y_test, predictions)

    print("Classification Report:", report, sep="\n")

### 10% Data

In [None]:
# Load the HoloClean-repaired 10% dataset and print its schema summary.
hc_data = load_data('dirty_data_10_repaired.csv')
hc_data.info()

In [None]:
# Check distribution of target variable
# Rating is binarised around its mean (NaN compares False and becomes 0).
# The result is written back explicitly instead of assigning element by
# element through a column view, which is slow, needs a 0..n-1 index and
# does not reach the frame under pandas Copy-on-Write.
y_mean = hc_data['Rating'].mean()
above_mean = hc_data['Rating'] >= y_mean
hc_data['Rating'] = above_mean.astype(hc_data['Rating'].dtype)
y = hc_data['Rating']
unos = int(above_mean.sum())
zeros = len(y) - unos


sns.countplot(x = y)
print(f"N={zeros} - Y={unos}")
plt.show()

In [None]:
# Print how many cells of the repaired frame differ from the reference data.
get_differences(hc_data)

In [None]:
# Fill the remaining NaNs with the column mean (DELETE defaults to False).
manage_nan(hc_data)

In [None]:
# Binarise the target and split 80/20, then fit and report the random forest.
X_train, X_test, y_train, y_test = make_split(hc_data)
train_model(X_train, X_test, y_train, y_test)

### 20% Data

In [None]:
# Load the HoloClean-repaired 20% dataset and print its schema summary.
hc_data = load_data('dirty_data_20_repaired.csv')
hc_data.info()

In [None]:
# Check distribution of target variable
# Rating is binarised around its mean (NaN compares False and becomes 0).
# The result is written back explicitly instead of assigning element by
# element through a column view, which is slow, needs a 0..n-1 index and
# does not reach the frame under pandas Copy-on-Write.
y_mean = hc_data['Rating'].mean()
above_mean = hc_data['Rating'] >= y_mean
hc_data['Rating'] = above_mean.astype(hc_data['Rating'].dtype)
y = hc_data['Rating']
unos = int(above_mean.sum())
zeros = len(y) - unos


sns.countplot(x = y)
print(f"N={zeros} - Y={unos}")
plt.show()

In [None]:
# Print how many cells of the repaired frame differ from the reference data.
get_differences(hc_data)

In [None]:
# Fill the remaining NaNs with the column mean (DELETE defaults to False).
manage_nan(hc_data)

In [None]:
# Binarise the target and split 80/20, then fit and report the random forest.
X_train, X_test, y_train, y_test = make_split(hc_data)
train_model(X_train, X_test, y_train, y_test)

### 30% Data

In [None]:
# Load the HoloClean-repaired 30% dataset and print its schema summary.
hc_data = load_data('dirty_data_30_repaired.csv')
hc_data.info()

In [None]:
# Check distribution of target variable
# Rating is binarised around its mean (NaN compares False and becomes 0).
# The result is written back explicitly instead of assigning element by
# element through a column view, which is slow, needs a 0..n-1 index and
# does not reach the frame under pandas Copy-on-Write.
y_mean = hc_data['Rating'].mean()
above_mean = hc_data['Rating'] >= y_mean
hc_data['Rating'] = above_mean.astype(hc_data['Rating'].dtype)
y = hc_data['Rating']
unos = int(above_mean.sum())
zeros = len(y) - unos


sns.countplot(x = y)
print(f"N={zeros} - Y={unos}")
plt.show()

In [None]:
# Print how many cells of the repaired frame differ from the reference data.
get_differences(hc_data)

In [None]:
# Fill the remaining NaNs with the column mean (DELETE defaults to False).
manage_nan(hc_data)

In [None]:
# Binarise the target and split 80/20, then fit and report the random forest.
X_train, X_test, y_train, y_test = make_split(hc_data)
train_model(X_train, X_test, y_train, y_test)

## CleanML
https://github.com/chu-data-lab/CleanML

Conda environment: CleanML

The dataset must be added to CleanML's database together with the errors it contains. In this case CleanML was modified so that it does not run the HoloClean cleaning step, because we already did that above.
It provides several cleaning methods; after cleaning the data it performs the machine learning task and outputs a JSON file with the results of the various experiments.

The random forest classifier is trained inside CleanML, so we can analyse its results directly.
To reproduce the data, CleanML keeps the CSV files of each run; below we find the best result in this section and load the corresponding data.

python3 main.py --run_experiments

In [None]:
def get_best(path):
    """
    Print the CleanML experiment with the highest training accuracy.

    Loads ``./clean_data/CleanML/<path>``, a JSON object mapping experiment
    names to result dicts, and prints the name and the pretty-printed result
    of the entry with the largest ``'train_acc'`` (the first one on ties).
    Prints a short notice when the file contains no experiments.

    Args:
        path (str): File name of the result JSON inside the CleanML folder.

    Returns:
        None
    """
    path = './clean_data/CleanML/' + path
    # The context manager closes the file handle deterministically.
    with open(path) as results_file:
        cleanml = json.load(results_file)

    if not cleanml:
        # An empty result set used to crash with an unhashable-list TypeError.
        print("No experiments found in", path)
        return

    # max() keeps the first entry on ties, like the strict '>' comparison did.
    best = max(cleanml, key=lambda item: cleanml[item]['train_acc'])

    print(best)
    print(json.dumps(cleanml[best], indent = 4, sort_keys=True))

### 10%

In [None]:
# Best CleanML experiment (by training accuracy) for the 10% dirty data.
get_best('dirty_10_result.json')

### 20%

In [None]:
# Best CleanML experiment (by training accuracy) for the 20% dirty data.
get_best('dirty_20_result.json')

### 30%

In [None]:
# Best CleanML experiment (by training accuracy) for the 30% dirty data.
get_best('dirty_30_result.json')

### Reproduce

In [None]:
def load_data(path):
    """Read the CSV called ``path`` from the CleanML output folder."""
    # join() keeps the str-only requirement of the original concatenation
    full_path = ''.join(['./clean_data/CleanML/', path])
    return pd.read_csv(full_path)

def get_differences(hc_data):
    """
    Print how many cells of ``hc_data`` differ from the trimmed source data.

    The source is read from ``./source_data/airbnb_trim.csv`` and cut to the
    first ``len(hc_data)`` rows. The count is the number of non-null source
    cells minus the element-wise matches; the percentage is relative to all
    cells of ``hc_data``. Also prints the number of columns of ``hc_data``
    that contain NaN.

    Args:
        hc_data (pandas.DataFrame): Cleaned frame to compare.
    """
    # Differences between clean data and source data
    source = pd.read_csv('./source_data/airbnb_trim.csv')

    # Limit the source to as many rows as the cleaned frame has. The slice
    # used to be bounded by shape[1], the number of columns.
    source = source.iloc[:hc_data.shape[0]]

    # eq() aligns on labels, so a frame with fewer rows or different columns
    # is compared cell by cell instead of raising; unmatched cells count as
    # different.
    differences = source.count().sum() - source.eq(hc_data).sum().sum()
    total_val = hc_data.shape[0]*hc_data.shape[1]
    print(f"Different values: {differences} out of {total_val}")
    print(f"Percentage: {round((differences*100)/total_val, 2)}%")
    print(f"Null data:  {hc_data.isna().any().sum()}")

def manage_nan(hc_data, DELETE=False):
    """
    Remove NaN values from ``hc_data`` in place.

    Args:
        hc_data (pandas.DataFrame): Frame to clean; modified in place.
        DELETE (bool): If True, drop every row containing NaN; otherwise
            fill each affected column with its mean.

    Prints the number of columns that still contain NaN afterwards.
    """
    # Replace NaN values
    if DELETE:
        hc_data.dropna(inplace=True)

    else:
        for column in hc_data.columns:
            if hc_data[column].isna().any():
                # Write the filled column back explicitly; fillna(inplace=True)
                # on a selected column is chained assignment and is lost
                # under pandas Copy-on-Write.
                fill_value = float(hc_data[column].mean())
                hc_data[column] = hc_data[column].fillna(fill_value)

    print(f"Null data:  {hc_data.isna().any().sum()}")

In [None]:
def encode_y(data):
    """
    Label-encode the ``Rating`` column and binarise it around its mean.

    ``Rating`` is mapped to integer codes with a fresh LabelEncoder; codes
    greater than or equal to the mean code become 1, all others 0. The column
    is replaced in ``data`` with one explicit assignment, because assigning
    element by element through a column view needs a 0..n-1 index and does
    not reach the frame under pandas Copy-on-Write.

    Args:
        data (pandas.DataFrame): Frame with a ``Rating`` column; modified in
            place.

    Returns:
        pandas.Series: The binarised ``Rating`` column.
    """
    label_encoder = preprocessing.LabelEncoder()
    encoded = pd.Series(label_encoder.fit_transform(data['Rating']), index=data.index)

    # Threshold all rows at once; keep the integer dtype of the encoder output.
    data['Rating'] = (encoded >= encoded.mean()).astype(encoded.dtype)

    return data['Rating']

def train_model(X_train, X_test, y_train, y_test):
    """
    Train a random forest on the training split and print the classification
    report for the test split.

    Uses the module-level ``best_params`` as hyperparameters; returns None.
    """
    model_kwargs = best_params
    classifier = RandomForestClassifier(**model_kwargs)
    classifier.fit(X_train, y_train)

    predicted = classifier.predict(X_test)
    # Accuracy is evaluated as before but neither printed nor returned.
    accuracy = metrics.accuracy_score(y_test, predicted)

    print("Classification Report:")
    print(classification_report(y_test, predicted))

In [None]:
# Load the train and test CSVs of one CleanML run (IQR outlier cleaning with
# mode imputation, per the file names) and stack them for inspection.
train = load_data('dirty_10/outliers/clean_IQR_impute_mode_dummy_train.csv')
test = load_data('dirty_10/outliers/clean_IQR_impute_mode_dummy_test.csv')
dataset = pd.concat([train, test], ignore_index=True)
dataset.info()

In [None]:
# Check distribution of target variable
# Rating is binarised around its mean (NaN compares False and becomes 0).
# The result is written back explicitly instead of assigning element by
# element through a column view, which is slow, needs a 0..n-1 index and
# does not reach the frame under pandas Copy-on-Write.
y_mean = dataset['Rating'].mean()
above_mean = dataset['Rating'] >= y_mean
dataset['Rating'] = above_mean.astype(dataset['Rating'].dtype)
y = dataset['Rating']
unos = int(above_mean.sum())
zeros = len(y) - unos


sns.countplot(x = y)
print(f"N={zeros} - Y={unos}")
plt.show()

In [None]:
# Reuse CleanML's own train/test partition instead of re-splitting.
X_train = train.drop("Rating", axis=1)
X_test = test.drop("Rating", axis=1)

# NOTE(review): encode_y fits a fresh encoder and thresholds at the mean for
# each frame separately, so train and test may be binarised at different
# cut-offs - confirm this is intended.
y_train = encode_y(train)
y_test = encode_y(test)

train_model(X_train, X_test, y_train, y_test)

## AutoClean
https://github.com/elisemercury/AutoClean

Python package for automated data cleaning; easy to use and actively maintained.

### 10%

In [None]:
# NOTE: this rebinds the global `data` (until now the clean source frame) to
# the 10% dirty data.
data = pd.read_csv('./dirty_data/dirty_data_10.csv')
# Run the AutoClean pipeline, imputing missing numeric values with the mean.
pipeline = AutoClean(data, missing_num='mean')

# Keep the cleaned frame, save it and print its schema summary.
dataset = pipeline.output
dataset.to_csv('./clean_data/AutoClean/dirty_10_clean.csv', index=False, sep=',')
dataset.info()

In [None]:
# Check distribution of target variable
# Rating is binarised around its mean (NaN compares False and becomes 0).
# The result is written back explicitly instead of assigning element by
# element through a column view, which is slow, needs a 0..n-1 index and
# does not reach the frame under pandas Copy-on-Write.
y_mean = dataset['Rating'].mean()
above_mean = dataset['Rating'] >= y_mean
dataset['Rating'] = above_mean.astype(dataset['Rating'].dtype)
y = dataset['Rating']
unos = int(above_mean.sum())
zeros = len(y) - unos


sns.countplot(x = y)
print(f"N={zeros} - Y={unos}")
plt.show()

In [None]:
# Print how many cells of the AutoClean output differ from the source data.
get_differences(dataset)

In [None]:
# Binarise the target and split 80/20, then fit and report the random forest.
X_train, X_test, y_train, y_test = make_split(dataset)
train_model(X_train, X_test, y_train, y_test)

### 20%

In [None]:
# Load the 20% dirty data (rebinding the global `data`).
data = pd.read_csv('./dirty_data/dirty_data_20.csv')
# Run the AutoClean pipeline, imputing missing numeric values with the mean.
pipeline = AutoClean(data, missing_num='mean')

# Keep the cleaned frame, save it and print its schema summary.
dataset = pipeline.output
dataset.to_csv('./clean_data/AutoClean/dirty_20_clean.csv', index=False, sep=',')
dataset.info()

In [None]:
# Check distribution of target variable
# Rating is binarised around its mean (NaN compares False and becomes 0).
# The result is written back explicitly instead of assigning element by
# element through a column view, which is slow, needs a 0..n-1 index and
# does not reach the frame under pandas Copy-on-Write.
y_mean = dataset['Rating'].mean()
above_mean = dataset['Rating'] >= y_mean
dataset['Rating'] = above_mean.astype(dataset['Rating'].dtype)
y = dataset['Rating']
unos = int(above_mean.sum())
zeros = len(y) - unos


sns.countplot(x = y)
print(f"N={zeros} - Y={unos}")
plt.show()

In [None]:
# Differences between clean data and source data
# The cleaned AutoClean output (`dataset`) is compared with the source. The
# cell used to compare the still-dirty `data` frame and reported the NaN
# count of the source itself.
source_data = pd.read_csv('./source_data/airbnb_trim.csv')

# Limit the source to as many rows as the cleaned frame has. The slice used
# to be bounded by shape[1], the number of columns.
source_data = source_data.iloc[:dataset.shape[0]]

# eq() aligns on labels; cells without a counterpart count as different.
differences = source_data.count().sum() - source_data.eq(dataset).sum().sum()
total_val = dataset.shape[0]*dataset.shape[1]
print(f"Different values: {differences} out of {total_val}")
print(f"Percentage: {round((differences*100)/total_val, 2)}%")
print(f"Null data:  {dataset.isna().any().sum()}")

In [None]:
# Binarise the target and split 80/20, then fit and report the random forest.
X_train, X_test, y_train, y_test = make_split(dataset)
train_model(X_train, X_test, y_train, y_test)

### 30%

In [None]:
# Load the 30% dirty data (rebinding the global `data`).
data = pd.read_csv('./dirty_data/dirty_data_30.csv')
# Run the AutoClean pipeline, imputing missing numeric values with the mean.
pipeline = AutoClean(data, missing_num='mean')

# Keep the cleaned frame, save it and print its schema summary.
dataset = pipeline.output
dataset.to_csv('./clean_data/AutoClean/dirty_30_clean.csv', index=False, sep=',')
dataset.info()

In [None]:
# Check distribution of target variable
# Rating is binarised around its mean (NaN compares False and becomes 0).
# The result is written back explicitly instead of assigning element by
# element through a column view, which is slow, needs a 0..n-1 index and
# does not reach the frame under pandas Copy-on-Write.
y_mean = dataset['Rating'].mean()
above_mean = dataset['Rating'] >= y_mean
dataset['Rating'] = above_mean.astype(dataset['Rating'].dtype)
y = dataset['Rating']
unos = int(above_mean.sum())
zeros = len(y) - unos


sns.countplot(x = y)
print(f"N={zeros} - Y={unos}")
plt.show()

In [None]:
# Differences between clean data and source data
# The cleaned AutoClean output (`dataset`) is compared with the source. The
# cell used to compare the still-dirty `data` frame and reported the NaN
# count of the source itself.
source_data = pd.read_csv('./source_data/airbnb_trim.csv')

# Limit the source to as many rows as the cleaned frame has. The slice used
# to be bounded by shape[1], the number of columns.
source_data = source_data.iloc[:dataset.shape[0]]

# eq() aligns on labels; cells without a counterpart count as different.
differences = source_data.count().sum() - source_data.eq(dataset).sum().sum()
total_val = dataset.shape[0]*dataset.shape[1]
print(f"Different values: {differences} out of {total_val}")
print(f"Percentage: {round((differences*100)/total_val, 2)}%")
print(f"Null data:  {dataset.isna().any().sum()}")

In [None]:
# Binarise the target and split 80/20, then fit and report the random forest.
X_train, X_test, y_train, y_test = make_split(dataset)
train_model(X_train, X_test, y_train, y_test)