In [None]:
#Importing of the right libraries
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from scipy.stats import randint
from sklearn import datasets, model_selection
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder, OrdinalEncoder, LabelEncoder
from sklearn.tree import DecisionTreeClassifier

In [None]:
# Location of the raw dataset: a tab-separated file fetched over HTTPS.
input_file_path = r"https://raw.githubusercontent.com/snoozenuzs/gh-dataset/main/nhgh.tsv"

# Parse the file into a DataFrame; row 0 of the file supplies the column names.
temp_df = pd.read_csv(
    input_file_path,
    header=0,
    sep='\t',
)

In [None]:
# DataFrame information
#
# DataFrame.info() writes its summary (columns, non-null counts, dtypes) directly
# to stdout and returns None. It is therefore called on its own: wrapping it in
# print() would append a stray "None" line after the summary.
temp_df.info()

In [None]:
# Data pre-processing

# Columns that must all be populated for a row to be kept.
critical_columns = ['seqn', 'sex', 'age', 're', 'income',
                   'tx', 'dx', 'wt', 'ht', 'bmi',
                   'leg', 'arml', 'armc', 'waist', 'tri',
                   'sub', 'gh', 'albumin', 'bun', 'SCr']

# Drop every row that has a missing value in any of the critical columns.
# The explicit .copy() makes temp_df_cleaned an independent DataFrame, so the
# column assignments done in the later encoding/scaling cells write to it
# directly instead of raising pandas' SettingWithCopyWarning.
temp_df_cleaned = temp_df.dropna(subset=critical_columns).copy()

In [None]:
# Turn the three categorical features (sex, re, income) into numeric codes.

# --- sex: binary code, male -> 0 and female -> 1 ---------------------------
# Series.map leaves NaN for any value that is not a key of the mapping.
sex_codes = {'male': 0, 'female': 1}
temp_df_cleaned['sex'] = temp_df_cleaned['sex'].map(sex_codes)

# --- income: ordinal code = position of the bracket in income_order --------
# NOTE(review): '< 20000' and '> 20000' sit between '[65000,75000)' and
# '[75000,100000)', so their codes do not follow monetary order - confirm
# this placement is intended.
income_order = [
    '[0,5000)', '[5000,10000)', '[10000,15000)', '[15000,20000)', '[20000,25000)',
    '[25000,35000)', '[35000,45000)', '[45000,55000)', '[55000,65000)', '[65000,75000)',
    '< 20000', '> 20000', '[75000,100000)', '>= 100000',
]
ordinal_encoder = OrdinalEncoder(categories=[income_order])
temp_df_cleaned['income'] = ordinal_encoder.fit_transform(temp_df_cleaned[['income']])

# --- re: one integer label per distinct value -------------------------------
# The fitted encoder is kept so the same mapping can be reused on new data.
label_encoder_re = LabelEncoder()
temp_df_cleaned['re'] = label_encoder_re.fit_transform(temp_df_cleaned['re'])


In [None]:
# Scaling numerical feature columns

# Identify the numerical columns, leaving out the target 'gh'.
# 'gh' must stay in its original units: the binary label is derived later by
# comparing it with 6.5, and on a standardised column that comparison would
# mean "6.5 standard deviations above the mean" instead of the intended cut-off.
# errors='ignore' keeps this cell working even if 'gh' is not among the
# selected columns.
numerical_cols = (
    temp_df_cleaned
    .select_dtypes(include=['float64', 'int64'])
    .columns
    .drop('gh', errors='ignore')
)

# Standardize the selected columns; the fitted scaler is kept so the same
# transformation can be applied to new data.
scaler = StandardScaler()
temp_df_cleaned[numerical_cols] = scaler.fit_transform(temp_df_cleaned[numerical_cols])

In [None]:
# Preview the processed frame, then visualise pairwise correlations with Seaborn.
print(temp_df_cleaned.head())

# Pairwise correlation between all columns
corr = temp_df_cleaned.corr()

# Boolean mask that is True on and above the diagonal; those cells are hidden
# in the plot.
mask = np.triu(np.ones(corr.shape, dtype=bool))

# Figure and axes that host the heatmap
f, ax = plt.subplots(figsize=(11, 9))

# Diverging colour map
cmap = sns.diverging_palette(230, 20, as_cmap=True)

# Masked heatmap with square cells, drawn on the axes created above
sns.heatmap(
    corr,
    ax=ax,
    mask=mask,
    cmap=cmap,
    vmax=0.3,
    center=0,
    square=True,
    linewidths=0.5,
    cbar_kws={"shrink": 0.5},
)

plt.show()

In [None]:
"""The gh column is chosen as the feature column for prediction in the models later and hence dropped from the datafram
To proceed and define it further as target variable Y"""

# Feature matrix X: every column except gh
X = temp_df_cleaned.drop(columns=['gh'])

# Target y: 1 when gh is greater than or equal to 6.5, otherwise 0.
# The 6.5 cut-off is expressed in gh's original units, so it is applied to the
# raw values held in temp_df, restricted to the rows kept in temp_df_cleaned.
# temp_df_cleaned passes through the scaling cell, so its own gh column is not
# guaranteed to still be in original units.
y = (temp_df.loc[temp_df_cleaned.index, 'gh'] >= 6.5).astype(int)


In [None]:
# Hold out 20% of the rows for testing and keep the remaining 80% for training;
# the fixed random_state makes the partition repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Show the shape of each of the four partitions on one line
print(*(part.shape for part in (X_train, X_test, y_train, y_test)))


In [None]:
# Baseline logistic regression classifier

# Build the estimator with default settings and fit it on the training split
# (fit returns the estimator itself).
log_reg_model = LogisticRegression().fit(X_train, y_train)

# Mean accuracy on the held-out test split
log_reg_accuracy = log_reg_model.score(X_test, y_test)
print("Logistic Regression Accuracy:", log_reg_accuracy)

In [None]:
# Baseline decision tree classifier
# (DecisionTreeClassifier is already imported in the notebook's import cell.)

# Initialize the model. The fixed random_state (same seed as the train/test
# split) makes the fitted tree, and therefore the reported accuracy,
# reproducible across runs.
decision_tree_model = DecisionTreeClassifier(random_state=42)

# Train the model
decision_tree_model.fit(X_train, y_train)

# Evaluate the model: mean accuracy on the held-out test split
decision_tree_accuracy = decision_tree_model.score(X_test, y_test)
print("Decision Trees Accuracy:", decision_tree_accuracy)

In [None]:
# Baseline random forest classifier
# (RandomForestClassifier is already imported in the notebook's import cell.)

# Initialize the model. The fixed random_state (same seed as the train/test
# split) pins the forest's bootstrap and feature sampling, so the reported
# accuracy is reproducible across runs.
random_forest_model = RandomForestClassifier(random_state=42)

# Train the model
random_forest_model.fit(X_train, y_train)

# Evaluate the model: mean accuracy on the held-out test split
random_forest_accuracy = random_forest_model.score(X_test, y_test)
print("Random Forests Accuracy:", random_forest_accuracy)

In [None]:
# Finding the best hyperparameters for the Logistic Regression model
# (GridSearchCV is imported in the notebook's import cell.)

# Define hyperparameters to search
param_grid_logistic = {
    'C': [0.001, 0.01, 0.1, 1, 10, 100],  # Inverse regularization strength
    'penalty': ['l1', 'l2'],  # Penalty (L1 or L2 regularization)
    # LogisticRegression's default 'lbfgs' solver does not support the 'l1'
    # penalty, so every 'l1' candidate would fail to fit and be scored as NaN.
    # 'liblinear' supports both penalties. It is placed in the grid (rather
    # than on the estimator) so that it is part of best_params_ and travels
    # with them when a model is rebuilt from those parameters.
    'solver': ['liblinear'],
    'max_iter': [1000, 10000, 100000],  # Maximum number of iterations
}

# Initialize GridSearchCV with 20-fold cross-validation
log_reg_grid_search = GridSearchCV(LogisticRegression(), param_grid_logistic, cv=20)

# Perform Grid Search
log_reg_grid_search.fit(X_train, y_train)

# Get the best hyperparameters
best_log_reg_params = log_reg_grid_search.best_params_
print("Best Hyperparameters for Logistic Regression:", best_log_reg_params)


In [None]:
# Rebuild logistic regression from the best hyperparameters found by the search
# and fit it on the training split (fit returns the estimator itself).
best_log_reg_model = LogisticRegression(**best_log_reg_params).fit(X_train, y_train)

# Mean accuracy on the held-out test split, reported to three decimals
best_log_reg_accuracy = best_log_reg_model.score(X_test, y_test)
print("Best Logistic Regression Accuracy after Hyperparameter Tuning:", round(best_log_reg_accuracy, 3))

In [None]:
# Finding hyperparameters for decision tree tuning
param_grid_decision_tree = {
    'max_depth': [None, 10, 20, 30, 40, 50],  # Maximum depth of the tree
    'min_samples_split': [2, 5, 10],  # Minimum number of samples required to split an internal node
    'min_samples_leaf': [1, 2, 4]  # Minimum number of samples required to be at a leaf node
}

# Initialize GridSearchCV with 5-fold cross-validation.
# The estimator is seeded so that every candidate is evaluated with the same
# tree randomness; without it the selected parameters can change between runs.
decision_tree_grid_search = GridSearchCV(
    DecisionTreeClassifier(random_state=42),
    param_grid_decision_tree,
    cv=5,
)

# Perform Grid Search
decision_tree_grid_search.fit(X_train, y_train)

# Get the best hyperparameters
best_decision_tree_params = decision_tree_grid_search.best_params_
print("Best Hyperparameters for Decision Trees:", best_decision_tree_params)

In [None]:
# Building the revised decision tree model

# Initialize the model with the best hyperparameters. random_state is fixed
# (same seed as the train/test split) so the fitted tree and its accuracy are
# reproducible across runs.
best_decision_tree_model = DecisionTreeClassifier(**best_decision_tree_params, random_state=42)

# Train the model
best_decision_tree_model.fit(X_train, y_train)

# Evaluate the model: mean accuracy on the held-out test split
best_decision_tree_accuracy = best_decision_tree_model.score(X_test, y_test)
print("Best Decision Trees Accuracy after Hyperparameter Tuning:", round(best_decision_tree_accuracy, 3))

In [None]:
# Define a reduced search space for RandomizedSearchCV.
# scipy.stats.randint(low, high) draws integers from low up to high - 1
# (the upper bound is exclusive).
param_dist_random_forest = {
    'n_estimators': randint(100, 300),  # Integers from 100 to 299
    'max_depth': [None] + list(range(10, 31)),  # None or any depth from 10 to 30
    'min_samples_split': randint(2, 11),  # Integers from 2 to 10
    'min_samples_leaf': randint(1, 5)  # Integers from 1 to 4
}

# Initialize RandomizedSearchCV.
# Both sources of randomness are seeded (same seed as the train/test split):
# the estimator's random_state fixes how each forest is grown, and the search's
# random_state fixes which 50 parameter settings are sampled. Without them the
# reported best hyperparameters differ from run to run.
random_forest_random_search = RandomizedSearchCV(
    RandomForestClassifier(random_state=42),
    param_distributions=param_dist_random_forest,
    n_iter=50,  # Number of parameter settings that are sampled
    cv=5,
    n_jobs=-1,  # Use all available cores
    random_state=42,
)

# Perform Randomized Search
random_forest_random_search.fit(X_train, y_train)

# Get the best hyperparameters
best_random_forest_params = random_forest_random_search.best_params_
print("Best Hyperparameters for Random Forests:", best_random_forest_params)


In [None]:
# Building the tuned random forest model

# Initialize the model with the best hyperparameters. random_state is fixed
# (same seed as the train/test split) so the fitted forest and its accuracy are
# reproducible across runs.
best_random_forest_model = RandomForestClassifier(**best_random_forest_params, random_state=42)

# Train the model
best_random_forest_model.fit(X_train, y_train)

# Evaluate the model: mean accuracy on the held-out test split
best_random_forest_accuracy = best_random_forest_model.score(X_test, y_test)
print("Best Random Forests Accuracy after Hyperparameter Tuning:", round(best_random_forest_accuracy, 3))

In [None]:
# Hand-written sample of ten records, one list per column, using the same
# column names as the training data.
test_data = {
    'seqn': list(range(1, 11)),
    'sex': ['male', 'female'] * 5,
    'age': [50, 35, 45, 55, 40, 60, 25, 70, 30, 65],
    're': [
        'Mexican American',
        'Non-Hispanic White',
        'Non-Hispanic Black',
        'Other Hispanic',
        'Other Race Including Multi-Racial',
        'Mexican American',
        'Non-Hispanic White',
        'Non-Hispanic Black',
        'Other Hispanic',
        'Other Race Including Multi-Racial',
    ],
    'income': [
        '[5000,10000)',
        '[15000,20000)',
        '[25000,35000)',
        '[35000,45000)',
        '[45000,55000)',
        '[55000,65000)',
        '[65000,75000)',
        '[75000,100000)',
        '< 20000',
        '> 20000',
    ],
    'tx': [1, 0, 1, 1, 0, 1, 0, 1, 0, 1],
    'dx': [1, 0, 1, 0, 1, 0, 1, 0, 1, 0],
    'wt': [70, 60, 80, 65, 75, 85, 55, 90, 50, 95],
    'ht': [170, 165, 175, 160, 180, 155, 185, 150, 190, 145],
    'bmi': [24, 22, 26, 25, 27, 23, 28, 21, 29, 20],
    'leg': [80, 75, 85, 70, 90, 65, 95, 60, 100, 55],
    'arml': [30, 28, 32, 27, 33, 26, 34, 25, 35, 24],
    'armc': [28, 26, 30, 25, 31, 24, 32, 23, 33, 22],
    'waist': [75, 70, 80, 68, 82, 66, 84, 64, 86, 62],
    'tri': [15, 12, 18, 10, 20, 8, 22, 6, 24, 4],
    'sub': [20, 18, 22, 16, 24, 14, 26, 12, 28, 10],
    'gh': [7, 6.5, 7.5, 6, 8, 5.5, 8.5, 5, 9, 4.5],
    'albumin': [4.5, 4.0, 5.0, 3.5, 5.5, 3.0, 6.0, 2.5, 6.5, 2.0],
    'bun': [10, 8, 12, 7, 13, 6, 14, 5, 15, 4],
    'SCr': [0.8, 0.7, 0.9, 0.6, 1.0, 0.5, 1.1, 0.4, 1.2, 0.3],
}

# Assemble the columns into a DataFrame and show it
test_df = pd.DataFrame(data=test_data)

print(test_df)


In [None]:
# Run the sample data through the encoders and scaler that were already fitted
# on the training data (transform only - nothing is re-fitted here).

# 'sex': male -> 0, female -> 1
test_df['sex'] = test_df['sex'].map(dict(male=0, female=1))

# 'income': ordinal code from the previously fitted ordinal_encoder
test_df['income'] = ordinal_encoder.transform(test_df[['income']])

# 're': integer label from the previously fitted label_encoder_re
test_df['re'] = label_encoder_re.transform(test_df['re'])

# Numerical columns: standardize with the previously fitted scaler
test_df[numerical_cols] = scaler.transform(test_df[numerical_cols])


In [None]:
# Feature matrix for the sample data: all columns except the target 'gh'
test_features = test_df.drop(labels=['gh'], axis='columns')

In [None]:
# Predict the class of each sample record with the three tuned models.

# Tuned logistic regression
log_reg_predictions = best_log_reg_model.predict(test_features)

# Tuned decision tree
dt_predictions = best_decision_tree_model.predict(test_features)

# Tuned random forest
rf_predictions = best_random_forest_model.predict(test_features)

# Show the three prediction arrays side by side
# (order: logistic regression, decision tree, random forest)
print(log_reg_predictions, dt_predictions, rf_predictions)