### Importing the libraries

In [61]:
import pandas as pd 
import matplotlib.pyplot as plt 
import numpy as np

### Importing Dataset and encoding the categorical data

In [62]:
data_set = pd.read_csv("Covid Data.csv")

data_set = data_set[['AGE', 'INTUBED', 'DIABETES', 'COPD', 'ASTHMA', 'OBESITY', 'PATIENT_TYPE']]

# Replacing invalid codes with NaN
data_set.replace([97, 98, 99], np.nan, inplace=True)

# Creating a new column 'CHRONIC_DISEASE' combining conditions
data_set['CHRONIC_DISEASE'] = data_set[['DIABETES', 'COPD', 'ASTHMA', 'OBESITY']].max(axis=1)
data_set.drop(['DIABETES', 'COPD', 'ASTHMA', 'OBESITY'], axis=1, inplace=True)

# Converting INTUBED and CHRONIC_DISEASE to 1/0
data_set['INTUBED'] = data_set['INTUBED'].map({1: 1, 2: 0})
data_set['CHRONIC_DISEASE'] = data_set['CHRONIC_DISEASE'].map({1: 1, 2: 0})

# Creating a binary target column for hospitalization
data_set['NEEDED_HOSPITALIZATION'] = data_set['PATIENT_TYPE'].map({1: 0, 2: 1})
data_set.drop('PATIENT_TYPE', axis=1, inplace=True)

# Renaming the columns for clarity
data_set.rename(columns={
    'AGE': 'age',
    'INTUBED': 'breathing_issues',
    'CHRONIC_DISEASE': 'chronic_disease',
    'NEEDED_HOSPITALIZATION': 'needed_hospitalization'
}, inplace=True)

data_set.head(10)

X = data_set.iloc[:, :-1].values
y = data_set.iloc[:, -1].values
print(X)
print(y)

[[65. nan  0.]
 [72. nan  0.]
 [55.  1.  0.]
 ...
 [55. nan  0.]
 [28. nan  0.]
 [52. nan  0.]]
[0 0 1 ... 0 0 0]





### Handling the Missing Data

In [63]:
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer

# Filling in the missing values in all the columns using the most frequent value and mean strategy
# Creating column-wise imputers
ct = ColumnTransformer(transformers=[
    ('age_imputer', SimpleImputer(strategy='mean'), [0]),  # For 'age'
    ('breathing_imputer', SimpleImputer(strategy='most_frequent'), [1]),  # For 'breathing_issues'
    ('chronic_imputer', SimpleImputer(strategy='most_frequent'), [2])  # For 'chronic_disease'
])

X = ct.fit_transform(X)
print(X)

[[65.  0.  0.]
 [72.  0.  0.]
 [55.  1.  0.]
 ...
 [55.  0.  0.]
 [28.  0.  0.]
 [52.  0.  0.]]


### Splitting data into Test set & Training Set


In [64]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)
print(X_train)
print(X_test)
print(y_train)
print(y_test)

[[62.  0.  0.]
 [57.  0.  0.]
 [34.  0.  0.]
 ...
 [38.  0.  0.]
 [46.  0.  0.]
 [43.  0.  0.]]
[[35.  0.  0.]
 [28.  0.  0.]
 [ 7.  0.  0.]
 ...
 [34.  0.  0.]
 [57.  0.  0.]
 [56.  0.  0.]]
[0 0 0 ... 0 0 0]
[0 0 0 ... 0 0 0]


### Feature Scaling

In [65]:
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()

scaler = ColumnTransformer(transformers=[
    ('age_scaler', StandardScaler(), [0]) 
], remainder = 'passthrough')
# We are keeping all the other columns same and just scaling the 'age' column
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)
print(X_train)
print(scaler.named_transformers_['age_scaler'].mean_)
# Gives is the mean value of the age before scaling
# (value that will be subtracted from every age entry during scaling)
print(scaler.named_transformers_['age_scaler'].scale_)
# Gives the standard deviation of the age 
# (value that is used to divide each centred age value)
print("Mean:", X_train[:, 0].mean())        # ~0
print("Std Dev:", X_train[:, 0].std())      # ~1


[[ 1.19875276  0.          0.        ]
 [ 0.90244575  0.          0.        ]
 [-0.46056648  0.          0.        ]
 ...
 [-0.22352088  0.          0.        ]
 [ 0.25057033  0.          0.        ]
 [ 0.07278613  0.          0.        ]]
[41.77177846]
[16.87438998]
Mean: 3.940168094203624e-16
Std Dev: 1.0000000000000002


### Applying Hyperparameter tuning, Training the Data and Evaluating the Model

In [66]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report

# Defining the hyperparameter grid
param_grid = {
    'C': [0.01, 0.1, 1, 10],  # Regularization strength
    'solver': ['liblinear', 'lbfgs'],
    'penalty': ['l2']  # Only 'l2' is supported with these solvers
}

# Setting up the base model
base_model = LogisticRegression(class_weight='balanced', max_iter=1000)

grid_search = GridSearchCV(base_model, param_grid, cv=5, scoring='f1', verbose=1)

grid_search.fit(X_train, y_train)

# Using the best model to predict
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)

# Evaluating the model
print("Best Parameters:", grid_search.best_params_)
print("\nModel Evaluation Results:")
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Precision:", precision_score(y_test, y_pred))
print("Recall:", recall_score(y_test, y_pred))
print("F1 Score:", f1_score(y_test, y_pred))

print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred))

print("\nClassification Report:")
print(classification_report(y_test, y_pred))

Fitting 5 folds for each of 8 candidates, totalling 40 fits
Best Parameters: {'C': 1, 'penalty': 'l2', 'solver': 'lbfgs'}

Model Evaluation Results:
Accuracy: 0.7436664044059795
Precision: 0.3954251826749974
Recall: 0.6545627566863669
F1 Score: 0.4930163251063348

Confusion Matrix:
[[129820  39963]
 [ 13794  26138]]

Classification Report:
              precision    recall  f1-score   support

           0       0.90      0.76      0.83    169783
           1       0.40      0.65      0.49     39932

    accuracy                           0.74    209715
   macro avg       0.65      0.71      0.66    209715
weighted avg       0.81      0.74      0.76    209715

