In [19]:
import pandas as pd
from sklearn.model_selection import train_test_split, KFold, cross_val_score
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.metrics import mean_squared_error, r2_score
import numpy as np

In [8]:
def load_data():
    balance_data = pd.read_excel("Updated file AQI.xlsx")
    return balance_data

In [14]:
# Preprocess data
def preprocess_data(df, features, target):
    df = df[features + [target]].dropna()
    correlation_matrix = df.select_dtypes(include=[np.number]).corr()
    print("Feature Correlation with Target:\n", correlation_matrix[target])
    
    X = df[features]
    y = df[target]
    return train_test_split(X, y, test_size=0.2, random_state=42, shuffle=True)

In [15]:
# Create preprocessing pipeline
def create_pipeline(categorical_features, numerical_features, model_type='random_forest'):
    preprocessor = ColumnTransformer([
        ('num', StandardScaler(), numerical_features),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
    ])
    
    if model_type == 'random_forest':
        model = Pipeline([
            ('preprocessor', preprocessor),
            ('regressor', RandomForestRegressor(n_estimators=50, max_depth=10, random_state=42))
        ])
    else:
        model = Pipeline([
            ('preprocessor', preprocessor),
            ('regressor', LinearRegression())
        ])
    
    return model

In [16]:
# Train model using cross-validation
def train_and_evaluate_model(model, X, y):
    kf = KFold(n_splits=5, shuffle=True, random_state=42)
    scores = cross_val_score(model, X, y, cv=kf, scoring='r2')
    print(f'Cross-validated R2 Scores: {scores}')
    print(f'Mean R2 Score: {scores.mean()}')
    
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, shuffle=True)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    rmse = mean_squared_error(y_test, y_pred, squared=False)
    r2 = r2_score(y_test, y_pred)
    print(f'Final Test RMSE: {rmse}')
    print(f'Final Test R2 Score: {r2}')
    
    return model

In [17]:
# Predict function to preprocess user input
def predict_hospitals(model, features):
    state = input("Enter State: ")
    aqi = float(input("Enter AQI: "))
    no2 = float(input("Enter NO2: "))
    so2 = float(input("Enter SO2: "))
    pm10 = float(input("Enter PM10: "))
    pm2_5 = float(input("Enter PM2.5: "))
    
    input_data = pd.DataFrame([[state, aqi, no2, so2, pm10, pm2_5]], columns=features)
    
    # Ensure preprocessing pipeline is applied
    input_data_transformed = model.named_steps['preprocessor'].transform(input_data)
    
    predicted_hospitals = model.named_steps['regressor'].predict(input_data_transformed)
    print(f'Predicted Number of Hospitals Needed: {round(predicted_hospitals[0])}')

In [20]:
# Main execution
if __name__ == "__main__":
    sheet_name = "Updated file AQI.xlsx"
    
    features = ['State', 'AQI', 'NO2', 'SO2', 'PM10', 'PM2.5']
    target = 'Number of Hospitals2'
    
    df = load_data()
    X_train, X_test, y_train, y_test = preprocess_data(df, features, target)
    
    categorical_features = ['State']
    numerical_features = ['AQI', 'NO2', 'SO2', 'PM10', 'PM2.5']
    
    model = create_pipeline(categorical_features, numerical_features, model_type='random_forest')
    model = train_and_evaluate_model(model, X_train, y_train)
    
    # Predict hospitals for user input
    predict_hospitals(model, features)

Feature Correlation with Target:
 AQI                    -0.043064
NO2                     0.064708
SO2                     0.062281
PM10                    0.118823
PM2.5                  -0.005632
Number of Hospitals2    1.000000
Name: Number of Hospitals2, dtype: float64
Cross-validated R2 Scores: [0.90774338 0.90384656 0.88588366 0.89216851 0.91593532]
Mean R2 Score: 0.9011154849533144
Final Test RMSE: 369.0841063932672
Final Test R2 Score: 0.9071190607391244


Enter State:  Bihar
Enter AQI:  220
Enter NO2:  45
Enter SO2:  7.8
Enter PM10:  350
Enter PM2.5:  160


Predicted Number of Hospitals Needed: 1335
