In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import RandomizedSearchCV

from scipy.stats import uniform


from sklearn.linear_model import LinearRegression
import xgboost as xgb


from sklearn.metrics import r2_score
import os

## Model Evaluation Function

In [2]:
def eval_r2(model, X_train, X_test, y_train, y_test):
    """Fit ``model``, print train/test R² and plot residuals against predictions.

    Parameters
    ----------
    model : object exposing ``fit(X, y)`` and ``predict(X)``
        Fitted in place on the training data.
    X_train, y_train : features and target the model is fitted and scored on.
    X_test, y_test : features and target used for scoring only.

    Returns
    -------
    None
        The scores are printed and a two-panel residual figure is shown.
    """
    model.fit(X_train, y_train)

    fitted_train = model.predict(X_train)
    fitted_test = model.predict(X_test)

    # Score both splits so over-fitting shows up as a train/test gap.
    score_train = r2_score(y_train, fitted_train)
    score_test = r2_score(y_test, fitted_test)

    # Residual = observed - predicted.
    resid_train = y_train - fitted_train
    resid_test = y_test - fitted_test

    print(f"Train R² Score: {score_train}")
    print(f"Test R² Score: {score_test}")

    plt.figure(figsize=(14, 6))

    # One panel per split, drawn left to right.
    panels = (
        ('Train Residuals', fitted_train, resid_train),
        ('Test Residuals', fitted_test, resid_test),
    )
    for slot, (heading, predicted, residuals) in enumerate(panels, start=1):
        plt.subplot(1, 2, slot)
        sns.scatterplot(x=predicted, y=residuals, alpha=0.6)
        plt.axhline(0, color='r', linestyle='--')  # zero-error reference line
        plt.title(heading)
        plt.xlabel('Predicted Values')
        plt.ylabel('Residuals')

    plt.tight_layout()
    plt.show()
    return None

## Read in Data

In [12]:
# Load the transformed train/test CSVs from ./data, resolved against the current working directory.
train_full = pd.read_csv(os.getcwd() + '/data/train_transformed_full.csv', index_col=False)
test_full = pd.read_csv(os.getcwd() + '/data/test_transformed_full.csv', index_col=False)

# Number of rows held out from the end of the sampled training frame for validation.
val_size = 2000

# Work on a reproducible 10% sample of the training data; sample() also shuffles the rows.
train_full = train_full.sample(frac=0.1, random_state=42)  # 10% random sample

# Split features from the 'target' column.
X_train_full = train_full.drop(columns=['target'])
y_train_full = train_full['target']

# Everything except the last `val_size` rows is used for training ...
X_train = X_train_full.iloc[:-val_size]
y_train = y_train_full.iloc[:-val_size]

# ... and the last `val_size` rows form the validation split.
X_val = X_train_full.iloc[-val_size:]
y_val = y_train_full.iloc[-val_size:]

# The test file is used in full (no sub-sampling).
X_test = test_full.drop(columns=['target'])
y_test = test_full['target']

In [4]:
# Show the training feature frame as the cell output to eyeball the columns and values.
X_train

Unnamed: 0,MonsoonIntensity,TopographyDrainage,RiverManagement,Deforestation,Urbanization,ClimateChange,DamsQuality,Siltation,AgriculturalPractices,Encroachments,IneffectiveDisasterPreparedness,DrainageSystems,CoastalVulnerability,Landslides,Watersheds,DeterioratingInfrastructure,PopulationScore,WetlandLoss,InadequatePlanning,PoliticalFactors
192546,1.984384,0.512315,-0.943256,-0.946757,0.987232,-0.939506,0.980413,-1.416635,-1.422389,0.983434,-2.378993,0.025557,-0.935881,-0.449136,-1.406590,2.457132,0.517211,0.990498,0.028849,1.464737
652191,0.038458,-0.442919,0.504319,-1.922014,-0.932567,-1.425430,1.460238,0.518760,-1.422389,0.023609,-1.417072,0.507932,0.021481,0.994313,-1.406590,0.036442,0.999459,-0.942464,0.028849,0.029574
130594,1.011421,1.467550,-1.425781,1.491385,-0.932567,1.976039,1.940062,0.034911,-1.422389,-0.936216,0.506768,-0.939192,-1.414562,2.918913,0.034036,1.488856,0.517211,-0.459224,-1.412635,-0.448814
203963,0.524939,0.512315,0.021794,1.979013,-0.932567,1.004191,1.460238,1.970306,1.960977,0.983434,-0.455152,0.025557,-0.457200,-0.449136,0.514245,-0.447696,-1.411782,-0.942464,0.028849,-1.883977
860681,-0.448024,0.034698,-0.943256,0.028499,1.467181,-0.453582,0.500589,-1.416635,-0.455713,0.503521,-0.455152,0.507932,-0.457200,0.513163,-1.886799,0.520580,-1.411782,1.956979,-0.932141,-0.448814
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
483451,-0.448024,0.034698,0.021794,0.028499,-0.452618,1.004191,-0.459059,0.518760,1.477639,-0.456304,-0.936112,0.025557,-0.935881,-1.411436,-1.406590,0.520580,0.034963,-0.942464,-0.451646,0.029574
128712,-0.448024,1.945167,-0.460731,0.516128,0.027332,0.518267,0.500589,-0.932786,-0.939051,-0.456304,0.025808,0.507932,-1.414562,1.475463,-0.926381,0.036442,-2.376278,-0.459224,-1.412635,0.986349
295133,0.524939,0.512315,-0.460731,1.979013,0.987232,-0.939506,-1.418707,-0.932786,0.510963,0.023609,0.025808,-1.903941,0.021481,0.032014,-1.406590,0.520580,0.517211,0.507258,0.028849,1.943125
664057,-0.934506,-0.442919,-0.460731,-0.459129,1.467181,-0.453582,-0.459059,-0.448938,-0.455713,0.023609,0.025808,0.025557,0.021481,-0.930286,-1.406590,0.520580,0.517211,0.024017,-0.932141,0.986349


In [5]:

# NOTE(review): 'gpu_hist' needs a CUDA-enabled XGBoost build; XGBoost 2.0+ deprecates it in
# favour of tree_method='hist' with device='cuda' — confirm against the installed version.
model = xgb.XGBRegressor(tree_method='gpu_hist')

In [6]:
import numpy as np
import pandas as pd

def check_inf_values(*datasets):
    """Report, per dataset, whether it contains positive or negative infinity.

    Parameters
    ----------
    *datasets : pandas.DataFrame, pandas.Series or array-like
        Checked in the order given; the zero-based position is used as the
        dataset number in the printed message.

    Returns
    -------
    None
        One line is printed per dataset. For a DataFrame the names of the
        columns holding an infinite value are listed; for a Series or any
        other array-like only the presence of an infinite value is reported.
    """
    for i, dataset in enumerate(datasets):
        if isinstance(dataset, pd.DataFrame):
            # Column-wise flags: True where the column holds at least one +/-inf.
            cols_with_inf = dataset.isin([np.inf, -np.inf]).any()
            if cols_with_inf.any():
                print(f"Dataset {i} contains inf values at columns: {cols_with_inf[cols_with_inf].index.tolist()}")
            else:
                print(f"Dataset {i} contains no inf values.")
            continue

        if isinstance(dataset, pd.Series):
            # A Series reduces to a single boolean, so there is no column index
            # to list; treating it like a DataFrame raised when an inf was found.
            has_inf = bool(dataset.isin([np.inf, -np.inf]).any())
        else:
            has_inf = bool(np.isinf(dataset).any())

        if has_inf:
            print(f"Dataset {i} contains inf values.")
        else:
            print(f"Dataset {i} contains no inf values.")

# Scan the train/test features and targets for infinite values before any model is fitted.
check_inf_values(X_train, X_test, y_train, y_test)


Dataset 0 contains no inf values.
Dataset 1 contains no inf values.
Dataset 2 contains no inf values.
Dataset 3 contains no inf values.


In [10]:
import psutil
# Print psutil's virtual-memory usage percentage as a baseline before the model is fitted.
print(f"Memory before fitting: {psutil.virtual_memory().percent}%")

Memory before fitting: 69.1%


In [11]:
import os
# NOTE(review): presumably a workaround for the OpenMP "duplicate runtime" abort; confirm it is
# still needed and that setting it after the library imports in the first cell has any effect.
os.environ['KMP_DUPLICATE_LIB_OK']='True'

In [7]:
import psutil
# Bracket the fit with memory readings to see how much system memory usage changes during training.
print(f"Memory before fitting: {psutil.virtual_memory().percent}%")
model.fit(X_train, y_train)
print(f"Memory after fitting: {psutil.virtual_memory().percent}%")


: 

In [None]:
# Fit the XGBoost regressor on the training split (same call as in the memory-probe cell, without the prints).
model.fit(X_train, y_train)

: 

## Linear Regression

In [None]:
# Baseline: fit a plain LinearRegression and report its train/test R² with residual plots.
linreg_model = LinearRegression()
eval_r2(model=linreg_model, X_train=X_train, X_test=X_test, y_train=y_train, y_test=y_test)

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import r2_score
import xgboost as xgb

def eval_r2(model, X_train, X_test, y_train, y_test):
    """Fit ``model``, print train/test R² and plot residuals, logging each stage.

    Parameters
    ----------
    model : object exposing ``fit(X, y)`` and ``predict(X)``
        Fitted in place on the training data.
    X_train, y_train : features and target the model is fitted and scored on.
    X_test, y_test : features and target used for scoring only.

    Progress messages are printed before and after fitting, prediction and
    plotting so a stall can be located. Nothing is returned.
    """

    def draw_residual_panel(slot, predicted, residuals, heading):
        # One residual-vs-prediction scatter in position `slot` of a 1x2 grid.
        plt.subplot(1, 2, slot)
        sns.scatterplot(x=predicted, y=residuals, alpha=0.6)
        plt.axhline(0, color='r', linestyle='--')  # zero-error reference line
        plt.title(heading)
        plt.xlabel('Predicted Values')
        plt.ylabel('Residuals')

    print("Starting model fitting...")
    model.fit(X_train, y_train)
    print("Model fitting done.")

    print("Starting prediction...")
    fitted_train = model.predict(X_train)
    fitted_test = model.predict(X_test)
    print("Prediction done.")

    print("Evaluating R² score...")
    score_train = r2_score(y_train, fitted_train)
    score_test = r2_score(y_test, fitted_test)
    print(f"Train R² Score: {score_train}")
    print(f"Test R² Score: {score_test}")

    # Residual = observed - predicted.
    resid_train = y_train - fitted_train
    resid_test = y_test - fitted_test

    print("Starting plotting...")
    plt.figure(figsize=(14, 6))
    draw_residual_panel(1, fitted_train, resid_train, 'Train Residuals')
    draw_residual_panel(2, fitted_test, resid_test, 'Test Residuals')
    plt.tight_layout()
    plt.show()
    print("Plotting done.")

xgb_model = xgb.XGBRegressor()

# X_train, X_test, y_train and y_test are the splits built in the "Read in Data" cell;
# the regressor above uses XGBoost's default settings (no tree_method override).

eval_r2(model=xgb_model, X_train=X_train, X_test=X_test, y_train=y_train, y_test=y_test)


In [None]:
# XGBoost regressor with library-default settings, scored on the train and test splits.
xgb_model = xgb.XGBRegressor()
eval_r2(model=xgb_model, X_train=X_train, X_test=X_test, y_train=y_train, y_test=y_test)