In [1]:
from google.colab import files
uploaded = files.upload()

Saving 2024_military_strength_by_country.csv to 2024_military_strength_by_country.csv


In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold
from sklearn.linear_model import LinearRegression

# Load the dataset
df = pd.read_csv('2024_military_strength_by_country.csv')

# Drop rows with any missing values
df = df.dropna()

# Select the features and the target variable
# Features (X)
X_cols = [
    'active_service_military_manpower',
    'total_military_aircraft_strength',
    'total_combat_tank_strength'
]
# Target (y)
y_col = 'national_annual_defense_budgets'

# Clean the columns by removing non-numeric characters and converting to float
for col in X_cols + [y_col]:
    # Replace commas and other non-numeric characters, then convert to numeric
    df[col] = df[col].astype(str).str.replace(r'[^\d.]', '', regex=True)
    df[col] = pd.to_numeric(df[col], errors='coerce')

# Drop any rows with NaN values resulting from the conversion
df.dropna(subset=X_cols + [y_col], inplace=True)

# Define X and y
X = df[X_cols]
y = df[y_col]

#-------------------------------------------------------------------------------
# Calculation BEFORE K-Fold Cross-Validation
#-------------------------------------------------------------------------------
print('------------------------------------------------------------')
print('BIAS AND VARIANCE BEFORE K-FOLD CROSS-VALIDATION')
print('------------------------------------------------------------')

# Train a linear regression model on the entire dataset
model_before = LinearRegression()
model_before.fit(X, y)
y_pred_before = model_before.predict(X)
residuals_before = y_pred_before - y

# Calculate bias and variance
bias_before = np.mean(residuals_before)
variance_before = np.var(residuals_before)

print(f'Bias (on full dataset): {bias_before:.2f}')
print(f'Variance (on full dataset): {variance_before:.2f}')
print('------------------------------------------------------------\n')

#-------------------------------------------------------------------------------
# Calculation AFTER K-Fold Cross-Validation
#-------------------------------------------------------------------------------
print('------------------------------------------------------------')
print('BIAS AND VARIANCE AFTER K-FOLD CROSS-VALIDATION')
print('------------------------------------------------------------')

# Initialize K-Fold with 5 splits
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# Lists to store metrics for each fold
biases = []
variances = []
fold_number = 1

# Loop through each fold
for train_index, test_index in kf.split(X):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    # Create and train the model
    model_after = LinearRegression()
    model_after.fit(X_train, y_train)
    y_pred_after = model_after.predict(X_test)
    residuals_after = y_pred_after - y_test

    # Calculate bias and variance for the current fold
    fold_bias = np.mean(residuals_after)
    fold_variance = np.var(residuals_after)

    biases.append(fold_bias)
    variances.append(fold_variance)

    # Print the results for the current fold
    print(f'Fold {fold_number}:')
    print(f'  Bias: {fold_bias:.2f}')
    print(f'  Variance: {fold_variance:.2f}')
    print('------------------------------------------------------------')
    fold_number += 1

# Calculate the average bias and variance across all folds
avg_bias = np.mean(biases)
avg_variance = np.mean(variances)

print(f'Average Bias across all folds: {avg_bias:.2f}')
print(f'Average Variance across all folds: {avg_variance:.2f}')

------------------------------------------------------------
BIAS AND VARIANCE BEFORE K-FOLD CROSS-VALIDATION
------------------------------------------------------------
Bias (on full dataset): 0.00
Variance (on full dataset): 123035491605193867264.00
------------------------------------------------------------

------------------------------------------------------------
BIAS AND VARIANCE AFTER K-FOLD CROSS-VALIDATION
------------------------------------------------------------
Fold 1:
  Bias: 163776149.73
  Variance: 130507862788552458240.00
------------------------------------------------------------
Fold 2:
  Bias: -1542034770.68
  Variance: 243761830094922285056.00
------------------------------------------------------------
Fold 3:
  Bias: -585192755.56
  Variance: 115150027631896641536.00
------------------------------------------------------------
Fold 4:
  Bias: -4873745201.06
  Variance: 646631814367480119296.00
------------------------------------------------------------
Fo