In [1]:
import pandas as pd
import numpy as np
from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

In [2]:
# Load the data
years = ['2016', '2017', '2018', '2019', '2020', '2021', '2022']
dataframes = [pd.read_csv(f"EngineeringRanking_{year}.csv") for year in years]
data = pd.concat(dataframes)

In [3]:
# Combine data with EngineeringRanking.csv
combined_data = pd.read_csv("EngineeringRanking.csv")
data = pd.concat([data, combined_data])

In [4]:
# Handling missing values by filling with 0
data.fillna(0, inplace=True)

In [5]:
# Convert non-numeric columns to numeric types
numeric_columns = ['Score', 'Rank', 'TLR', 'RPC', 'GO', 'OI', 'Perception']
data[numeric_columns] = data[numeric_columns].apply(pd.to_numeric, errors='coerce')

In [6]:
# Remove rows with missing target variable (Rank_21)
data.dropna(subset=['Rank_21'], inplace=True)

In [7]:
# Feature selection
features = ['Score_21', 'TLR_21', 'RPC_21', 'GO_21', 'OI_21']
X = data[features]
y = data['Rank_21']

In [8]:
# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [9]:
# Initialize and train the XGBoost model
model = XGBRegressor()
model.fit(X_train, y_train)

In [10]:
# Make predictions on the test set
y_pred = model.predict(X_test)

In [11]:
# Evaluate the model
mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

In [12]:
print("Mean Squared Error:", mse)
print("Mean Absolute Error:", mae)
print("R-squared:", r2)

Mean Squared Error: 0.6550020040254699
Mean Absolute Error: 0.22310798380680352
R-squared: 0.9995662980351128
