# XGBoost Regression Assignment - Student Marks

---



In [1]:
import pandas as pd

# Read data from CSV
df = pd.read_csv('/content/Student_Marks.csv')
df.tail()

Unnamed: 0,number_courses,time_study,Marks
95,6,3.561,19.128
96,3,0.301,5.609
97,4,7.163,41.444
98,7,0.309,12.027
99,3,6.335,32.357


In [2]:
# Number of rows and columns
df.shape

(100, 3)

In [3]:
# Summary of the data: column names, total no.of non-null values, data types, memory usage
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 3 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   number_courses  100 non-null    int64  
 1   time_study      100 non-null    float64
 2   Marks           100 non-null    float64
dtypes: float64(2), int64(1)
memory usage: 2.5 KB


In [4]:
# Summary statistics
df.describe()

Unnamed: 0,number_courses,time_study,Marks
count,100.0,100.0,100.0
mean,5.29,4.07714,24.41769
std,1.799523,2.372914,14.326199
min,3.0,0.096,5.609
25%,4.0,2.0585,12.633
50%,5.0,4.022,20.0595
75%,7.0,6.17925,36.67625
max,8.0,7.957,55.299


In [5]:
# Check for missing values
df.isna().sum()

number_courses    0
time_study        0
Marks             0
dtype: int64

In [6]:
# Check for duplicate values in the dataset
df[df.duplicated()].sum()

number_courses    0.0
time_study        0.0
Marks             0.0
dtype: float64

In [7]:
# Spliting the feature and target values
X = df.drop(columns=['Marks'])
y = df['Marks']


In [8]:
# 'train_test_split' to split the dataset into Training & Test Data
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [9]:
X_train

Unnamed: 0,number_courses,time_study
55,7,2.913
88,5,1.803
26,7,0.508
42,6,3.591
69,6,3.948
...,...,...
60,6,0.376
71,5,2.518
14,3,2.908
92,4,5.027


In [10]:
# Importing 'StandardScaler' to rescale the data
from sklearn.preprocessing import StandardScaler

# Initialize StandardScaler
scaler = StandardScaler()

# fit and transform the training data
X_train_scaled = scaler.fit_transform(X_train)

# transform the testing data
X_test_scaled = scaler.transform(X_test)

In [11]:
# Importing 'xgboost' and 'XGBRegressor'
import xgboost as xgb
xgb_regressor = xgb.XGBRegressor(random_state=42)

In [12]:
# Importing `GridSearchCV` for Hyperparameter tuning - search for the best combination of hyperparameters
from sklearn.model_selection import GridSearchCV

# Define hyperparameters for grid search
param_grid = {
    'n_estimators': [50, 75, 100, 150, 200, 250, 300],    # Number of trees (estimators)
    'learning_rate': [0.01, 0.02, 0.05, 0.1, 0.15, 0.2],  # Learning rate
    'max_depth': [2, 3, 4, 5, 6]                          # Maximum tree depth
}

# Initialize GridSearchCV
grid_search = GridSearchCV(estimator=xgb_regressor, param_grid=param_grid,
                           scoring='neg_mean_squared_error', cv=5, verbose=1, n_jobs=-1)

# Perform hyperparameter tuning
grid_search.fit(X_train_scaled, y_train)

Fitting 5 folds for each of 210 candidates, totalling 1050 fits


In [13]:
# Get the best hyperparameters
best_params = grid_search.best_params_

In [14]:
# Print the best parameters
print("Best Hyperparameters:", best_params)

Best Hyperparameters: {'learning_rate': 0.1, 'max_depth': 2, 'n_estimators': 300}


In [15]:
# Initialize XGBoost regressor with the best hyperparameters
best_xgb_regressor = xgb.XGBRegressor(**best_params, random_state=42) # Unpack `best_params` parameters

In [16]:
# Train the model with the best hyperparameters
best_xgb_regressor.fit(X_train_scaled, y_train)


In [17]:
# Make predictions on the training and testing set
y_train_pred = best_xgb_regressor.predict(X_train_scaled)
y_test_pred = best_xgb_regressor.predict(X_test_scaled)


In [18]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# Evaluate the model - Training data
train_mse = mean_squared_error(y_train, y_train_pred)
train_mae = mean_absolute_error(y_train, y_train_pred)
train_r2 = r2_score(y_train, y_train_pred)


In [19]:
# Evaluate the model - Test data
test_mse = mean_squared_error(y_test, y_test_pred)
test_mae = mean_absolute_error(y_test, y_test_pred)
test_r2 = r2_score(y_test, y_test_pred)


In [20]:
# Print evaluation metrics  - Training data
print("Training Mean Squared Error:", train_mse)
print("Training Mean Absolute Error:", train_mae)
print("Training R-squared:", train_r2*100,'%')

Training Mean Squared Error: 0.03835063613128423
Training Mean Absolute Error: 0.14970647253990185
Training R-squared: 99.97895337124505 %


In [21]:
# Print evaluation metrics  - Test data
print("Testing Mean Squared Error:", test_mse)
print("Testing Mean Absolute Error:", test_mae)
print(f"Testing R-squared: {test_r2*100:.2f}%")

Testing Mean Squared Error: 2.220436314607347
Testing Mean Absolute Error: 1.1938574165344236
Testing R-squared: 99.16%
