# Students Performance in Exams

This notebook is an exercise in handling categorical data and making predictions with XGBRegressor.

# Load Data

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the read-only Kaggle input directory and print the full path of every file in it.
for root, _, names in os.walk('/kaggle/input'):
    for path in (os.path.join(root, name) for name in names):
        print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Load the students-performance CSV into a DataFrame and display it as the cell output.
csv_path = '/kaggle/input/students-performance-in-exams/StudentsPerformance.csv'
data = pd.read_csv(csv_path)
data

In [None]:
# Summarize the loaded frame (column names, non-null counts, dtypes) before modelling.
data.info()

# Split Dataset

Split the dataset into two parts: a training set and a testing set. The training set is used for training and cross-validation; the testing set is used for the final evaluation.

In [None]:
from sklearn.model_selection import train_test_split
from plotly.subplots import make_subplots
import plotly.graph_objects as go

# The three exam scores are the regression targets; every remaining column is a feature.
score_cols = ['math score', 'reading score', 'writing score']
X = data.drop(columns=score_cols)
y_math, y_reading, y_writing = (data[col] for col in score_cols)

# Hold out 20% of the rows for final testing; the fixed seed keeps the split reproducible.
X_train_full, X_test_full, y_math_train, y_math_test = train_test_split(
    X, y_math, test_size=0.2, random_state=42
)

# Reuse the row labels chosen by the split above so that all three targets
# share exactly the same train/test partition.
train_idx, test_idx = X_train_full.index, X_test_full.index
y_reading_train, y_reading_test = y_reading.loc[train_idx], y_reading.loc[test_idx]
y_writing_train, y_writing_test = y_writing.loc[train_idx], y_writing.loc[test_idx]

# Build Models

In [None]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import mean_absolute_error, mean_squared_error
from xgboost import XGBRegressor

# Treat every text-like column as categorical. Comparing `dtype == "object"` alone
# misses columns stored with pandas' dedicated string dtype, which would leave the
# ColumnTransformer below with no columns to encode. Object-dtype columns are still
# selected, so the previous behaviour is preserved.
categorical_cols = [
    cname for cname in X_train_full.columns
    if pd.api.types.is_object_dtype(X_train_full[cname])
    or pd.api.types.is_string_dtype(X_train_full[cname])
]

# Fill missing categories with the most frequent value, then one-hot encode.
# handle_unknown='ignore' keeps transform() from failing on categories that
# were not seen while fitting.
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Only the categorical columns are listed here; no other transformer is configured.
pipeline = ColumnTransformer(
    transformers=[
        ('cat', categorical_transformer, categorical_cols)
    ])

# Fit the preprocessing on the training rows only, then apply the same
# fitted encoding to the held-out test rows.
X_train = pipeline.fit_transform(X_train_full)
X_test = pipeline.transform(X_test_full)


def build(y_train, y_test):
    """Train an XGBoost regressor for one target and print its test-set MAE.

    The model is fitted on the module-level ``X_train`` features and evaluated
    on the module-level ``X_test`` features.

    Args:
        y_train: Target values aligned with the rows of ``X_train``.
        y_test: Target values aligned with the rows of ``X_test``.

    Returns:
        None. The mean absolute error is printed rather than returned.
    """
    regressor = XGBRegressor(learning_rate=0.01, n_estimators=1000)
    regressor.fit(X_train, y_train, verbose=False)

    mae = mean_absolute_error(y_test, regressor.predict(X_test))
    print('MAE: ', mae)

# Train and evaluate one model per exam subject, using the shared train/test partition.
for subject, train_target, test_target in (
    ('math', y_math_train, y_math_test),
    ('reading', y_reading_train, y_reading_test),
    ('writing', y_writing_train, y_writing_test),
):
    print(f'predictions for {subject} score:')
    build(train_target, test_target)

# Conclusion

XGBRegressor is a potential model for further study.

# References

https://www.kaggle.com/alexisbcook/xgboost