In [None]:
import pandas as pd
import numpy as np
# visualization, plot
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
# preprocessing
from sklearn.model_selection import train_test_split

# machine learning
import lightgbm as lgb


from sklearn.metrics import mean_squared_error, mean_absolute_error

In [None]:
# Read the students-performance CSV into a DataFrame.
df = pd.read_csv("../input/students-performance-in-exams/StudentsPerformance.csv")
# Preview the first rows.
df.head()

In [None]:
# Column names, dtypes and non-null counts.
df.info()

In [None]:
# Count the missing values in each column.
# Original author's note: there are no null values in the dataset.
df.isna().sum()

**DATA VISUALIZATION**

**GENDER COUNT PIE CHART**

In [None]:
# Build an explicit two-column frame (category, count) for the pie chart.
# - The slice labels come from the data itself instead of a hard-coded
#   ['female', 'male'] list, so they cannot be swapped if the count order changes.
# - The column names are set here, so the chart does not depend on how the
#   installed pandas version names the result of value_counts().
gender_counts = df['gender'].value_counts().rename_axis('gender').reset_index(name='count')
px.pie(gender_counts, values='count', names='gender', template='ggplot2')

**Math, reading, writing scores for gender**

We can see that females score higher in writing and reading, while males score higher in math, although the gap in math is smaller than the gaps in reading and writing.

In [None]:
# One histogram per exam subject, split by gender, laid out side by side.
score_columns = ['math score', 'reading score', 'writing score']
fig, axes = plt.subplots(1, 3, figsize=(30,12))
fig.suptitle('Scores for gender')

for panel, column in zip(axes, score_columns):
    sns.histplot(ax=panel, x=column, data=df, hue='gender')
    panel.set_title(column)
    # Enlarge the legend entries and the legend title.
    legend = panel.get_legend()
    plt.setp(legend.get_texts(), fontsize='15')
    plt.setp(legend.get_title(), fontsize='16')


**PIE CHART FOR RACE/ETHNICITY**

In [None]:
# Build an explicit two-column frame (category, count) for the pie chart so the
# column names passed to plotly do not depend on how the installed pandas
# version names the result of value_counts().
race_counts = df['race/ethnicity'].value_counts().rename_axis('race/ethnicity').reset_index(name='count')
px.pie(race_counts, values='count', names='race/ethnicity', template='ggplot2')

In [None]:
# Plots KDE curves for the math, reading and writing scores, split by a given hue.
def kde_plot_all_scores(title_name, data, hue):
    """Draw one filled KDE panel per exam score, with curves split by `hue`.

    Parameters
    ----------
    title_name : text used as the figure-level title.
    data : passed to seaborn as `data`; must provide the 'math score',
        'reading score' and 'writing score' columns and the `hue` column.
    hue : name of the column whose values separate the curves in each panel.
    """
    figure, panels = plt.subplots(1, 3, figsize=(24,6))
    figure.suptitle(title_name)

    for panel, column in zip(panels, ('math score', 'reading score', 'writing score')):
        sns.kdeplot(ax=panel, x=column, data=data, hue=hue, linewidth=2, fill=True, alpha=.2)
        panel.set_title(column)
        # Resize the legend entries and the legend title.
        legend = panel.get_legend()
        plt.setp(legend.get_texts(), fontsize='11')
        plt.setp(legend.get_title(), fontsize='13')

**KDE PLOT FOR RACE/ETHNICITY FOR ALL SCORES**

In [None]:
# The curves are split by race/ethnicity, so the figure title names that column
# (it previously read 'Scores for gender').
kde_plot_all_scores('Scores for race/ethnicity', df, 'race/ethnicity')

**Parental level of education pie chart**

In [None]:
# Build an explicit two-column frame (category, count) for the pie chart so the
# column names passed to plotly do not depend on how the installed pandas
# version names the result of value_counts().
education_counts = (
    df['parental level of education']
    .value_counts()
    .rename_axis('parental level of education')
    .reset_index(name='count')
)
px.pie(education_counts, values='count', names='parental level of education', template='ggplot2')

In [None]:
# KDE plots of every score split by parental level of education, one row per score.
fig, axes = plt.subplots(3, 1, figsize=(18,12))
# The title names the hue column actually plotted (it previously read 'Scores for gender').
fig.suptitle('Scores for parental level of education')

scores = ['math score', 'reading score', 'writing score']
for i, score in enumerate(scores):
    sns.kdeplot(ax=axes[i], x=score, data=df, hue='parental level of education', linewidth=2, fill=True, alpha=.2)
    axes[i].set_title(score)
    plt.setp(axes[i].get_legend().get_texts(), fontsize='11')
    plt.setp(axes[i].get_legend().get_title(), fontsize='11')

# Adjust the layout once, after all panels are drawn, instead of on every iteration.
plt.tight_layout()

In [None]:
# Number of students in each lunch category.
df['lunch'].value_counts()

**KDE PLOT FOR LUNCH**

It seems that students who have the standard lunch are more successful.

In [None]:
# Score distributions split by lunch type.
kde_plot_all_scores(title_name='Scores for lunch', data=df, hue='lunch')

**PIE CHART FOR TEST PREPARATION COURSE**

In [None]:
# Build an explicit two-column frame (category, count) for the pie chart so the
# column names passed to plotly do not depend on how the installed pandas
# version names the result of value_counts().
course_counts = (
    df['test preparation course']
    .value_counts()
    .rename_axis('test preparation course')
    .reset_index(name='count')
)
px.pie(course_counts, values='count', names='test preparation course', template='ggplot2')

In [None]:
# Score distributions split by whether the test preparation course was taken.
kde_plot_all_scores(
    title_name='Scores for test preparation course',
    data=df,
    hue='test preparation course',
)

In [None]:
# One-hot encode every categorical column; the three score columns are left as they are.
# Keys are the source columns, values the prefixes given to the generated
# dummy columns. drop_first=True omits the first level of each column.
dummy_prefixes = {
    'gender': 'gender',
    'race/ethnicity': 'race/ethnicity',
    'parental level of education': 'parent_education',
    'lunch': 'lunch',
    'test preparation course': 'course',
}
df = pd.get_dummies(df, columns=list(dummy_prefixes), prefix=dummy_prefixes, drop_first=True)

In [None]:
# Preview the first rows of the current frame.
df.head()

In [None]:
# Target column: the mean of the three exam scores, truncated to a whole number.
# The vectorized mean/astype pair replaces the two row-by-row apply() calls;
# astype truncates toward zero exactly as int() did, so the values are unchanged.
# NOTE(review): truncation (rather than rounding or keeping the float) lowers
# every non-integer average - confirm that this is intended for the target.
score_columns = ['math score', 'reading score', 'writing score']
df['avg_score'] = df[score_columns].mean(axis=1).astype('int64')
# Remove the individual scores now that they are summarised in avg_score.
df = df.drop(columns=score_columns)

In [None]:
# Separate the target column from the predictor columns.
target_column = 'avg_score'
y = df[target_column]
X = df.drop(columns=[target_column])

In [None]:
# Split the data into train and test sets.
# 30% of the rows are held out for testing; random_state fixes the shuffle so
# the same split is produced on every run.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=101)

**LightGBM**

In [None]:
# lgb.train() consumes LightGBM Dataset objects, so wrap the splits first.
train_data = lgb.Dataset(X_train, label=y_train)
# Tie the validation set to the training set via `reference`, as the LightGBM
# documentation asks for validation data, so both share the same feature binning.
valid_data = lgb.Dataset(X_test, label=y_test, reference=train_data)

In [None]:
# Parameters for the LightGBM regressor.
# 'is_unbalance' was removed: it is a class-imbalance option for classification
# objectives and has no meaning for a regression objective.
parameters = {
    'objective': 'regression',
    'metric': 'mse',  # mean squared error, evaluated on the validation set
    'boosting': 'gbdt',
    'num_leaves': 63,
    'feature_fraction': 0.5,  # fraction of the features sampled for each tree
    'bagging_fraction': 0.5,  # fraction of the rows sampled when bagging
    'bagging_freq': 20,  # bagging is performed every 20 iterations
    'learning_rate': 0.01,
    'verbose': -1
}

In [None]:
# Train for at most 1000 boosting rounds, stopping early once the validation
# metric has not improved for 50 consecutive rounds.
# Early stopping is requested through a callback because the
# `early_stopping_rounds` keyword of lgb.train() was removed in LightGBM 4.0;
# the callback form works on older releases as well.
model_lgbm = lgb.train(parameters,
                       train_data,
                       valid_sets=[valid_data],
                       num_boost_round=1000,
                       callbacks=[lgb.early_stopping(stopping_rounds=50)])

In [None]:
# Plot the feature importances of the trained model.
lgb.plot_importance(model_lgbm, figsize=(24, 12))

In [None]:
# Predict the average score for the held-out test rows.
predictions = model_lgbm.predict(X_test)

In [None]:
# Report the error of the predictions on the held-out test set.
# The MSE is computed once and reused for its square root.
mse = mean_squared_error(y_test, predictions)
print('Mean squared error:', mse)
print('Root Mean squared error:', np.sqrt(mse))
print('Mean absolute error:', mean_absolute_error(y_test, predictions))