In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the Kaggle input directory.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import warnings

# Silence every warning for the rest of the session.
warnings.filterwarnings("ignore")

# Read the raw student-performance table and preview its first rows.
csv_path = "../input/students-performance-in-exams/StudentsPerformance.csv"
data = pd.read_csv(csv_path)
data.head()

In [None]:
# Show the distinct categories present in the lunch column.
data.lunch.unique()

In [None]:
# Summarise the columns: dtypes and non-null counts.
data.info()

In [None]:
# Work on a copy so the raw `data` frame is left untouched by the encoding below.
df=data.copy()
# Number of distinct values per column (used to decide how each column is encoded).
df.nunique()

* For columns with two categories, **LabelEncoder** is suitable.
* For columns with more than two categories, **OneHotEncoder** or **get_dummies** is suitable (with one level dropped to avoid the "dummy variable trap").

In [None]:
from sklearn.preprocessing import LabelEncoder

# A single encoder is re-fitted on each column in turn, so after this cell
# it only remembers the classes of the last column processed.
le = LabelEncoder()
for column in ("gender", "test preparation course"):
    df[column] = le.fit_transform(df[column])

In [None]:
# Replace the lunch categories with integer codes (the shared encoder is re-fitted on this column).
df["lunch"]=le.fit_transform(df["lunch"])

In [None]:
# Preview the frame after label encoding.
df.head()

In [None]:
# One-hot encode the multi-category columns; drop_first=True omits the first level of each
# so the remaining dummy columns are not perfectly collinear.
# NOTE: this overwrites `df`, so re-running the cell fails because the source columns are gone.
df=pd.get_dummies(df, columns=["race/ethnicity","parental level of education"],drop_first=True)

In [None]:
# Separate the three exam scores (targets) from the remaining feature columns.
score_columns = ["math score", "reading score", "writing score"]
parameters = df.drop(columns=score_columns)
results = df[score_columns]

In [None]:
# Average the three exam scores into a single target column.
# `assign` builds a new frame instead of writing into a slice of `df`, and the
# score columns are named explicitly so that re-running this cell never folds
# an existing "avg_score" column into the mean.
results = results.assign(
    avg_score=results[["math score", "reading score", "writing score"]].mean(axis=1)
)
results.head()

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Feature frame plus the average score, used for all exploratory plots below.
parameters_res = parameters.copy()
parameters_res["avg_score"] = results["avg_score"].copy()

# Mean average score per gender code as a horizontal bar chart.
mean_score_by_gender = parameters_res.groupby("gender")["avg_score"].mean()
mean_score_by_gender.plot(kind="barh")
# Gender codes come from LabelEncoder: 0 is labelled "female", 1 "male".
plt.yticks(ticks=[0, 1], labels=["female", "male"]);


It seems that females are slightly more successful than males. Let's look more closely.

In [None]:
# Average score by test-preparation status, split by gender.
plt.figure(figsize=(15, 5))
ax = sns.boxplot(data=parameters_res, x="test preparation course", y="avg_score", hue="gender")

# Replace the numeric gender codes in the legend with readable names.
legend = ax.legend()
for text, label in zip(legend.get_texts(), ("female", "male")):
    text.set_text(label)

# Encoded course values: position 0 is shown as "Yes", position 1 as "No"
# (presumably "completed" / "none" in the raw data; confirm against data.head()).
ax.set_xticks([0, 1])
ax.set_xticklabels(["Yes", "No"]);

Seems like courses enhance the test performance. Let's look at the impact of the other parameters.

In [None]:
# Average score by lunch type, split by gender.
plt.figure(figsize=(15, 5))
ax = sns.boxplot(data=parameters_res, x="lunch", y="avg_score", hue="gender")

# Replace the numeric gender codes in the legend with readable names.
legend = ax.legend()
for text, label in zip(legend.get_texts(), ("female", "male")):
    text.set_text(label)

# Encoded lunch values: position 0 is shown as "free/reduced", position 1 as "standard".
ax.set_xticks([0, 1])
ax.set_xticklabels(["free/reduced", "standard"])
plt.show()

Lunch type is positively correlated with test performance: students with a standard lunch tend to score higher than those with a free/reduced lunch.

In [None]:
# Average score for students outside vs. inside race/ethnicity group B, split by gender.
plt.figure(figsize=(15, 5))
ax = sns.boxplot(data=parameters_res, x="race/ethnicity_group B", y="avg_score", hue="gender")

# Replace the numeric gender codes in the legend with readable names.
legend = ax.legend()
for text, label in zip(legend.get_texts(), ("female", "male")):
    text.set_text(label)

# Dummy column: first position = not in the group, second = in the group.
ax.set_xticks([0, 1])
ax.set_xticklabels(["No", "Yes"]);

In [None]:
# Average score for students outside vs. inside race/ethnicity group C, split by gender.
plt.figure(figsize=(15, 5))
ax = sns.boxplot(data=parameters_res, x="race/ethnicity_group C", y="avg_score", hue="gender")

# Replace the numeric gender codes in the legend with readable names.
legend = ax.legend()
for text, label in zip(legend.get_texts(), ("female", "male")):
    text.set_text(label)

# Dummy column: first position = not in the group, second = in the group.
ax.set_xticks([0, 1])
ax.set_xticklabels(["No", "Yes"]);

In [None]:
# Average score for students outside vs. inside race/ethnicity group D, split by gender.
plt.figure(figsize=(15, 5))
ax = sns.boxplot(data=parameters_res, x="race/ethnicity_group D", y="avg_score", hue="gender")

# Replace the numeric gender codes in the legend with readable names.
legend = ax.legend()
for text, label in zip(legend.get_texts(), ("female", "male")):
    text.set_text(label)

# Dummy column: first position = not in the group, second = in the group.
ax.set_xticks([0, 1])
ax.set_xticklabels(["No", "Yes"]);

* Race and ethnic group have no impact on test performance.
* Independent of the other parameters, females are definitely more successful. However, almost every boxplot shows more outliers among women than among men (with the default `whis=1.5`). To make the modelling more accurate, the most extreme values should be capped.

In [None]:
# Per-gender 2nd and 98th percentiles of the average score.
# The groupby is built once instead of being recomputed for every bound.
avg_by_gender = parameters_res.groupby("gender")["avg_score"]
lower_bounds = avg_by_gender.quantile(0.02)
upper_bounds = avg_by_gender.quantile(0.98)

# Gender codes: 0 = female, 1 = male (label lookup on the group index).
min_val_women, max_val_women = lower_bounds.loc[0], upper_bounds.loc[0]
min_val_men, max_val_men = lower_bounds.loc[1], upper_bounds.loc[1]

# Explicit copies: these subsets are modified in the following cells and must
# not be views/slices of `parameters_res`.
female = parameters_res[parameters_res.gender == 0].copy()
male = parameters_res[parameters_res.gender == 1].copy()


In [None]:
# Raise scores below each gender's own 2nd percentile up to that percentile.
# Only the "avg_score" column is written: assigning to the whole masked row
# would overwrite every feature (gender, lunch, dummies, ...) with the score bound.
female.loc[female["avg_score"] < min_val_women, "avg_score"] = min_val_women
# Males are compared against the male bound, not the female one.
male.loc[male["avg_score"] < min_val_men, "avg_score"] = min_val_men

In [None]:
# Lower scores above each gender's own 98th percentile down to that percentile.
# Only the "avg_score" column is written: assigning to the whole masked row
# would overwrite every feature (gender, lunch, dummies, ...) with the score bound.
female.loc[female["avg_score"] > max_val_women, "avg_score"] = max_val_women
# Males are compared against the male bound, not the female one.
male.loc[male["avg_score"] > max_val_men, "avg_score"] = max_val_men

Previously

In [None]:
# Distribution of the uncapped female scores. The frame is filtered first so
# the plotted values are exactly the female rows (no reliance on index alignment
# between a filtered x vector and an unfiltered y vector), and the call has the
# same form as the "After" plot so the two are directly comparable.
sns.boxplot(y="avg_score", data=parameters_res[parameters_res.gender == 0]);

After

In [None]:
# Distribution of the female scores after the capping cells above were applied to `female`.
sns.boxplot(y="avg_score", data=female);

Much better.

In [None]:
# Stack the two capped subsets back into one modelling frame (female rows first, original index kept).
new_df=pd.concat([female,male])

In [None]:
# Preview the recombined frame.
new_df.head()

# 2. Modelling

# 2.1.LinearRegression

In [None]:
from sklearn.model_selection import train_test_split,GridSearchCV
from sklearn.metrics import mean_absolute_error
from sklearn.linear_model import LinearRegression

# Target is the average score; every other column is a feature.
y = new_df["avg_score"]
X = new_df.drop(columns="avg_score")

# 80/20 split with a fixed seed; the same split is reused by every model below.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# NOTE: `le` is rebound here from the LabelEncoder used earlier to the fitted
# linear model; later cells read le.coef_ and le.intercept_.
le = LinearRegression().fit(X_train, y_train)
y_pred1 = le.predict(X_test)

# Mean absolute error on the held-out set, arguments in (y_true, y_pred) order.
mean_absolute_error(y_test, y_pred1)

 # 2.2.1. RF Model with Default Parameters

In [None]:
from sklearn.ensemble import RandomForestRegressor

# Baseline random forest with default hyper-parameters.
# The seed is fixed so the reported error is the same on every Restart & Run All.
rf=RandomForestRegressor(random_state=42)
rf.fit(X_train,y_train)
y_pred2=rf.predict(X_test)

# Mean absolute error on the held-out set, arguments in (y_true, y_pred) order.
mean_absolute_error(y_test,y_pred2)

# 2.2.2. RF Model with Optimized Parameters

In [None]:
# Hyper-parameter grid explored for the random forest (4 x 3 x 4 = 48 candidates, 3 folds each).
rf_parameters={"n_estimators":[100,200,500,750],
              "max_depth":[3,5,7],
              "min_samples_split":[2,4,3,5]}

# The forest is seeded so the search result is reproducible, and candidates are
# ranked by mean absolute error (the metric reported throughout this notebook)
# rather than the regressor's default R^2 score.
rf_cv=GridSearchCV(RandomForestRegressor(random_state=42),rf_parameters,cv=3,
                   scoring="neg_mean_absolute_error",n_jobs=-1)
rf_cv.fit(X_train,y_train)

In [None]:
# Parameter combination that achieved the best cross-validated score in the search.
rf_cv.best_params_

In [None]:
# Refit on the full training set with the parameters the grid search actually
# selected (rf_cv.best_params_) rather than values typed in from an earlier run,
# which go stale whenever the search result changes. Seeded for reproducibility.
rf_tuned=RandomForestRegressor(**rf_cv.best_params_,random_state=42).fit(X_train,y_train)
y_pred21=rf_tuned.predict(X_test)

# Mean absolute error on the held-out set, arguments in (y_true, y_pred) order.
mean_absolute_error(y_test,y_pred21)

# 2.3.1. LGBM Model with Default Parameters

In [None]:
from lightgbm import LGBMRegressor

# Baseline LightGBM regressor with default hyper-parameters.
lgbm_model = LGBMRegressor()
lgbm_model.fit(X_train, y_train)
y_pred3 = lgbm_model.predict(X_test)

# Mean absolute error on the held-out set, arguments in (y_true, y_pred) order.
mean_absolute_error(y_test, y_pred3)

# 2.3.2. LGBM Model with Optimized Parameters

In [None]:
# Hyper-parameter grid explored for LightGBM (4 x 4 x 4 = 64 candidates, 3 folds each).
lg_parameters={"n_estimators":[100,200,500,750],
              "learning_rate":[0.1,0.01,0.5,0.2],
              "min_child_weight":[0.001,0.002,0.003,0.004]}

# Candidates are ranked by mean absolute error (the metric reported throughout
# this notebook) rather than the regressor's default R^2 score.
lgbm_cv=GridSearchCV(LGBMRegressor(),lg_parameters,cv=3,
                     scoring="neg_mean_absolute_error",n_jobs=-1)
lgbm_cv.fit(X_train,y_train)

In [None]:
# Parameter combination that achieved the best cross-validated score in the search.
lgbm_cv.best_params_

In [None]:
# Refit with the parameters the grid search actually selected
# (lgbm_cv.best_params_) rather than values typed in from an earlier run,
# which go stale whenever the search result changes.
lgbm_tuned=LGBMRegressor(**lgbm_cv.best_params_)
lgbm_tuned.fit(X_train,y_train)
ypred4=lgbm_tuned.predict(X_test)

# Mean absolute error on the held-out set, arguments in (y_true, y_pred) order.
mean_absolute_error(y_test,ypred4)

# 3. Comparison

In [None]:
# Build the comparison table from the predictions made in the modelling cells
# instead of hand-typed numbers, so it always reflects the current run.
model_predictions = {
    "LinReg": y_pred1,
    "RF (default)": y_pred2,
    "RF (tuned)": y_pred21,
    "LGBM (default)": y_pred3,
    "LGBM (tuned)": ypred4,
}
comp = pd.DataFrame({
    "Models": list(model_predictions),
    # Mean absolute error on the held-out test set for each model.
    "Error": [mean_absolute_error(y_test, pred) for pred in model_predictions.values()],
})
comp

Linear Regression Model is the most successful one.

In [None]:
# Fitted coefficients of the linear model (`le` is the LinearRegression here, one value per feature column of X).
le.coef_

In [None]:
# Fitted intercept of the linear model.
le.intercept_

# 4. Conclusion

* Lunch type (standard vs. free/reduced) and attending a test preparation course have some impact on test performance. On the other hand, race and ethnicity have no impact on test performance. Nothing impressive was found.
* Overall, females seem more successful than males on the tests.
* Females have more outliers than males, which shows that the spread in performance is larger in the female group.
* The parameters in the dataset are not enough to predict human performance. If a model is nevertheless needed, Linear Regression gives the most accurate predictions.
