In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from scipy.stats import norm
import warnings
warnings.filterwarnings('ignore')
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the Kaggle input directory.
input_paths = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Load the student exam-performance CSV from the read-only input directory.
train_df = pd.read_csv("../input/students-performance-in-exams/StudentsPerformance.csv")

In [None]:
# Per-column overview of the dataset: dtype, null count, the first three rows
# as sample values, and the number of distinct values.
print(len(train_df))  # total number of rows
summary = pd.DataFrame(
    {
        "dtype": train_df.dtypes,  # named explicitly instead of the default column label 0
        "Null": train_df.isnull().sum(),
        # .iloc selects by position, so this works whatever the index labels are.
        "first": train_df.iloc[0],
        "second": train_df.iloc[1],
        "third": train_df.iloc[2],
        "unique": train_df.nunique(),
    }
)
summary

We can see that some features, such as gender and race/ethnicity, have the object dtype, so we need to take care of them (encode them) before modelling. These object columns have only a few unique values each, and there are no null values. In total there are 1000 rows.

#### Feature Classification: Categorical and Numerical Features

In [None]:
# Split the column names by dtype: object columns go to cat_feats,
# every other column goes to num_feats (original column order is kept).
cat_feats = list(train_df.select_dtypes(include="object").columns)
num_feats = list(train_df.select_dtypes(exclude="object").columns)

#### Univariate Analysis of all features

In [None]:
def get_row_col_idx(idx, n_cols=2):
    """Map a flat plot index to its (row, column) position in a subplot grid.

    Plots are assumed to fill the grid row by row.

    Parameters
    ----------
    idx : int
        Zero-based position of the plot.
    n_cols : int, default 2
        Number of columns in the grid. The default matches the two-column
        layouts used throughout this notebook.

    Returns
    -------
    tuple
        ``(row_idx, col_idx)``.

    Raises
    ------
    ValueError
        If ``n_cols`` is smaller than 1.
    """
    if n_cols < 1:
        raise ValueError("n_cols must be at least 1")
    # divmod yields (idx // n_cols, idx % n_cols) in one step.
    return divmod(idx, n_cols)

In [None]:
def write_percentage(train_df, ax):
    """Annotate every bar in ``ax`` with its share of the dataset.

    Each bar's height is expressed as a percentage of ``len(train_df)`` and
    written, horizontally centred, just above the bar.

    Parameters
    ----------
    train_df : sized
        Object whose length is the denominator of the percentage
        (the DataFrame that was plotted).
    ax : matplotlib Axes
        Axes whose ``patches`` are the bars to annotate.

    Returns
    -------
    None
        The axes are modified in place.
    """
    total = len(train_df)
    if total == 0:
        # Nothing to normalise by; avoid a ZeroDivisionError.
        return
    for patch in ax.patches:
        height = patch.get_height()
        if height == 0:
            # Empty bins would only add "0.0%" clutter to the plot.
            continue
        ax.text(
            x=patch.get_x() + patch.get_width() / 2,  # centre of the bar
            y=height,
            s='{:1.1f}%'.format(height / total * 100),
            fontsize=10,
            ha='center',
            va='bottom',
        )

In [None]:
# Histogram of every column on a two-column grid, each bar labelled with its
# percentage of the dataset. The grid size is derived from the number of
# columns and the colours are cycled, so the cell still runs if the frame
# gains or loses columns.
color = ["red", "blue", "green", "yellow", "orange", "pink", "purple", "brown"]
plot_cols = train_df.columns.to_list()
n_rows = max(1, (len(plot_cols) + 1) // 2)  # ceil(n / 2), at least one row
fig, ax = plt.subplots(n_rows, 2, figsize=(20, 5 * n_rows), squeeze=False)
for idx, cols in enumerate(plot_cols):
    row_idx, col_idx = get_row_col_idx(idx)
    col = color[idx % len(color)]  # wrap around when there are more columns than colours
    sns.histplot(x=cols, data=train_df, ax=ax[row_idx, col_idx], color=col)
    write_percentage(train_df, ax[row_idx, col_idx])
# Hide the trailing panel left empty when the number of columns is odd.
for unused_ax in ax.ravel()[len(plot_cols):]:
    unused_ax.set_visible(False)

A few observations can be made from this chart:

1. There are more females than males in the dataset.
2. The percentage of group A people is the lowest and that of group C people is the highest.
3. People whose parents have a master's degree are the smallest group in the dataset, while people whose parents have some college education or an associate's degree are the largest groups.
4. More people have a standard lunch than a free or reduced lunch.
5. People who finished the test preparation course are fewer in number than the ones who did not.
6. Math scores of around 65-70 are more common than other values.
7. Reading scores of around 70-75 are more common than other values.
8. Writing scores of around 70-75 are more common than other values.

In general, the writing, reading and math scores are approximately normally distributed, but they are slightly skewed to the left.

#### Performance based on gender 

In [None]:
# Display the column labels of the dataset for reference.
train_df.columns

In [None]:

# Compare each exam score between genders, one titled panel per score.
score_cols = ["math score", "reading score", "writing score"]
fig, ax = plt.subplots(1, 3, figsize=(20, 6))
for panel, score in zip(ax, score_cols):
    sns.barplot(y=score, x="gender", data=train_df, palette='Greens', ax=panel)
    panel.set_title(f"{score} by gender")

From here we can see that males scored better in math, while females did better than males in reading and writing.

In [None]:
# Compare each exam score between race/ethnicity groups, one titled panel per score.
score_cols = ["math score", "reading score", "writing score"]
fig, ax = plt.subplots(1, 3, figsize=(20, 6))
for panel, score in zip(ax, score_cols):
    sns.barplot(y=score, x="race/ethnicity", data=train_df, palette='Greys', ax=panel)
    panel.set_title(f"{score} by race/ethnicity")

In [None]:

# Compare each exam score between race/ethnicity groups, split by gender
# within each group, one titled panel per score.
score_cols = ["math score", "reading score", "writing score"]
fig, ax = plt.subplots(1, 3, figsize=(20, 6))
for panel, score in zip(ax, score_cols):
    sns.barplot(y=score, x="race/ethnicity", data=train_df, palette='Greens', ax=panel, hue="gender")
    panel.set_title(f"{score} by race/ethnicity and gender")


From the charts above it is evident that group E performed well in all the tests, be it reading, writing or math, whereas group A performed poorly in all of them. Looking at gender-wise performance within each race/ethnicity group, females perform better than males in the reading and writing tests in every group, while males perform better in the math test in every group.

In [None]:
# Box plot of each exam score grouped by lunch type.
# The score columns are named explicitly rather than taken as the last three
# columns, so the cell still plots the right data if columns are appended
# to train_df.
score_cols = ["math score", "reading score", "writing score"]
fig, ax = plt.subplots(1, 3, figsize=(20, 5))
for panel, item in zip(ax, score_cols):
    sns.boxplot(x=train_df["lunch"], y=train_df[item], ax=panel)

The plots above tell us that students who have a standard lunch tend to perform better than those who have a free/reduced lunch.

In [None]:
# Box plot of each exam score grouped by test-preparation status.
# The score columns are named explicitly rather than taken as the last three
# columns, so the cell still plots the right data if columns are appended
# to train_df.
score_cols = ["math score", "reading score", "writing score"]
fig, ax = plt.subplots(1, 3, figsize=(20, 5))
for panel, item in zip(ax, score_cols):
    sns.boxplot(x=train_df["test preparation course"], y=train_df[item], ax=panel)

The plots above suggest that students who completed the test preparation course tend to perform better than those who did not, which makes sense.

#### KDE plot for numerical data

In [None]:
# Overlaid density estimates of the numerical features.
# `fill=True` replaces the deprecated `shade=True` argument.
sns.kdeplot(data=train_df[num_feats], fill=True)


We can see the skewness in the plot: the distributions are skewed to the left, though not strongly.

In [None]:
# Pair plot of the dataset with the points coloured by gender.
sns.pairplot(hue = "gender", data = train_df )

The following observations can be made from the above pair plot:
1. All the scores are positively and roughly linearly related: as one score increases, so do the others.
2. In the plots of math score against the other scores, we can also see that gender separates the data.
3. Most of the scores lie between 50 and 100.

In [None]:
# Add the combined score as a new column, then draw its distribution across
# four categorical features on a 2x2 grid (filled row by row).
train_df["total marks"] = sum(
    train_df[score] for score in ("math score", "reading score", "writing score")
)
group_cols = ["gender", "race/ethnicity", "lunch", 'test preparation course']
fig, ax = plt.subplots(2, 2, figsize=(20, 10))
for val, panel in zip(group_cols, ax.ravel()):
    sns.boxplot(x=val, y='total marks', data=train_df, palette='Set3', ax=panel)

In [None]:
# Total marks grouped by parental level of education, drawn on a 20x5 figure.
edu_fig, edu_ax = plt.subplots(figsize=(20, 5))
sns.boxplot(data=train_df, x="parental level of education", y="total marks", ax=edu_ax)

From the plot above we can see that students whose parents have a master's degree performed better than the others. Also, students whose parents have a high school education performed poorly. Before modelling, the categorical variables need to be encoded (e.g. with a label encoder or one-hot encoding).

### Model Training

In [None]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor 
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

##### Getting Target

In [None]:
# Take the combined score as the regression target, then rebind train_df to
# the one-hot encoded categorical features used as predictors.
target = train_df["total marks"]
train_df = pd.get_dummies(train_df.loc[:, cat_feats])

##### Data Splitting

In [None]:
# Split features and target into train and test sets; random_state=0 fixes
# the split so it is the same on every run.
X_train, X_test, y_train, y_test = train_test_split(train_df, target, random_state = 0 )

In [None]:
# Sanity check: shape of the target vector.
target.shape
# X_test.shape

### Linear Model

In [None]:
# Baseline: linear regression on the encoded features, scored on the test set.
lm = LinearRegression()
lm.fit(X_train, y_train)
lm_pred = lm.predict(X_test)  # predict once and reuse for both metrics
# scikit-learn metrics take (y_true, y_pred). MSE is symmetric, but R^2 is
# not, so passing the arguments in the wrong order reports a wrong score.
mse = mean_squared_error(y_test, lm_pred)
r2 = r2_score(y_test, lm_pred)
print(mse, r2)

In [None]:
# Decision tree regressor on the same split, scored on the test set.
# random_state makes the fitted tree reproducible between runs.
dt = DecisionTreeRegressor(random_state=0)
dt.fit(X_train, y_train)
dt_pred = dt.predict(X_test)  # predict once and reuse for both metrics
# scikit-learn metrics take (y_true, y_pred); R^2 depends on the order.
dt_mse = mean_squared_error(y_test, dt_pred)
# Stored under its own name: assigning the result to `r2_score` would shadow
# the imported metric function and break every later call to it.
dt_r2 = r2_score(y_test, dt_pred)
print(dt_mse, dt_r2)

### Conclusion 
Here are a few points to conclude:
1. Parents' education level may affect the performance of students, but it is not the most important factor.
1. Finishing the test preparation course is beneficial.
1. Having lunch is important to students, and it is also the most significant factor.
1. Gender has no correlation with the score.

Females perform better in reading and writing, so males could get some insights from them; females need to do better in the math test, so they could ask males. All in all, students who want to perform well should get enough nutrition and make an effort to prepare for the test.

### Looking forward to your comments so I can improve! Any feedback will be appreciated!