## Environment setup

In [None]:

import numpy as np
import pandas as pd

import plotly.express as px
import matplotlib.pyplot as plt
import seaborn as sns

# Modelling 
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score


from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression

import warnings


import os
# Recursively list every file under the Kaggle input directory so the
# dataset paths available to this notebook show up in the cell output.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        full_path = os.path.join(dirname, filename)
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Data Preparation

In [None]:
# Read the students-performance CSV from the Kaggle input directory into a
# DataFrame, then preview its first rows as the cell output.
csv_path = "/kaggle/input/students-performance-in-exams/StudentsPerformance.csv"
data = pd.read_csv(csv_path)
data.head()

### Identification of data types and size

In [None]:
# Show each column's dtype, then the frame's shape on the following line.
print(data.dtypes, data.shape, sep="\n")

In [None]:
# Count the distinct values present in every column (column-wise, axis 0).
data.nunique(axis=0)

In [None]:
# Column names grouped by role: the categorical columns and the three
# numeric score columns summarised in the cells that follow.
categorical_vars = [
    'gender',
    'race/ethnicity',
    'parental level of education',
    'lunch',
    'test preparation course',
]
numerical_vars = [
    'math score',
    'reading score',
    'writing score',
]

### Statistical summary of numeric variables

In [None]:
# Summary statistics for the score columns, transposed so each score is a
# row and each statistic is a column.
data[numerical_vars].describe().T

### Graphical univariate analysis of scores

#### Histograms (binned frequency counts)

In [None]:
# One row of three panels: a 20-bin histogram for each score column.
fig, ax = plt.subplots(nrows=1, ncols=3, figsize=(12, 6))
math_ax, reading_ax, writing_ax = ax
sns.histplot(data=data, x='math score', bins=20, ax=math_ax)
sns.histplot(data=data, x='reading score', bins=20, ax=reading_ax)
sns.histplot(data=data, x='writing score', bins=20, ax=writing_ax)

#### Boxplots

In [None]:
# One row of three panels: a box plot for each score column.
fig, ax = plt.subplots(nrows=1, ncols=3, figsize=(20, 6))
math_ax, reading_ax, writing_ax = ax
sns.boxplot(x=data['math score'], ax=math_ax)
sns.boxplot(x=data['reading score'], ax=reading_ax)
sns.boxplot(x=data['writing score'], ax=writing_ax)

### Visualizing scores of separate categories

In [None]:
# Math score plotted per gender; catplot creates its own figure, so no
# explicit subplots call is needed here.
sns.catplot(x='gender', y='math score', data=data)

In [None]:
# Box plot of math score (x-axis) for each race/ethnicity group (y-axis).
sns.boxplot(y='race/ethnicity', x='math score', data=data)

In [None]:
# Number of rows in each race/ethnicity group.
sns.countplot(x='race/ethnicity', data=data)

In [None]:
# Number of rows in each race/ethnicity group, split by gender.
sns.countplot(x='race/ethnicity', hue='gender', data=data)

In [None]:
# Math score by gender, restricted to rows whose race/ethnicity is 'group A'.
group_a = data[data['race/ethnicity'] == 'group A']
rp = sns.catplot(data=group_a, x='math score', y='gender')
# Title the whole figure rather than the individual axes.
rp.fig.suptitle('Group A Math score by Gender')

In [None]:
# Reading score by test-preparation status, restricted to rows whose
# race/ethnicity is 'group A'.
group_a = data[data['race/ethnicity'] == 'group A']
rp = sns.catplot(data=group_a, x='test preparation course', y='reading score')
# Title the whole figure rather than the individual axes.
rp.fig.suptitle('Group A reading by prep course')

In [None]:
# Number of rows in each race/ethnicity group, split by lunch type.
sns.countplot(x='race/ethnicity', hue='lunch', data=data)