In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Enumerate every file found under the Kaggle input directory, one path per line.
input_file_paths = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for file_path in input_file_paths:
    print(file_path)

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Read the students-performance CSV into the working DataFrame used by every later cell.
students_csv = '../input/students-performance-in-exams/StudentsPerformance.csv'
base = pd.read_csv(students_csv)

In [None]:
#Importing Libraries

import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
# Count the missing values in each column.

base.isnull().sum()

In [None]:
# Replace the verbose CSV headers with short, identifier-friendly column names.
column_aliases = {
    'race/ethnicity': 'race',
    'parental level of education': 'parents_level_education',
    'test preparation course': 'course',
    'math score': 'math',
    'reading score': 'reading',
    'writing score': 'writing',
}
base = base.rename(columns=column_aliases)

In [None]:
# Derive two per-student aggregates from the three exam columns:
# the sum of the scores and their arithmetic mean.

base['total_score'] = base['math'].add(base['reading']).add(base['writing'])
base['mean_score'] = base['total_score'].div(3)

In [None]:
# Preview the first five rows of the frame.
base.head(n=5)

In [None]:
# Number of students per gender.
# The column is passed as `x=` explicitly: seaborn 0.11 deprecated positional
# data vectors, and from 0.12 on the only positional parameter is `data`, so a
# bare Series is no longer interpreted as the x variable.
sns.countplot(x = base.gender)
plt.xlabel('GENDER')
plt.ylabel('QUANTITY')

We have more women than men in the dataset.

In [None]:
# Split the frame by gender. Each subset keeps every column of `base`,
# not only the mean_score column its name suggests.
female_rows = base.gender == 'female'
male_rows = base.gender == 'male'
mean_score_female = base.loc[female_rows]
mean_score_male = base.loc[male_rows]

In [None]:
# Female students: mean_score by preparation-course status, with one bar per race group.
plt.figure(figsize=(12, 8))
sns.barplot(data=mean_score_female, x='course', y='mean_score', hue='race')
plt.xlabel('PREPARATION COURSE')
plt.ylabel('MEAN')

Based on this graph, when we evaluate only women, we can see that the mean score is higher in every group for those who completed the preparation course. Also, Group E has the highest mean.

In [None]:
# Male students: mean_score by preparation-course status, with one bar per race group.
plt.figure(figsize=(12, 8))
sns.barplot(data=mean_score_male, x='course', y='mean_score', hue='race')
plt.xlabel('PREPARATION COURSE')
plt.ylabel('MEAN')


Based on this graph, when we evaluate only men, we can see that the mean score for Group E students who didn't complete the course is higher than that of Group A students who completed it. This is one difference between male and female students. Here, too, Group E has the highest mean.

In [None]:
# Per-gender average of every numeric column, drawn as two lines on the same axes.
# numeric_only=True is needed on pandas >= 2.0, where DataFrame.mean() no longer
# silently drops the string columns and raises TypeError instead.
# label/legend identify which line belongs to which gender.
base[(base.gender == 'female')].mean(numeric_only = True).plot(color = 'red', label = 'female', legend = True)
base[(base.gender == 'male')].mean(numeric_only = True).plot(color = 'black', label = 'male', legend = True)

Analyzing this graph, we can see that men have the highest average in mathematics. Women have a better average than men in the rest and also the total average.

In [None]:
# Summary statistics for the female subset: plotted, then displayed as a table.
# figsize goes to DataFrame.plot() directly: plot() opens its own figure, so the
# former plt.figure(figsize=...) call only left an extra empty figure behind
# while the actual chart kept the default size.
female_summary = mean_score_female.describe()
female_summary.plot(figsize = (12, 8))
female_summary

In [None]:
# Summary statistics for the male subset: plotted, then displayed as a table.
# figsize goes to DataFrame.plot() directly: plot() opens its own figure, so the
# former plt.figure(figsize=...) call only left an extra empty figure behind
# while the actual chart kept the default size.
male_summary = mean_score_male.describe()
male_summary.plot(figsize = (12, 8))
male_summary

In [None]:
# Number of students per parental education level, as a chart and as raw counts.
plt.figure(figsize = (10,8))
# Passed as `x=`: seaborn >= 0.12 accepts only `data` positionally.
sns.countplot(x = base.parents_level_education)
# Printed explicitly: a bare expression in the middle of a cell is evaluated
# but never displayed, so the counts were previously lost.
print(base.parents_level_education.value_counts())
plt.xlabel('PARENTS EDUCATION')
plt.ylabel('QUANTITY')

In [None]:
# One sub-frame per parental education level, selected by exact label match.
education_level = base.parents_level_education

parents_masters = base.loc[education_level == "master's degree"]
parents_bach = base.loc[education_level == "bachelor's degree"]
parents_some_college = base.loc[education_level == "some college"]
parents_associate = base.loc[education_level == "associate's degree"]
parents_high_school = base.loc[education_level == "high school"]
parents_some_high_school = base.loc[education_level == "some high school"]

Now, I will plot some graphs to analyze if the parents' level of education influences the grade.

In [None]:
# Summary statistics for students whose parents hold a master's degree: chart, then table.
masters_summary = parents_masters.describe()
masters_summary.plot()
masters_summary

In [None]:
# Summary statistics for students whose parents hold a bachelor's degree: chart, then table.
bach_summary = parents_bach.describe()
bach_summary.plot()
bach_summary

In [None]:
# Summary statistics for students whose parents attended some college: chart, then table.
some_college_summary = parents_some_college.describe()
some_college_summary.plot()
some_college_summary

In [None]:
# Summary statistics for students whose parents hold an associate's degree: chart, then table.
associate_summary = parents_associate.describe()
associate_summary.plot()
associate_summary

In [None]:
# Summary statistics for students whose parents finished high school: chart, then table.
high_school_summary = parents_high_school.describe()
high_school_summary.plot()
high_school_summary

In [None]:
# Summary statistics for students whose parents attended some high school: chart, then table.
some_high_school_summary = parents_some_high_school.describe()
some_high_school_summary.plot()
some_high_school_summary

Based on the graphs above, I can say that the parents' level of education influences the grade. Children of parents with a master's degree have the highest marks, and they also have the highest score among the lowest grades (that is, the highest minimum). As for parents who have not finished high school, their children have the lowest average grade and the lowest minimum grade in the database. From this, we can conclude that the parents' educational level has a positive impact on the children's grades.

In [None]:
# Pairwise correlation of the numeric columns.
# The string columns are filtered out first: on pandas >= 2.0, corr() no longer
# drops them silently and fails when it tries to convert them to float.
base.select_dtypes(include = 'number').corr()

In [None]:
# Total score against parental education level, coloured by preparation-course status.
plt.figure(figsize=(10, 8))
sns.scatterplot(data=base, x='parents_level_education', y='total_score', hue='course')
plt.xlabel('PARENTS EDUCATION')
plt.ylabel('TOTAL SCORE')


We can see in the graph above that the preparation course has a positive impact for all students who complete it. However, it has more impact on the total score for students whose parents have a low level of education than for students whose parents have the highest level of education. The worst grades are among students who have not completed the course, regardless of their parents' level of education, which confirms the positive impact of the preparation course.

In [None]:
# Writing, reading and math scores against parental education level, overlaid on one axes.
# Each layer gets a label and a legend is drawn; without them the three colours
# could not be mapped back to their subjects.
plt.figure(figsize = (10,8))
sns.scatterplot(x = base.parents_level_education, y = base.writing, color = 'red', label = 'writing')
sns.scatterplot(x = base.parents_level_education, y = base.reading, color = 'black', label = 'reading')
sns.scatterplot(x = base.parents_level_education, y = base.math, color = 'blue', label = 'math')
plt.legend(title = 'SUBJECT')
plt.xlabel('PARENTS EDUCATION')
plt.ylabel('GRADES')

Through this graph, we see that the worst grades are always in mathematics, regardless of the parents' educational level.


In [None]:
# Total score against lunch type.
plt.figure(figsize=(10, 8))
sns.scatterplot(data=base, x='lunch', y='total_score')
plt.xlabel('LUNCH')
plt.ylabel('TOTAL SCORE')

Lunch also influences the total score. As we can see, there are more students who have a grade lower than 130 with the free/reduced lunch type than students with the standard lunch. In addition, there are many more students with grades above 260 with standard lunch.

Now, I am going to try to predict the total score based on attributes such as: gender, race, parents' level of education, lunch, preparation course, writing, math and reading.

In [None]:
# Remove the derived mean_score column before the modelling cells.
# errors='ignore' keeps the cell re-runnable: without it a second execution
# raises KeyError because the column is already gone.
base.drop(columns = 'mean_score', inplace = True, errors = 'ignore')

In [None]:
# Store the mean of the three exam scores in total_score.
# The value is recomputed from the subject columns instead of dividing the
# existing column in place, so re-running this cell cannot divide by 3 twice.
base['total_score'] = (base['math'] + base['reading'] + base['writing']) / 3
base.head()

In [None]:
#Importing libraries

from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR

In [None]:
# Feature matrix X and target Y, selected by column name rather than position.
# Positional slicing (iloc[:, 0:8] / iloc[:, 8]) silently picks the wrong target
# whenever the column layout differs, e.g. if mean_score has not been dropped.
# The five categorical columns come first because the label-encoding cell
# addresses them as columns 0-4 of X.

feature_columns = [
    'gender', 'race', 'parents_level_education', 'lunch', 'course',
    'math', 'reading', 'writing',
]
X = base[feature_columns].values
Y = base['total_score'].values

In [None]:
# LabelEncoder - transforming categorical variables
# The first five columns of X are replaced by integer codes. A single encoder
# object is refitted per column, so afterwards it only holds the mapping of
# the last column it saw.

label_encoder_X = LabelEncoder()
for column_index in range(5):
    X[:, column_index] = label_encoder_X.fit_transform(X[:, column_index])

In [None]:
# Standardise every feature column.
# NOTE(review): the scaler is fitted on the full matrix before the train/test
# split, so the test rows contribute to the fitted statistics - consider
# fitting on the training rows only.

scaler = StandardScaler()
scaler.fit(X)
X = scaler.transform(X)

In [None]:
# Train and test
# 5% of the rows are held out for evaluation. random_state fixes the shuffle so
# the split, and every metric computed from it, is identical on each run.

train_x,test_x,train_y,test_y = train_test_split(X, Y, test_size = 0.05, random_state = 42)

In [None]:
# Decision Tree Regression
# random_state makes the fitted tree reproducible across runs.

regressor = DecisionTreeRegressor(random_state = 42)
regressor.fit(train_x, train_y)

# R^2 on the training rows only shows how well the model reproduces data it has
# already seen, so the R^2 on the held-out rows is reported next to it.
score = regressor.score(train_x, train_y)
test_score = regressor.score(test_x, test_y)

predicts = regressor.predict(test_x)
mae = mean_absolute_error(test_y, predicts)
print(f'train R2: {score:.4f} | test R2: {test_score:.4f} | test MAE: {mae:.4f}')

In [None]:
# Random Forest Regressor
# random_state fixes the bootstrap samples and feature choices so repeated runs
# produce the same forest.

regressor = RandomForestRegressor(n_estimators = 10, random_state = 42)
regressor.fit(train_x, train_y)

# R^2 on the training rows only shows how well the model reproduces data it has
# already seen, so the R^2 on the held-out rows is reported next to it.
score = regressor.score(train_x, train_y)
test_score = regressor.score(test_x, test_y)

predicts = regressor.predict(test_x)
mae = mean_absolute_error(test_y, predicts)

print(f'train R2: {score:.4f} | test R2: {test_score:.4f} | test MAE: {mae:.4f}')

In [None]:
# SVR (linear kernel)

regressor = SVR(kernel = 'linear')
regressor.fit(train_x, train_y)

# R^2 on the training rows only shows how well the model reproduces data it has
# already seen, so the R^2 on the held-out rows is reported next to it.
score = regressor.score(train_x, train_y)
test_score = regressor.score(test_x, test_y)

predicts = regressor.predict(test_x)
mae = mean_absolute_error(test_y, predicts)

print(f'train R2: {score:.4f} | test R2: {test_score:.4f} | test MAE: {mae:.4f}')