In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the read-only Kaggle input directory.
input_paths = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

## 1. Import the Libraries

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt 
%matplotlib inline
import seaborn as sns

In [None]:
# Load the student exam-performance CSV into `data`, the frame the following cells work on.
data = pd.read_csv("/kaggle/input/students-performance-in-exams/StudentsPerformance.csv")

In [None]:
# Preview the first rows to check that the file was parsed as expected.
data.head()

In [None]:
# (rows, columns) of the loaded frame.
data.shape

In [None]:
# Per-column dtypes, to see which columns were read as numbers and which as text.
data.dtypes

In [None]:
# Rows that exactly repeat an earlier row (the first occurrence is not flagged).
duplicate_rows = data[data.duplicated()]
# Report the row count itself; `.shape` would print a (rows, columns) tuple
# under a label that promises a single number.
print('No. of duplicate rows: ', len(duplicate_rows))

Hence, there are no duplicate rows.

In [None]:
# Number of missing values in each column.
data.isnull().sum()

There are no Null values.


## 2. Exploratory Data Analysis

### Univariate Analysis

In [None]:
# Bar chart of category counts for each demographic column, one panel per column.
# Each entry is (column in `data`, panel title).
count_panels = [
    ('gender', "Gender"),
    ('race/ethnicity', "Race/ethnicity"),
    ('parental level of education', "Parental level of education "),
    ('lunch', "Lunch"),
    ('test preparation course', "Test preparation course "),
]

# A single row with exactly as many panels as charts, so the figure has no
# empty second row. Panel size stays roughly 5x5 inches as before.
fig, axes = plt.subplots(1, len(count_panels), figsize=(25, 5))
for ax, (column, heading) in zip(axes, count_panels):
    data[column].value_counts().plot.bar(ax=ax)
    # Set labels after plotting so the plotting call cannot overwrite them.
    ax.set_title(heading, fontsize=15)
    ax.set_ylabel("Count")
plt.show()


The following information can be inferred from the above graphs:

1) There are more females than males.

2) Most students belong to group C (more than 300), followed by groups D, B, E and A.

3) Students whose parents hold a master's degree are the smallest group. The most common parental education level is "some college".

4) Over 300 students have free/reduced lunch, whereas the majority have standard lunch.

5) Majority of the students have not taken/completed any preparation course.

In [None]:
# Quartiles and inter-quartile range of the numeric columns.
# numeric_only=True keeps the call working on pandas versions that raise on
# text columns instead of silently dropping them.
print("Q1 : 25 percentile")
Q1 = data.quantile(0.25, numeric_only=True)
print(Q1)
# Q2 is the median, i.e. the 0.50 quantile (not 0.25, which is Q1 again).
print("Q2 : Median (50 percentile)")
Q2 = data.quantile(0.50, numeric_only=True)
print(Q2)
print("Q3 : 75 percentile")
Q3 = data.quantile(0.75, numeric_only=True)
print(Q3)
print("IQR : difference between 75th and 25th percentile")
IQR = Q3 - Q1
print(IQR)

In [None]:
# Box plot of the three exam-score columns on one set of axes.
data.boxplot(column=['math score','reading score','writing score'])

There are no outliers present in the data; however, there are a few values that are exceptionally low compared to the maximum values of the data.

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
data.describe()

The table shows the summary of the data.

From a total of 1000 records, maths has the lowest mean score and the lowest minimum score. The means of the other two subjects (reading and writing) are almost the same. Every subject has a student who has scored full marks.

In [None]:
# Mean of each exam score per parental education level. The score columns are
# selected explicitly because pandas >= 2.0 raises a TypeError when asked to
# average the remaining text columns instead of silently dropping them.
data.groupby('parental level of education')[['math score', 'reading score', 'writing score']].mean().plot.bar()

The mean of the subject score is greater in case of higher qualification of parents. In general, it can be concluded from the graph that Students with parents educated only till high school have less marks as compared to those whose parents have higher degree. Among the 3 subjects, maths has the least mean irrespective of the parent's degree which means that the students are better in reading and writing skills in comparison to maths if grouped on the basis of parent's level of education.

In [None]:
# Mean of each exam score per gender. The score columns are selected
# explicitly because pandas >= 2.0 raises a TypeError when asked to average
# the remaining text columns instead of silently dropping them.
data.groupby('gender')[['math score', 'reading score', 'writing score']].mean().plot.bar()

The average maths score of female students is the lowest (compared to male students and to the other two subjects for females), and their average reading score is the highest, almost the same as their writing score.

It means that the male students are better in maths compared to female students of the class. Also, among the math,reading and writing, male students are better in maths.

In [None]:
# Mean of each exam score per lunch type. The score columns are selected
# explicitly because pandas >= 2.0 raises a TypeError when asked to average
# the remaining text columns instead of silently dropping them.
data.groupby('lunch')[['math score', 'reading score', 'writing score']].mean().plot.bar()

Between free and standard lunch, students with standard lunch are better in all 3 subjects. Students with free/reduced lunch are also poor in maths compared to their reading and writing average scores. The average score in all 3 subjects is almost the same for people with standard lunch.

In [None]:
# Mean of each exam score per race/ethnicity group, drawn as lines. The score
# columns are selected explicitly because pandas >= 2.0 raises a TypeError
# when asked to average the remaining text columns instead of dropping them.
data.groupby('race/ethnicity')[['math score', 'reading score', 'writing score']].mean().plot.line()

Group E students are better in math,reading and writing followed by groups D,C,B and A.

Among the 3 average scores, students of all groups have more average in reading except group E whose maths mean is highest.

The writing average is between reading and maths in groups A,B and C. In group D, it is same as that of reading and it's lowest in group E.



In [None]:
# Mean of each exam score by test-preparation status. The score columns are
# selected explicitly because pandas >= 2.0 raises a TypeError when asked to
# average the remaining text columns instead of silently dropping them.
data.groupby('test preparation course')[['math score', 'reading score', 'writing score']].mean().plot.bar()

It can be seen clearly that students who have completed the test preparation course have performed better.

### Bivariate Analysis

In [None]:
# Three side-by-side count plots, each splitting one categorical column by another.
fig, (prep_ax, lunch_ax, race_ax) = plt.subplots(1, 3, figsize=(25,10))

# Parental education, split by test-preparation status.
prep_ax.set_title('test prep course and parental education', fontsize=15)
sns.countplot(data=data, y='parental level of education', hue='test preparation course', ax=prep_ax)

# Lunch type, split by parental education.
lunch_ax.set_title('parent\'s education level and lunch', fontsize=15)
sns.countplot(data=data, y='lunch', hue='parental level of education', ax=lunch_ax)

# Race/ethnicity group, split by parental education.
race_ax.set_title('parent\'s education level and race/ethnicity', fontsize=15)
sns.countplot(data=data, y='race/ethnicity', hue='parental level of education', ax=race_ax)

1) The first graph shows the relationship between parents' education and the test preparation course. Most of the students whose parental level of education is some college, associate's degree, or high school have completed the test preparation course.

2) The second graph is lunch vs parental education.

3) The graph of 'Race/ethnicity vs Parent's education' indicates that, among group C students, the most common parental qualification is an associate's degree, followed by some college.

In [None]:
# Pairwise scatter plots (with per-column distributions on the diagonal) for the columns of `data`.
sns.pairplot(data)

It can be inferred that a student who scores well in reading also scores well in the other two subjects, and vice versa, since there is a positive correlation between all three scores.

Now, converting certain categorical variables to numerical variables for further analysis.

In [None]:
# Binary-encode gender into a new column: female -> 0, male -> 1.
gender_codes = {'female': 0, 'male': 1}
data['gender_male'] = data['gender'].map(gender_codes)
data.head()

In [None]:
# Binary-encode lunch type into a new column: free/reduced -> 0, standard -> 1.
lunch_codes = {'free/reduced': 0, 'standard': 1}
data['lunch_standard'] = data['lunch'].map(lunch_codes)
data.head()

In [None]:
# Binary-encode test-preparation status into a new column: none -> 0, completed -> 1.
prep_codes = {'none': 0, 'completed': 1}
data['test_prep_course_completed'] = data['test preparation course'].map(prep_codes)
data.head()

In [None]:
# Number the race/ethnicity groups 1..5 in alphabetical order (group A -> 1, ..., group E -> 5).
race_groups = ['group A', 'group B', 'group C', 'group D', 'group E']
race_codes = dict(zip(race_groups, range(1, 6)))
data['race_numerical'] = data['race/ethnicity'].map(race_codes)
data.head()

In [None]:
# Ordinal encoding of parental education, from least (1) to most (6) schooling.
# The ranks must follow the actual education ladder: the correlation matrix and
# the model treat this column as a number, so an arbitrary numbering (e.g.
# "bachelor's degree" below "some college") would make its values meaningless.
parent_edu_rank = {
    'some high school': 1,
    'high school': 2,
    'some college': 3,
    'associate\'s degree': 4,
    'bachelor\'s degree': 5,
    'master\'s degree': 6,
}
data['parent_edu_numerical'] = data['parental level of education'].map(parent_edu_rank)
data.head()

In [None]:
# Pairwise correlations between the numeric columns only. The text columns are
# excluded up front because DataFrame.corr() in pandas >= 2.0 raises on them
# instead of dropping them; select_dtypes works on old and new pandas alike.
data.select_dtypes(include='number').corr()

The above matrix shows all the possible correlations between numerical variables.

Negative correlation is a relationship between 2 variables in which as one variable increases, the other decreases and vice versa. A perfect negative correlation is -1, a 0 indicates no correlation, and a +1 indicates a perfect positive correlation.

All of them are positive except some like 'gender=male and writing score' and 'gender=male and reading score'. Interpretation: If gender is 1 ie. male, the reading and writing scores will be less.

In [None]:
# Annotated heatmap of the correlations between the numeric columns.
plt.figure(figsize=(10,8))

plt.title("Correlation Matrix",fontsize=20)
# Restrict to numeric columns first: DataFrame.corr() in pandas >= 2.0 raises
# on text columns instead of silently ignoring them.
sns.heatmap(data.select_dtypes(include='number').corr(), annot=True)
plt.show()

In [None]:
# Bucket each exam score into ten-mark bands and store the band label in
# `<subject>_grouping`.
score_band_labels = ['0-10','11-20','21-30','31-40','41-50','51-60','61-70','71-80','81-90','91-100']

# Edges 0, 11, 21, ..., 101 with right=False give the half-open bins
# [0, 11), [11, 21), ..., [91, 101). Starting at 0 (not 1) matters: otherwise a
# score of 0 falls outside every bin and becomes NaN despite the '0-10' label.
score_band_edges = [0] + list(range(11, 102, 10))

for subject in ('math', 'reading', 'writing'):
    score_column = f'{subject} score'
    group_column = f'{subject}_grouping'
    data[group_column] = pd.cut(
        data[score_column],
        bins=score_band_edges,
        right=False,
        labels=score_band_labels,
    )
    print(data[[score_column, group_column]].head(5))

In [None]:
# One pie chart per subject showing the share of students in each score band.
pie_fig = plt.figure(figsize=(20,20))
pie_fig.subplots_adjust(left=0.125, bottom=0.5, right=0.9, top=0.9, wspace=0.5, hspace=0.2)

math_ax = pie_fig.add_subplot(1, 3, 1)
data['math_grouping'].value_counts().plot.pie(autopct="%0.2f%%", ax=math_ax)
math_ax.set_title("Maths", fontsize=15)

reading_ax = pie_fig.add_subplot(1, 3, 2)
data['reading_grouping'].value_counts().plot.pie(autopct="%0.2f%%", ax=reading_ax)
reading_ax.set_title("Reading", fontsize=15)

writing_ax = pie_fig.add_subplot(1, 3, 3)
data['writing_grouping'].value_counts().plot.pie(autopct="%0.2f%%", ax=writing_ax)
writing_ax.set_title("Writing", fontsize=15)

In maths, the maximum scores are from 61-70(27.03%).

The maximum scores in reading are from 71-80(25.20%). A similar trend is also seen in writing scores.

Now, let us see where the concentration of marks is in a given range.

In [None]:
# Distribution plot of each exam score, one panel per subject.
fig, (math_ax, reading_ax, writing_ax) = plt.subplots(1, 3, figsize=(20,5))
sns.distplot(data["math score"], ax=math_ax)
sns.distplot(data['reading score'], ax=reading_ax)
sns.distplot(data['writing score'], ax=writing_ax)


In [None]:
# Overlay the shaded density curves of the three exam scores on the same axes.
shared_ax = plt.gca()
sns.kdeplot(data['math score'], shade=True, ax=shared_ax)
sns.kdeplot(data['reading score'], shade=True, ax=shared_ax)
sns.kdeplot(data['writing score'], shade=True, ax=shared_ax)

In [None]:
# Writing vs. math score with a fitted regression line, one panel per
# race/ethnicity group (three panels per row). x and y are passed by keyword
# because seaborn >= 0.12 no longer accepts them positionally; the keyword
# form works on older releases as well.
sns.lmplot(x='math score', y='writing score', data=data, col='race/ethnicity', col_wrap=3)

In [None]:
# Writing vs. reading score with a fitted regression line, one panel per
# parental education level (three panels per row). x and y are passed by
# keyword because seaborn >= 0.12 no longer accepts them positionally; the
# keyword form works on older releases as well.
sns.lmplot(x='reading score', y='writing score', data=data, col='parental level of education', col_wrap=3)

In [None]:
# Writing vs. reading score with a fitted regression line, in a grid with one
# row per gender and one column per lunch type. x and y are passed by keyword
# because seaborn >= 0.12 no longer accepts them positionally; the keyword
# form works on older releases as well.
sns.lmplot(x='reading score', y='writing score', data=data, row='gender', col='lunch')

In [None]:
# Writing vs. math score on a single panel, with a separate colour and
# regression line per test-preparation status. x and y are passed by keyword
# because seaborn >= 0.12 no longer accepts them positionally; the keyword
# form works on older releases as well.
sns.lmplot(x='math score', y='writing score', data=data, hue='test preparation course')

All the above lmplots draw a regression line between different variables to fit the model. The data is a good fit for a linear regression model, with low variance. However, there are a few points in some graphs far away from the line, which can be considered outliers.

Grading

Let us grade the students according to the marks obtained -

80 and above = A Grade

70 to 80 = B Grade

60 to 70 = C Grade

50 to 60 = D Grade

40 to 50 = E Grade

Below 40 = Fail

In [None]:
# Overall result per student: the sum of the three exam scores, and that sum
# expressed as a percentage of the 300 marks available.
data['total_marks'] = data['math score'].add(data['reading score']).add(data['writing score'])
data['percentage'] = data['total_marks'].div(300).mul(100)
data['percentage'].head()

In [None]:
def grade(Percentage):
    """Convert an overall percentage into a letter grade.

    Bands are checked from the highest down and every lower bound is
    inclusive: 80 -> 'A', 70 -> 'B', 60 -> 'C', 50 -> 'D', 40 -> 'E'.
    A value that reaches none of these bounds is graded 'F'.
    """
    lower_bounds = ((80, 'A'), (70, 'B'), (60, 'C'), (50, 'D'), (40, 'E'))
    for bound, letter in lower_bounds:
        if Percentage >= bound:
            return letter
    return 'F'
# Label every student with a letter grade, then count the students per grade.
data['Grade'] = data['percentage'].map(grade)
data['Grade'].value_counts()


In [None]:
# Number of students per grade, with the bars in A-to-F order.
grade_order = ['A','B','C','D','E','F']
sns.countplot(data=data, x="Grade", order=grade_order, palette="muted")
plt.show()

In [None]:
# Share of students in each grade as a pie chart.
plt.figure(figsize=(10,8))
grade_counts = data['Grade'].value_counts()
grade_counts.plot.pie(autopct="%1.1f%%")
plt.show()

Visualising grades as per race/ethnicity using cross tabs

In [None]:
# Grade x race/ethnicity table, normalised over each row so that every grade's
# row of group shares sums to 1.
gr = pd.crosstab(index=data["Grade"], columns=data["race/ethnicity"], normalize='index')
gr

In [None]:
# Stacked bars: one bar per row of `gr`, split by its columns.
stacked_ax = gr.plot.bar(stacked=True)
stacked_ax.set_title('Grades and Ethnicity', fontsize=15)
plt.show()

## 3. Prediction and Training of Scores

In [None]:
# Model inputs: the five encoded categorical columns plus the math and reading
# scores. Target: the writing score.
feature_columns = [
    'gender_male',
    'race_numerical',
    'parent_edu_numerical',
    'lunch_standard',
    'test_prep_course_completed',
    'math score',
    'reading score',
]
target_column = 'writing score'

X = data[feature_columns]
y = data[target_column]

In [None]:
from sklearn import model_selection
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for evaluation. random_state pins the shuffle so
# the same rows land in the train and test sets on every run; without it the
# reported metrics change each time the notebook is executed.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [None]:
from sklearn.ensemble import RandomForestRegressor

# Random forest of 100 trees. random_state fixes the randomness used while
# building the forest so that repeated runs on the same training data give
# the same model and therefore the same predictions.
model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X_train, y_train)
# Predicted writing scores for the held-out rows.
preds = model.predict(X_test)


In [None]:
from sklearn import metrics
from sklearn.metrics import r2_score

# Error metrics of the predictions against the held-out writing scores.
mae = metrics.mean_absolute_error(y_test, preds)
mse = metrics.mean_squared_error(y_test, preds)

print('MAE:', mae)
print('MSE:', mse)
# RMSE is the square root of the MSE computed above.
print('RMSE:', np.sqrt(mse))
print("R_square score: ", r2_score(y_test,preds))