In [0]:
### Project Title: Prediction of Students' Test
### Authors: Zeyu Zhang and Yuchen Feng
### Date: April 15th, 2020
### Data Set Link: https://www.kaggle.com/spscientist/students-performance-in-exams

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import sklearn

from sklearn import preprocessing
from sklearn.model_selection import cross_val_score as cvs
from sklearn.linear_model import LogisticRegression as lr
from sklearn.ensemble import RandomForestClassifier as rfc
from sklearn.tree import DecisionTreeClassifier as dtc
from sklearn.feature_selection import SelectKBest, f_classif

Set the passing score and the excellent score, then read the data file and preview it to check that it loaded correctly.

In [None]:
# Score thresholds: a score at or above passScore counts as a pass,
# and a score at or above excellentScore counts as excellent.
passScore, excellentScore = 50, 90

# Load the student performance table and preview its first ten rows.
dataSet = pd.read_csv(
    "/kaggle/input/students-performance-in-exams/StudentsPerformance.csv"
)
dataSet.head(n = 10)

Check if there is missing data in this data file.

In [None]:
# Number of missing entries in each column (isna is the same check as isnull).
dataSet.isna().sum()

Describe the basic sample features including mean, standard deviation, data size, etc.

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
dataSet.describe()

Now we will check how each feature influences the test scores, using the group means for comparison.

In [None]:
# Mean of each test score per gender. The score columns are selected
# explicitly so the mean is never attempted on the text columns, which
# raises a TypeError on pandas 2.0 and later.
dataSet.groupby(['gender'])[['math score', 'reading score', 'writing score']].agg(['mean'])

In [None]:
# Mean of each test score per race/ethnicity group. The score columns are
# selected explicitly so the mean is never attempted on the text columns,
# which raises a TypeError on pandas 2.0 and later.
dataSet.groupby(['race/ethnicity'])[['math score', 'reading score', 'writing score']].agg(['mean'])

In [None]:
# Mean of each test score per parental level of education. The score columns
# are selected explicitly so the mean is never attempted on the text columns,
# which raises a TypeError on pandas 2.0 and later.
dataSet.groupby(['parental level of education'])[['math score', 'reading score', 'writing score']].agg(['mean'])

In [None]:
# Mean of each test score per lunch type. The score columns are selected
# explicitly so the mean is never attempted on the text columns, which
# raises a TypeError on pandas 2.0 and later.
dataSet.groupby(['lunch'])[['math score', 'reading score', 'writing score']].agg(['mean'])

In [None]:
# Mean of each test score per test-preparation status. The score columns are
# selected explicitly so the mean is never attempted on the text columns,
# which raises a TypeError on pandas 2.0 and later.
dataSet.groupby(['test preparation course'])[['math score', 'reading score', 'writing score']].agg(['mean'])

Describe how many students pass each subject and how many pass all subjects. Also describe how many students get an excellent score in each subject and in all subjects.

In [None]:
# Label a student 'Pass' when the math score reaches passScore, otherwise 'Fail',
# then tally the two labels.
dataSet['Pass_Math'] = np.where(dataSet['math score'].ge(passScore), 'Pass', 'Fail')
dataSet['Pass_Math'].value_counts()

In [None]:
# Label a student 'Pass' when the reading score reaches passScore, otherwise 'Fail',
# then tally the two labels.
dataSet['Pass_Reading'] = np.where(dataSet['reading score'].ge(passScore), 'Pass', 'Fail')
dataSet['Pass_Reading'].value_counts()

In [None]:
# Label a student 'Pass' when the writing score reaches passScore, otherwise 'Fail',
# then tally the two labels.
dataSet['Pass_Writing'] = np.where(dataSet['writing score'].ge(passScore), 'Pass', 'Fail')
dataSet['Pass_Writing'].value_counts()

In [None]:
# A student passes overall only when all three scores reach passScore.
dataSet['Pass_All'] = np.where(
    dataSet[['math score', 'reading score', 'writing score']].ge(passScore).all(axis = 1),
    'Pass',
    'Fail',
)
dataSet['Pass_All'].value_counts()

In [None]:
# Label a student 'Pass' when the math score reaches excellentScore,
# otherwise 'Not Pass', then tally the two labels.
dataSet['Excellent_Math'] = np.where(dataSet['math score'].ge(excellentScore), 'Pass', 'Not Pass')
dataSet['Excellent_Math'].value_counts()

In [None]:
# Label a student 'Pass' when the reading score reaches excellentScore,
# otherwise 'Not Pass', then tally the two labels.
dataSet['Excellent_Reading'] = np.where(dataSet['reading score'].ge(excellentScore), 'Pass', 'Not Pass')
dataSet['Excellent_Reading'].value_counts()

In [None]:
# Label a student 'Pass' when the writing score reaches excellentScore,
# otherwise 'Not Pass', then tally the two labels.
dataSet['Excellent_Writing'] = np.where(dataSet['writing score'].ge(excellentScore), 'Pass', 'Not Pass')
dataSet['Excellent_Writing'].value_counts()

In [None]:
# A student is excellent overall only when all three scores reach excellentScore.
dataSet['Excellent_All'] = np.where(
    dataSet[['math score', 'reading score', 'writing score']].ge(excellentScore).all(axis = 1),
    'Pass',
    'Not Pass',
)
dataSet['Excellent_All'].value_counts()

After counting the students who passed each subject, it is time to plot detailed figures showing how many students passed all of the courses, broken down by each category.

In [None]:
# Students per gender, split by whether they passed all three courses.
sns.set(style = 'whitegrid')
sns.countplot(
    data = dataSet, x = 'gender', hue = 'Pass_All', palette = 'Set1'
).set_title('Number of Students Passed All Courses', fontweight = 20, fontsize = 15)

In [None]:
# Students per gender, split by whether they were excellent in all three courses.
sns.set(style = 'whitegrid')
sns.countplot(
    data = dataSet, x = 'gender', hue = 'Excellent_All', palette = 'Set3'
).set_title('Number of Students Excellent for All', fontweight = 20, fontsize = 15)

In [None]:
# Students per race/ethnicity group, split by whether they passed all three courses.
sns.set(style = 'whitegrid')
sns.countplot(
    data = dataSet, x = 'race/ethnicity', hue = 'Pass_All', palette = 'Set1'
).set_title('Number of Students Passed All Courses', fontweight = 20, fontsize = 15)

In [None]:
# Students per race/ethnicity group, split by whether they were excellent in all three courses.
sns.set(style = 'whitegrid')
sns.countplot(
    data = dataSet, x = 'race/ethnicity', hue = 'Excellent_All', palette = 'Set3'
).set_title('Number of Students Excellent for All', fontweight = 20, fontsize = 15)

In [None]:
# Students per parental education level, split by whether they passed all three courses.
sns.set(style = 'whitegrid')
plot = sns.countplot(
    data = dataSet,
    x = 'parental level of education',
    hue = 'Pass_All',
    palette = 'Set1',
)
# The category names are long, so tilt the x tick labels.
plt.setp(plot.get_xticklabels(), rotation = 45)
plot.set_title('Number of Students Passed All', fontweight = 20, fontsize = 15)

In [None]:
# Students per parental education level, split by whether they were excellent in all three courses.
sns.set(style = 'whitegrid')
plot = sns.countplot(
    data = dataSet,
    x = 'parental level of education',
    hue = 'Excellent_All',
    palette = 'Set3',
)
# The category names are long, so tilt the x tick labels.
plt.setp(plot.get_xticklabels(), rotation = 45)
plot.set_title('Number of Students Excellent for All', fontweight = 20, fontsize = 15)

In [None]:
# Students per lunch type, split by whether they passed all three courses.
sns.set(style = 'whitegrid')
sns.countplot(
    data = dataSet, x = 'lunch', hue = 'Pass_All', palette = 'Set1'
).set_title('Number of Students Passed All', fontweight = 20, fontsize = 15)

In [None]:
# Students per lunch type, split by whether they were excellent in all three courses.
sns.set(style = 'whitegrid')
sns.countplot(
    data = dataSet, x = 'lunch', hue = 'Excellent_All', palette = 'Set3'
).set_title('Number of Students Excellent for All', fontweight = 20, fontsize = 15)

In [None]:
# Students per test-preparation status, split by whether they passed all three courses.
sns.set(style = 'whitegrid')
sns.countplot(
    data = dataSet, x = 'test preparation course', hue = 'Pass_All', palette = 'Set1'
).set_title('Number of Students Passed All', fontweight = 20, fontsize = 15)

In [None]:
# Students per test-preparation status, split by whether they were excellent in all three courses.
sns.set(style = 'whitegrid')
sns.countplot(
    data = dataSet, x = 'test preparation course', hue = 'Excellent_All', palette = 'Set3'
).set_title('Number of Students Excellent for All', fontweight = 20, fontsize = 15)

Data preprocessing. Convert the descriptive (text) values in the table to numeric codes.

In [None]:
# Integer code for every text label, written as column -> {label: code}.
# Columns and labels are listed in the order the replacements are applied.
labelCodes = {
    'gender': {'male': 1, 'female': 0},
    'race/ethnicity': {
        'group A': 0,
        'group B': 1,
        'group C': 2,
        'group D': 3,
        'group E': 4,
    },
    # NOTE(review): "associate's degree" is coded below 'some college' —
    # confirm this ordering is intended.
    'parental level of education': {
        'some high school': 0,
        'high school': 1,
        "associate's degree": 2,
        'some college': 3,
        "bachelor's degree": 4,
        "master's degree": 5,
    },
    'lunch': {'standard': 1, 'free/reduced': 0},
    'test preparation course': {'completed': 1, 'none': 0},
}

# The four pass flags share one coding and the four excellent flags another.
for subject in ('Math', 'Reading', 'Writing', 'All'):
    labelCodes['Pass_' + subject] = {'Pass': 1, 'Fail': 0}
for subject in ('Math', 'Reading', 'Writing', 'All'):
    labelCodes['Excellent_' + subject] = {'Pass': 1, 'Not Pass': 0}

# Overwrite each label with its code, one label at a time, in place.
for column, codes in labelCodes.items():
    for label, code in codes.items():
        dataSet.loc[dataSet[column] == label, column] = code

dataSet.head(10)

Machine Learning

The features are gender, race, lunch, education, and preparation, respectively. The targets are whether students pass a course or get an excellent score in it. We want to apply machine learning methods to these features and targets and measure model accuracy with cross validation, then take the mean of the array of cross-validation scores to get the most appropriate score for each model. There are 3 models we would like to use: LogisticRegression, RandomForestClassifier and DecisionTreeClassifier.

In [None]:
# Predictors: the five encoded categorical columns, picked by name rather than
# by position so the cell does not depend on the order columns were created in.
features = dataSet[['gender', 'race/ethnicity', 'parental level of education',
                    'lunch', 'test preparation course']]

# Binary targets (1 = reached the threshold), also picked by name.
target_PassMath = dataSet['Pass_Math'].astype(int)
target_PassReading = dataSet['Pass_Reading'].astype(int)
target_PassWriting = dataSet['Pass_Writing'].astype(int)
target_PassAll = dataSet['Pass_All'].astype(int)
target_ExcellentMath = dataSet['Excellent_Math'].astype(int)
target_ExcellentReading = dataSet['Excellent_Reading'].astype(int)
target_ExcellentWriting = dataSet['Excellent_Writing'].astype(int)
target_ExcellentAll = dataSet['Excellent_All'].astype(int)

# Standardise the features before they are handed to the models.
features_cross = preprocessing.scale(features)

# Fixed seed so the random forest and decision tree give the same scores on every run.
randomSeed = 42
modelLR = lr()
modelRFC = rfc(random_state = randomSeed)
modelDTC = dtc(random_state = randomSeed)

# Display name -> estimator; the names are reused verbatim in the printed report.
modelsByName = {
    'Logistic Regression': modelLR,
    'Random Forest Classifier': modelRFC,
    'Decision Tree Classfier': modelDTC,
}
# Display name -> target vector.
targetsByName = {
    'PassMath': target_PassMath,
    'PassReading': target_PassReading,
    'PassWriting': target_PassWriting,
    'PassAll': target_PassAll,
    'ExcellentMath': target_ExcellentMath,
    'ExcellentReading': target_ExcellentReading,
    'ExcellentWriting': target_ExcellentWriting,
    'ExcellentAll': target_ExcellentAll,
}

# Mean 5-fold cross-validation score for every (model, target) pair.
# NOTE(review): no scoring argument is passed, so the estimator's default
# score is used; if the 'Excellent' classes are rare, a high score may say
# little — consider reporting a class-balanced metric as well.
cvsScores = {}
for modelName, model in modelsByName.items():
    for targetName, target in targetsByName.items():
        cvsScores[(modelName, targetName)] = np.mean(
            cvs(model, features_cross, target, cv = 5))

# Report the scores grouped by model, with a blank line between groups.
for groupIndex, modelName in enumerate(modelsByName):
    if groupIndex > 0:
        print()
    for targetName in targetsByName:
        print('Cross Validation Score of ' + modelName + ' (' + targetName + ') is: ',
              cvsScores[(modelName, targetName)].astype(str))

Relationship of Features

We can definitely see that our models are more accurate at predicting whether students get an excellent grade (greater than or equal to 90) than at predicting whether students pass the courses, probably because more than one feature is highly related to whether a student gets a passing grade (greater than or equal to 50). For that reason, we would like to plot the feature relationships to see how each feature relates to students passing all courses and getting excellent grades. There are 3 figures we want to look at: the passing status for all courses, the excellent status for all courses, and the excellent reading status.

Note: A larger relationship coefficient indicates a stronger relationship between the feature and the target (the coefficient is the negative log of the feature's F-test p-value).

In [None]:
# Score every feature against three targets with f_classif. k = 5 keeps all
# five features, so the selectors are only used for their p-values.
relationship_PassAll = SelectKBest(f_classif, k = 5).fit(features, target_PassAll)
relationship_ExcellentAll = SelectKBest(f_classif, k = 5).fit(features, target_ExcellentAll)
relationship_ExcellentReading = SelectKBest(f_classif, k = 5).fit(features, target_ExcellentReading)

# "Relationship coefficient" = negative natural log of each feature's p-value.
rCoeff_PassAll, rCoeff_ExcellentAll, rCoeff_ExcellentReading = (
    -np.log(selector.pvalues_)
    for selector in (relationship_PassAll,
                     relationship_ExcellentAll,
                     relationship_ExcellentReading)
)

# Bar chart of the coefficients for passing all courses, one bar per feature.
ax = plt.gca()
ax.bar(range(5), rCoeff_PassAll)
ax.set_xticks(range(5))
ax.set_xticklabels(features.columns, rotation = 45)
ax.set_title('Relationship Between Features and Passing Status', fontweight = 15, fontsize = 10)
ax.set_ylabel('Relationship Coefficient')
plt.show()

From the figure above we can see that having a standard lunch has the strongest relationship with passing all courses.

In [None]:
# Bar chart of the coefficients for being excellent in all courses, one bar per feature.
ax = plt.gca()
ax.bar(range(5), rCoeff_ExcellentAll)
ax.set_xticks(range(5))
ax.set_xticklabels(features.columns, rotation = 45)
ax.set_title('Relationship Between Features and Excellent Status', fontweight = 15, fontsize = 10)
ax.set_ylabel('Relationship Coefficient')
plt.show()

From the figure above we can see that the parental level of education is the most important factor for students getting an excellent grade in all courses.

In [None]:
# Bar chart of the coefficients for being excellent in reading, one bar per feature.
ax = plt.gca()
ax.bar(range(5), rCoeff_ExcellentReading)
ax.set_xticks(range(5))
ax.set_xticklabels(features.columns, rotation = 45)
ax.set_title('Relationship Between Features and Excellent Reading Status', fontweight = 15, fontsize = 10)
ax.set_ylabel('Relationship Coefficient')
plt.show()

From the figure above we can see that gender is the dominant feature for students who get an excellent score in reading, probably because females like reading more.



-----End-----