# Data Exploration 2

### Imports and getting set up

In [1]:
import pandas as pd
import numpy as np
import scipy.stats as st
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression

### DB Connection & Call

In [3]:
# The connection settings are stored as the header row of dbcon.csv, so the
# parsed frame's column labels are the values themselves. The first four fill
# the user, password, host and port slots of the PostgreSQL URL, in that order.
confile = pd.read_csv('../../dbcon.csv').columns.tolist()
postgres_db = 'studentsperformance'
db_connection = 'postgresql://{}:{}@{}:{}/{}'.format(*confile[:4], postgres_db)

In [5]:
# Load every row and column of the studentsperformance table into a DataFrame.
query = '''
SELECT *
FROM studentsperformance
;'''

students_df = pd.read_sql(sql=query, con=db_connection)

In [7]:
# DataFrame.info() prints its summary to stdout and returns None, so it is
# called as its own statement; passing it to display() would render a stray
# "None" between the two tables.
students_df.info()

# First rows of the data, followed by the fraction of missing values per column.
display(
    students_df.head(),
    students_df.isna().mean()
)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 8 columns):
gender                         1000 non-null object
race/ethnicity                 1000 non-null object
parental level of education    1000 non-null object
lunch                          1000 non-null object
test preparation course        1000 non-null object
math score                     1000 non-null int64
reading score                  1000 non-null int64
writing score                  1000 non-null int64
dtypes: int64(3), object(5)
memory usage: 62.6+ KB


Unnamed: 0,gender,race/ethnicity,parental level of education,lunch,test preparation course,math score,reading score,writing score
0,female,group B,bachelor's degree,standard,none,72,72,74
1,female,group C,some college,standard,completed,69,90,88
2,female,group B,master's degree,standard,none,90,95,93
3,male,group A,associate's degree,free/reduced,none,47,57,44
4,male,group C,some college,standard,none,76,78,75


None

gender                         0.0
race/ethnicity                 0.0
parental level of education    0.0
lunch                          0.0
test preparation course        0.0
math score                     0.0
reading score                  0.0
writing score                  0.0
dtype: float64

#### 1. Are there any differences between the genders, ethnicities, and parental level of education with respect to their performances in exams?

In [19]:
# Convert each numeric score into a letter grade. Intervals are left-closed
# (right=False): F = [0, 60), D = [60, 70), C = [70, 80), B = [80, 90),
# A = [90, 101). The final edge is 101 so that a score of 100 lands in A.
grade_edges = [0, 60, 70, 80, 90, 101]
grade_labels = ['F', 'D', 'C', 'B', 'A']
score_to_bin = {
    'math score': 'mathbins',
    'reading score': 'readbins',
    'writing score': 'writbins',
}
for score_col, bin_col in score_to_bin.items():
    students_df[bin_col] = pd.cut(
        students_df[score_col],
        bins=grade_edges,
        labels=grade_labels,
        right=False,
    )

In [46]:
# For every categorical feature, show how students are distributed across the
# letter-grade bins of each exam (rows = grade bin, columns = feature level).
features = ['gender', 'race/ethnicity', 'parental level of education', 'lunch', 'test preparation course']
depvars = ['mathbins', 'readbins', 'writbins']
for feat in features:
    for var in depvars:
        display(pd.crosstab(students_df[var], students_df[feat]))

gender,female,male
mathbins,Unnamed: 1_level_1,Unnamed: 2_level_1
F,197,126
D,143,125
C,104,112
B,51,84
A,23,35


gender,female,male
readbins,Unnamed: 1_level_1,Unnamed: 2_level_1
F,91,163
D,109,124
C,151,113
B,103,67
A,64,15


gender,female,male
writbins,Unnamed: 1_level_1,Unnamed: 2_level_1
F,94,187
D,99,131
C,153,101
B,108,49
A,64,14


race/ethnicity,group A,group B,group C,group D,group E
mathbins,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
F,42,71,113,72,25
D,22,54,95,72,25
C,16,36,62,67,35
B,5,21,33,43,33
A,4,8,16,8,22


race/ethnicity,group A,group B,group C,group D,group E
readbins,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
F,35,55,75,62,27
D,21,50,81,57,24
C,15,37,90,80,42
B,12,36,51,42,29
A,6,12,22,21,18


race/ethnicity,group A,group B,group C,group D,group E
writbins,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
F,37,61,94,59,30
D,24,48,73,60,25
C,13,43,77,76,45
B,10,29,49,47,22
A,5,9,26,20,18


parental level of education,associate's degree,bachelor's degree,high school,master's degree,some college,some high school
mathbins,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
F,72,27,79,17,64,64
D,52,35,54,9,69,49
C,38,28,43,15,52,40
B,39,16,17,12,30,21
A,21,12,3,6,11,5


parental level of education,associate's degree,bachelor's degree,high school,master's degree,some college,some high school
readbins,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
F,51,20,68,9,51,55
D,45,26,49,12,57,44
C,66,37,49,13,61,38
B,40,17,24,14,43,32
A,20,18,6,11,14,10


parental level of education,associate's degree,bachelor's degree,high school,master's degree,some college,some high school
writbins,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
F,58,20,77,8,55,63
D,46,24,54,12,53,41
C,58,31,43,16,63,43
B,37,27,20,12,34,27
A,23,16,2,11,21,5


lunch,free/reduced,standard
mathbins,Unnamed: 1_level_1,Unnamed: 2_level_1
F,175,148
D,93,175
C,65,151
B,15,120
A,7,51


lunch,free/reduced,standard
readbins,Unnamed: 1_level_1,Unnamed: 2_level_1
F,130,124
D,88,145
C,82,182
B,40,130
A,15,64


lunch,free/reduced,standard
writbins,Unnamed: 1_level_1,Unnamed: 2_level_1
F,144,137
D,83,147
C,74,180
B,43,114
A,11,67


test preparation course,completed,none
mathbins,Unnamed: 1_level_1,Unnamed: 2_level_1
F,87,236
D,95,173
C,88,128
B,56,79
A,32,26


test preparation course,completed,none
readbins,Unnamed: 1_level_1,Unnamed: 2_level_1
F,56,198
D,68,165
C,104,160
B,87,83
A,43,36


test preparation course,completed,none
writbins,Unnamed: 1_level_1,Unnamed: 2_level_1
F,50,231
D,70,160
C,106,148
B,83,74
A,49,29


<span style="color:blue">There are significant differences here, most notably in how males and females perform in computational (math) versus non-computational (reading and writing) areas.</span>

#### 2. Are there any differences between the lunch types with respect to their performances in exams? If there are, how do you explain this?

<span style="color:blue">There is a large difference. The median exam score for the free/reduced lunch bracket is an F, while the median exam score for standard lunch students is around a C. This can be explained by the difference in economic hardship between families that require lunch assistance and those that do not. Families that require lunch assistance are typically poorer, and the added hardship on the child may prevent them from performing well.</span>

#### 3. Does the test preparation course seem to have an effect on the exam performances?

<span style="color:blue">The distribution is about the same as the one for lunch type: students who completed the prep course have a median score of a C, while the other group has a median score of an F. I would attribute this less to the course itself, since whether or not a student takes a prep course is also affected by the same familial factors described above.</span>

#### 4. Which 2 exam scores are most correlated with each other?

<span style="color:blue">As seen and stated in the data above, the reading and writing scores are the most correlated with one another, as they are the two non-computational exams, while math is the only computational subject with data here.</span>