In [1]:
# import dependencies
import pandas as pd

# Database


# Machine Learning


In [2]:
# load the exam results from disk and preview the first rows
exams_path = "resources/exams.csv"
df = pd.read_csv(exams_path)
df.head()

Unnamed: 0,gender,race/ethnicity,parental level of education,lunch,test preparation course,math score,reading score,writing score
0,male,group A,high school,standard,completed,67,67,63
1,female,group D,some high school,free/reduced,none,40,59,55
2,male,group E,some college,free/reduced,none,59,60,50
3,male,group B,high school,standard,none,77,78,68
4,male,group E,associate's degree,standard,completed,78,73,68


## Data Preprocessing

In [3]:
# add "overall score" column: the three test scores expressed as a whole-number
# percentage of 300 points (assumes each test is scored out of 100 -- confirm)
score_sum = df[["math score", "reading score", "writing score"]].sum(axis=1)

# round and keep the result numeric; formatting through "{:.0f}" would turn the
# column into strings, which then have to be re-parsed before any comparison
# or model can use them
df["overall score"] = (score_sum / 300 * 100).round().astype(int)

df.head()

Unnamed: 0,gender,race/ethnicity,parental level of education,lunch,test preparation course,math score,reading score,writing score,overall score
0,male,group A,high school,standard,completed,67,67,63,66
1,female,group D,some high school,free/reduced,none,40,59,55,51
2,male,group E,some college,free/reduced,none,59,60,50,56
3,male,group B,high school,standard,none,77,78,68,74
4,male,group E,associate's degree,standard,completed,78,73,68,73


In [4]:
# add "pass/fail" column --- pass: 1, fail: 0
# a student passes when the overall score is at least PASS_THRESHOLD
PASS_THRESHOLD = 65

# vectorised comparison instead of a row-by-row loop; astype(int) comes first so
# the comparison is numeric even when "overall score" holds digit strings
passed = df["overall score"].astype(int) >= PASS_THRESHOLD

# kept as a plain list of 0/1 values, one entry per row of df
target = passed.astype(int).tolist()

df["pass/fail"] = target
df.head()

Unnamed: 0,gender,race/ethnicity,parental level of education,lunch,test preparation course,math score,reading score,writing score,overall score,pass/fail
0,male,group A,high school,standard,completed,67,67,63,66,1
1,female,group D,some high school,free/reduced,none,40,59,55,51,0
2,male,group E,some college,free/reduced,none,59,60,50,56,0
3,male,group B,high school,standard,none,77,78,68,74,1
4,male,group E,associate's degree,standard,completed,78,73,68,73,1


In [5]:
# drop the "race/ethnicity" column, as its labeling is not specific enough to provide insight for our model
# the column is selected by name, not position, so re-running this cell can never
# remove a different column; errors="ignore" makes a re-run a harmless no-op
df = df.drop(columns=["race/ethnicity"], errors="ignore")
df.head()

Unnamed: 0,gender,parental level of education,lunch,test preparation course,math score,reading score,writing score,overall score,pass/fail
0,male,high school,standard,completed,67,67,63,66,1
1,female,some high school,free/reduced,none,40,59,55,51,0
2,male,some college,free/reduced,none,59,60,50,56,0
3,male,high school,standard,none,77,78,68,74,1
4,male,associate's degree,standard,completed,78,73,68,73,1


#### Three different ways to encode the data. I think we should go with df_custom: it will be easier to interpret because we'll know exactly which number means what, and it doesn't add a ton of extra columns

#### Method 1: Encoding using only 1s and 0s for string data + adding extra columns

In [6]:
# convert str to numeric data for machine learning model (one indicator column per category)
# dtype=int keeps the indicator columns as 1/0 on every pandas version;
# pandas 2.0+ would otherwise produce True/False columns
df_encoded = pd.get_dummies(
    df,
    columns=[
        "gender",
        "parental level of education",
        "lunch",
        "test preparation course",
    ],
    dtype=int,
)
df_encoded.head()

Unnamed: 0,math score,reading score,writing score,overall score,pass/fail,gender_female,gender_male,parental level of education_associate's degree,parental level of education_bachelor's degree,parental level of education_high school,parental level of education_master's degree,parental level of education_some college,parental level of education_some high school,lunch_free/reduced,lunch_standard,test preparation course_completed,test preparation course_none
0,67,67,63,66,1,0,1,0,0,1,0,0,0,0,1,1,0
1,40,59,55,51,0,1,0,0,0,0,0,0,1,1,0,0,1
2,59,60,50,56,0,0,1,0,0,0,0,1,0,1,0,0,1
3,77,78,68,74,1,0,1,0,0,1,0,0,0,0,1,0,1
4,78,73,68,73,1,0,1,1,0,0,0,0,0,0,1,1,0


#### Method 2: Custom encoding

In [7]:
# hand-picked integer codes for every categorical column; each mapping keeps its
# own name so the meaning of a number is easy to look up when reading results
gender = dict(male=0, female=1)

# education levels listed in increasing code order: position in the list is the code
parent = {
    level: code
    for code, level in enumerate([
        "some high school",
        "high school",
        "some college",
        "associate's degree",
        "bachelor's degree",
        "master's degree",
    ])
}

lunch = {"free/reduced": 0, "standard": 1}

test_prep = dict(none=0, completed=1)

# which mapping belongs to which column
column_codes = {
    "gender": gender,
    "parental level of education": parent,
    "lunch": lunch,
    "test preparation course": test_prep,
}

# encode a copy so the original string-valued df stays available
df_custom = df.copy()
for column, codes in column_codes.items():
    # plain dict lookup per value: a category missing from the mapping raises KeyError
    df_custom[column] = df_custom[column].apply(codes.__getitem__)

df_custom.head()

Unnamed: 0,gender,parental level of education,lunch,test preparation course,math score,reading score,writing score,overall score,pass/fail
0,0,1,1,1,67,67,63,66,1
1,1,0,0,0,40,59,55,51,0
2,0,2,0,0,59,60,50,56,0
3,0,1,1,0,77,78,68,74,1
4,0,3,1,1,78,73,68,73,1


#### Method 3: Label Encoding - similar to custom encoding; however, we don't have control over which number is assigned to what, so we will need to compare it to the dataset to analyze

In [8]:
from sklearn.preprocessing import LabelEncoder

df_label_encoder = df.copy()

# one fitted encoder per column, kept so the number -> label mapping can be read
# back afterwards: label_encoders["gender"].classes_[k] is the label encoded as k
label_encoders = {}

# apply encoding conversion
for column in [
    "gender",
    "parental level of education",
    "lunch",
    "test preparation course",
]:
    # a fresh encoder for every column: refitting a shared encoder would discard
    # the classes_ learned for the previous column
    le = LabelEncoder()
    df_label_encoder[column] = le.fit_transform(df_label_encoder[column])
    label_encoders[column] = le

df_label_encoder.head()

Unnamed: 0,gender,parental level of education,lunch,test preparation course,math score,reading score,writing score,overall score,pass/fail
0,1,2,1,0,67,67,63,66,1
1,0,5,0,1,40,59,55,51,0
2,1,4,0,1,59,60,50,56,0
3,1,2,1,1,77,78,68,74,1
4,1,0,1,0,78,73,68,73,1
