# Filtering dataframe and cleaning data

In [2]:
## Only run this cell the first time you use the notebook (uncomment the lines first)
# %pip install xlrd
# %pip install openpyxl

In [2]:
# Packages and libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [4]:
# Show every column when a DataFrame is displayed (no column truncation).
pd.set_option("display.max_columns", None)

# Load the academic-performance workbook into the working DataFrame.
df = pd.read_excel("data_academic_performance.xlsx")

# Uncomment to preview the first rows:
# df.head()

In [5]:
# Drop the columns that are not used in this analysis.
# A new frame is assigned back to `df` instead of mutating in place
# (`inplace=True`), and the names are listed one group per line so the list
# can actually be reviewed.
df = df.drop(
    columns=[
        "Unnamed: 9", "COD_S11",
        "EDU_FATHER", "EDU_MOTHER", "OCC_FATHER", "OCC_MOTHER",
        "SISBEN", "PEOPLE_HOUSE",
        "INTERNET", "TV", "COMPUTER", "WASHING_MCH", "MIC_OVEN", "CAR",
        "DVD", "FRESH", "PHONE", "MOBILE",
        "REVENUE", "JOB",
        "SCHOOL_NAME", "SCHOOL_NAT", "SCHOOL_TYPE",
        "MAT_S11", "BIO_S11",
        "Cod_SPro", "UNIVERSITY", "ACADEMIC_PROGRAM",
        "QR_PRO", "WC_PRO", "FEP_PRO",
        "G_SC", "PERCENTILE", "2ND_DECILE", "QUARTILE",
        "SEL", "SEL_IHE",
    ]
)

In [6]:
# Row-wise (per-student) mean over three subject scores each:
# the *_PRO columns give the college average, the *_S11 columns the
# high-school average.
college_subjects = ["CR_PRO", "CC_PRO", "ENG_PRO"]
highschool_subjects = ["CR_S11", "CC_S11", "ENG_S11"]

df["COL_GRADE_AVG"] = df[college_subjects].mean(axis="columns")
df["HI_GRADE_AVG"] = df[highschool_subjects].mean(axis="columns")

In [7]:
# One indicator column per GENDER value.
# The dtype is pinned so the indicators are 0/1 integers on every pandas
# version; from pandas 2.0 on, get_dummies would otherwise return True/False,
# which would also end up in the exported CSV.
df_one = pd.get_dummies(df["GENDER"], dtype="uint8")
# Female = 1, male = 0
df["GENDER_bin"] = df_one["F"]
df


Unnamed: 0,GENDER,STRATUM,CR_S11,CC_S11,ENG_S11,CR_PRO,CC_PRO,ENG_PRO,COL_GRADE_AVG,HI_GRADE_AVG,GENDER_bin
0,F,Stratum 4,81,61,82,93,71,93,85.666667,74.666667,1
1,F,Stratum 5,75,66,88,38,86,98,74.000000,76.333333,1
2,M,Stratum 2,49,38,42,1,18,43,20.666667,43.000000,0
3,F,Stratum 2,55,51,73,35,76,80,63.666667,59.666667,1
4,M,Stratum 4,65,76,92,94,98,100,97.333333,77.666667,0
...,...,...,...,...,...,...,...,...,...,...,...
12406,M,Stratum 2,69,70,81,71,86,87,81.333333,73.333333,0
12407,M,Stratum 2,57,61,53,39,44,11,31.333333,57.000000,0
12408,M,Stratum 2,69,75,58,88,90,81,86.333333,67.333333,0
12409,F,Stratum 3,69,64,52,80,51,8,46.333333,61.666667,1


In [8]:
# Persist the cleaned frame (with the averages and GENDER_bin) next to the notebook.
df.to_csv(path_or_buf="df_sum_score.csv")

#### Build the list of [GENDER_bin, HI_GRADE_AVG] feature pairs

In [9]:
# Independent two-column frame holding the features used per student:
# the gender indicator and the high-school grade average.
feature_columns = ["GENDER_bin", "HI_GRADE_AVG"]
df_document_features = df[feature_columns].copy()


In [16]:
# Row labels of df as a NumPy array (quick check of the index range).
df.index.to_numpy()


array([    0,     1,     2, ..., 12408, 12409, 12410], dtype=int64)

In [11]:
# Nested list of [GENDER_bin, HI_GRADE_AVG] pairs, one inner list per row.
# It is built straight from `df` rather than from the previous value of this
# name, so re-running the cell no longer fails on a list that has no `.values`.
df_document_features = df[["GENDER_bin", "HI_GRADE_AVG"]].values.tolist()

In [12]:
# Feature pairs as a NumPy array (one row per entry of the list).
doc_features = np.asarray(df_document_features)

In [13]:
# Scores: the college grade average of every row, as a NumPy array.
doc_scores = df["COL_GRADE_AVG"].to_numpy()
doc_scores

array([85.66666667, 74.        , 20.66666667, ..., 86.33333333,
       46.33333333, 88.33333333])

# Basic stats

In [14]:
# Mean and median of every numeric column, split by gender.
# Numeric columns are selected explicitly: text columns such as STRATUM cannot
# be averaged, and pandas >= 2.0 raises a TypeError for them instead of
# silently leaving them out of the result.
numeric_cols = df.select_dtypes(include="number").columns.tolist()
df.groupby("GENDER")[numeric_cols].agg(["mean", "median"])

Unnamed: 0_level_0,CR_S11,CR_S11,CC_S11,CC_S11,ENG_S11,ENG_S11,CR_PRO,CR_PRO,CC_PRO,CC_PRO,ENG_PRO,ENG_PRO,COL_GRADE_AVG,COL_GRADE_AVG,HI_GRADE_AVG,HI_GRADE_AVG,GENDER_bin,GENDER_bin
Unnamed: 0_level_1,mean,median,mean,median,mean,median,mean,median,mean,median,mean,median,mean,median,mean,median,mean,median
GENDER,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2
F,60.8953,61,59.959944,60,61.45112,59,61.338687,66,58.573865,63,66.532421,72,62.148324,64.333333,60.768788,60.0,1,1
M,60.698426,61,61.215255,61,62.040581,59,62.788409,69,59.60627,65,68.159473,76,63.518051,66.333333,61.318087,60.666667,0,0


# Standardized dataframe

In [12]:
# Build a second, independent frame (`df0`) from the same workbook for the
# standardized analysis: same column pruning, with the raw averages stored
# under an `_org` suffix.
# The drop returns a new frame instead of using `inplace=True`.
df0 = pd.read_excel("data_academic_performance.xlsx").drop(
    columns=[
        "Unnamed: 9", "COD_S11",
        "EDU_FATHER", "EDU_MOTHER", "OCC_FATHER", "OCC_MOTHER",
        "SISBEN", "PEOPLE_HOUSE",
        "INTERNET", "TV", "COMPUTER", "WASHING_MCH", "MIC_OVEN", "CAR",
        "DVD", "FRESH", "PHONE", "MOBILE",
        "REVENUE", "JOB",
        "SCHOOL_NAME", "SCHOOL_NAT", "SCHOOL_TYPE",
        "MAT_S11", "BIO_S11",
        "Cod_SPro", "UNIVERSITY", "ACADEMIC_PROGRAM",
        "QR_PRO", "WC_PRO", "FEP_PRO",
        "G_SC", "PERCENTILE", "2ND_DECILE", "QUARTILE",
        "SEL", "SEL_IHE",
    ]
)

# Per-row averages on the original scale.
df0["COL_GRADE_AVG_org"] = df0[["CR_PRO", "CC_PRO", "ENG_PRO"]].mean(axis=1)
df0["HI_GRADE_AVG_org"] = df0[["CR_S11", "CC_S11", "ENG_S11"]].mean(axis=1)

# One indicator column per GENDER value. The dtype is pinned so the result is
# 0/1 integers on every pandas version (pandas >= 2.0 defaults to True/False).
df_one = pd.get_dummies(df0["GENDER"], dtype="uint8")
# Female = 1, male = 0
df0["GENDER_bin"] = df_one["F"]
df0

Unnamed: 0,GENDER,STRATUM,CR_S11,CC_S11,ENG_S11,CR_PRO,CC_PRO,ENG_PRO,COL_GRADE_AVG_org,HI_GRADE_AVG_org,GENDER_bin
0,F,Stratum 4,81,61,82,93,71,93,85.666667,74.666667,1
1,F,Stratum 5,75,66,88,38,86,98,74.000000,76.333333,1
2,M,Stratum 2,49,38,42,1,18,43,20.666667,43.000000,0
3,F,Stratum 2,55,51,73,35,76,80,63.666667,59.666667,1
4,M,Stratum 4,65,76,92,94,98,100,97.333333,77.666667,0
...,...,...,...,...,...,...,...,...,...,...,...
12406,M,Stratum 2,69,70,81,71,86,87,81.333333,73.333333,0
12407,M,Stratum 2,57,61,53,39,44,11,31.333333,57.000000,0
12408,M,Stratum 2,69,75,58,88,90,81,86.333333,67.333333,0
12409,F,Stratum 3,69,64,52,80,51,8,46.333333,61.666667,1


In [18]:
# Z-score both grade averages: subtract the column mean and divide by the
# column standard deviation (pandas' default `Series.std`).
# Keys are the new standardized columns, values the raw columns they come from.
standardized_from = {
    "COL_GRADE_AVG": "COL_GRADE_AVG_org",
    "HI_GRADE_AVG": "HI_GRADE_AVG_org",
}
for target_col, source_col in standardized_from.items():
    raw_scores = df0[source_col]
    df0[target_col] = (raw_scores - raw_scores.mean()) / raw_scores.std()

In [19]:
# Sanity check: after standardizing, the mean should be ~0 and the std ~1.
col_standardized = df0["COL_GRADE_AVG"]
print(col_standardized.mean(), col_standardized.std())

-1.104317221379185e-14 1.0000000000000047


In [21]:
# Sanity check: after standardizing, the mean should be ~0 and the std ~1.
hi_standardized = df0["HI_GRADE_AVG"]
print(hi_standardized.mean(), hi_standardized.std())

-1.8119929320857652e-14 0.9999999999999991


In [22]:
# Display df0, which now carries both the raw (`_org`) and the standardized averages.
df0

Unnamed: 0,GENDER,STRATUM,CR_S11,CC_S11,ENG_S11,CR_PRO,CC_PRO,ENG_PRO,COL_GRADE_AVG_org,HI_GRADE_AVG_org,GENDER_bin,COL_GRADE_AVG,HI_GRADE_AVG
0,F,Stratum 4,81,61,82,93,71,93,85.666667,74.666667,1,0.995599,1.371666
1,F,Stratum 5,75,66,88,38,86,98,74.000000,76.333333,1,0.484028,1.540112
2,M,Stratum 2,49,38,42,1,18,43,20.666667,43.000000,0,-1.854585,-1.828806
3,F,Stratum 2,55,51,73,35,76,80,63.666667,59.666667,1,0.030921,-0.144347
4,M,Stratum 4,65,76,92,94,98,100,97.333333,77.666667,0,1.507171,1.674869
...,...,...,...,...,...,...,...,...,...,...,...,...,...
12406,M,Stratum 2,69,70,81,71,86,87,81.333333,73.333333,0,0.805587,1.236910
12407,M,Stratum 2,57,61,53,39,44,11,31.333333,57.000000,0,-1.386863,-0.413860
12408,M,Stratum 2,69,75,58,88,90,81,86.333333,67.333333,0,1.024832,0.630504
12409,F,Stratum 3,69,64,52,80,51,8,46.333333,61.666667,1,-0.729128,0.057788


In [24]:
# Keep only the rows whose STRATUM is not the string '0', then save the result.
# NOTE(review): the z-scores were computed before this filter, so the saved
# columns may no longer have exactly mean 0 / std 1 — confirm this is intended.
has_valid_stratum = df0["STRATUM"] != "0"
df0 = df0[has_valid_stratum]
df0.to_csv(path_or_buf="df_sum_score_standardized.csv")

In [25]:
# Boolean mask over df0's columns: False for GENDER, True for every other column.
is_gender_column = df0.columns.isin(["GENDER"])
~is_gender_column

array([False,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True])