# Filtering and cleaning of dataframe to save for use

Load packages

In [1]:
## Only run this block the first time you run the script
# These packages are Excel reader backends used by pd.read_excel below.
# Prefer the %pip magic so the install targets the running kernel's environment.
# %pip install xlrd
# %pip install openpyxl

In [2]:
# Packages and libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

Load initial data

In [12]:
# Display option: never truncate the column list when a DataFrame is rendered
pd.set_option("display.max_columns", None)

# Raw data: read the academic-performance workbook into the working frame
df = pd.read_excel(io="data_academic_performance.xlsx")

# Uncomment to preview the first rows of the raw frame
# df.head()

In [13]:
# Columns that are not needed for the score analysis and are removed from `df`
columns_to_drop = [
    "Unnamed: 9", "COD_S11",
    "EDU_FATHER", "EDU_MOTHER", "OCC_FATHER", "OCC_MOTHER",
    "SISBEN", "PEOPLE_HOUSE", "INTERNET", "TV", "COMPUTER",
    "WASHING_MCH", "MIC_OVEN", "CAR", "DVD", "FRESH", "PHONE", "MOBILE",
    "REVENUE", "JOB",
    "SCHOOL_NAME", "SCHOOL_NAT", "SCHOOL_TYPE",
    "MAT_S11", "BIO_S11",
    "Cod_SPro", "UNIVERSITY", "ACADEMIC_PROGRAM",
    "QR_PRO", "WC_PRO", "FEP_PRO",
    "G_SC", "PERCENTILE", "2ND_DECILE", "QUARTILE",
    "SEL", "SEL_IHE",
]

# Keep the result under the same name so the following cells keep working
df = df.drop(columns=columns_to_drop)

In [14]:
# Row-wise mean of the three college (*_PRO) and three high-school (*_S11) scores
college_score_cols = ["CR_PRO", "CC_PRO", "ENG_PRO"]
highschool_score_cols = ["CR_S11", "CC_S11", "ENG_S11"]

df["COL_GRADE_AVG"] = df[college_score_cols].mean(axis="columns")
df["HI_GRADE_AVG"] = df[highschool_score_cols].mean(axis="columns")

In [15]:
# Swap every space for '_' (e.g. in the stratum labels).
# The regex replace is applied to the whole frame, i.e. to all string columns.
df = df.replace(to_replace=" ", value="_", regex=True)

In [16]:
# Create the dummy columns for the GENDER values.
# dtype=int is requested explicitly: recent pandas versions return boolean
# dummies by default, which would turn GENDER_bin into True/False and break
# the arithmetic applied to it in the scaling cells further down.
df_one = pd.get_dummies(df["GENDER"], dtype=int)

# Binary gender indicator: 1 where GENDER is "F", 0 otherwise
df["GENDER_bin"] = df_one["F"]

# final df
df

Unnamed: 0,GENDER,STRATUM,CR_S11,CC_S11,ENG_S11,CR_PRO,CC_PRO,ENG_PRO,COL_GRADE_AVG,HI_GRADE_AVG,GENDER_bin
0,F,Stratum_4,81,61,82,93,71,93,85.666667,74.666667,1
1,F,Stratum_5,75,66,88,38,86,98,74.000000,76.333333,1
2,M,Stratum_2,49,38,42,1,18,43,20.666667,43.000000,0
3,F,Stratum_2,55,51,73,35,76,80,63.666667,59.666667,1
4,M,Stratum_4,65,76,92,94,98,100,97.333333,77.666667,0
...,...,...,...,...,...,...,...,...,...,...,...
12406,M,Stratum_2,69,70,81,71,86,87,81.333333,73.333333,0
12407,M,Stratum_2,57,61,53,39,44,11,31.333333,57.000000,0
12408,M,Stratum_2,69,75,58,88,90,81,86.333333,67.333333,0
12409,F,Stratum_3,69,64,52,80,51,8,46.333333,61.666667,1


In [8]:
# Write the cleaned, un-scaled frame (index included) to disk
df.to_csv(path_or_buf="df_sum_score.csv")

# Normalized dataframe
Rescale each numeric feature to the range [0, 1] by subtracting the feature's minimum value and then dividing by its range: $x' = \frac{x - \min(x)}{\max(x) - \min(x)}$.


In [10]:
# Apply min-max normalization to a copy so the un-scaled `df` stays available
dfn = df.copy()

# Scale only the numeric/boolean columns. Selecting them explicitly replaces
# the former bare `except: pass`, which skipped the text columns by swallowing
# every exception and would therefore also have hidden genuine errors.
normalized_cols = dfn.select_dtypes(include=["number", "bool"]).columns.tolist()

# Cast to float first so boolean indicator columns can take part in the arithmetic
numeric_values = dfn[normalized_cols].astype(float)
column_min = numeric_values.min()
column_range = numeric_values.max() - column_min

# x' = (x - min) / (max - min), computed column by column.
# NOTE: a constant column has a range of 0 and ends up as NaN.
dfn[normalized_cols] = (numeric_values - column_min) / column_range

# view normalized data
dfn

Unnamed: 0,GENDER,STRATUM,CR_S11,CC_S11,ENG_S11,CR_PRO,CC_PRO,ENG_PRO,COL_GRADE_AVG,HI_GRADE_AVG,GENDER_bin
0,F,Stratum 4,0.750000,0.61,0.756757,0.929293,0.707071,0.929293,0.855219,0.657658,1.0
1,F,Stratum 5,0.671053,0.66,0.837838,0.373737,0.858586,0.979798,0.737374,0.680180,1.0
2,M,Stratum 2,0.328947,0.38,0.216216,0.000000,0.171717,0.424242,0.198653,0.229730,0.0
3,F,Stratum 2,0.407895,0.51,0.635135,0.343434,0.757576,0.797980,0.632997,0.454955,1.0
4,M,Stratum 4,0.539474,0.76,0.891892,0.939394,0.979798,1.000000,0.973064,0.698198,0.0
...,...,...,...,...,...,...,...,...,...,...,...
12406,M,Stratum 2,0.592105,0.70,0.743243,0.707071,0.858586,0.868687,0.811448,0.639640,0.0
12407,M,Stratum 2,0.434211,0.61,0.364865,0.383838,0.434343,0.101010,0.306397,0.418919,0.0
12408,M,Stratum 2,0.592105,0.75,0.432432,0.878788,0.898990,0.808081,0.861953,0.558559,0.0
12409,F,Stratum 3,0.592105,0.64,0.351351,0.797980,0.505051,0.070707,0.457912,0.481982,1.0


Save normalized df

In [11]:
# Write the min-max normalized frame (index included) to disk
dfn.to_csv(path_or_buf="df_normalized_score.csv")

# Standardized dataframe
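Standardize each numeric feature by subtracting its mean and dividing by its standard deviation, so that every feature has mean 0 and standard deviation 1: $z = \frac{x - \mu}{\sigma}$.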


In [25]:
# Apply standardization to a copy so the un-scaled `df` stays available
dfs = df.copy()

# Standardize only the numeric/boolean columns. Selecting them explicitly
# replaces the former bare `except: pass`, which skipped the text columns by
# swallowing every exception and would therefore also have hidden genuine errors.
standardized_cols = dfs.select_dtypes(include=["number", "bool"]).columns.tolist()

# Cast to float first so boolean indicator columns can take part in the arithmetic
numeric_values = dfs[standardized_cols].astype(float)

# z = (x - mean) / std, computed column by column (pandas' default sample std)
dfs[standardized_cols] = (numeric_values - numeric_values.mean()) / numeric_values.std()

# Check means are 0 and std 1
print("mean column HI grade: ", dfs.HI_GRADE_AVG.mean(), "std column HI grade: ", dfs.HI_GRADE_AVG.std(),)
print("mean column COL grade: ", dfs.COL_GRADE_AVG.mean(), "std column COL grade: ", dfs.COL_GRADE_AVG.std(),)

# view standardized data
dfs

mean column HI grade:  -1.8119929320857652e-14 std column HI grade:  0.9999999999999991
mean column COL grade:  -1.104317221379185e-14 std column COL grade:  1.0000000000000047


Unnamed: 0,GENDER,STRATUM,CR_S11,CC_S11,ENG_S11,CR_PRO,CC_PRO,ENG_PRO,COL_GRADE_AVG,HI_GRADE_AVG,GENDER_bin
0,F,Stratum_4,2.016939,0.029131,1.412733,1.113281,0.407467,1.000257,0.995599,1.371666,1.208684
1,F,Stratum_5,1.418487,0.523176,1.832378,-0.874678,0.924854,1.196373,0.484028,1.540112,1.208684
2,M,Stratum_2,-1.174802,-2.243479,-1.384905,-2.212033,-1.420633,-0.960904,-1.854585,-1.828806,-0.827279
3,F,Stratum_2,-0.576351,-0.958960,0.783264,-0.983113,0.579930,0.490355,0.030921,-0.144347,1.208684
4,M,Stratum_4,0.421068,1.511268,2.112142,1.149426,1.338764,1.274820,1.507171,1.674869,-0.827279
...,...,...,...,...,...,...,...,...,...,...,...
12406,M,Stratum_2,0.820036,0.918413,1.342792,0.318097,0.924854,0.764918,0.805587,1.236910,-0.827279
12407,M,Stratum_2,-0.376867,0.029131,-0.615555,-0.838534,-0.523829,-2.216048,-1.386863,-0.413860,-0.827279
12408,M,Stratum_2,0.820036,1.412458,-0.265850,0.932558,1.062824,0.529578,1.024832,0.630504,-0.827279
12409,F,Stratum_3,0.820036,0.325558,-0.685496,0.643400,-0.282382,-2.333717,-0.729128,0.057788,1.208684


Save standardized df

In [26]:
dfs.to_csv("df_standardized_score.csv")

# One hot encoding

#### Build a list of [GENDER_bin, HI_GRADE_AVG] feature pairs (one pair per row)

In [9]:
# Independent two-column frame holding the features used per row:
# the binary gender indicator and the high-school grade average
feature_columns = ["GENDER_bin", "HI_GRADE_AVG"]
df_document_features = df[feature_columns].copy()


In [16]:
# Show the row labels of `df` as a NumPy array
df.index.to_numpy()


array([    0,     1,     2, ..., 12408, 12409, 12410], dtype=int64)

In [11]:
# Convert the feature frame into a nested list: one [GENDER_bin, HI_GRADE_AVG] pair per row.
# NOTE: this rebinds the name from a DataFrame to a list, so the cell can only be run once.
df_document_features = df_document_features.to_numpy().tolist()

In [12]:
# Turn the nested feature list into a 2-D NumPy array
doc_features = np.asarray(df_document_features)

In [13]:
# Scores: the un-scaled college grade average of every row, as a NumPy array
college_avg = df["COL_GRADE_AVG"]
doc_scores = college_avg.to_numpy()
doc_scores

array([85.66666667, 74.        , 20.66666667, ..., 86.33333333,
       46.33333333, 88.33333333])

# Basic stats

In [14]:
# Mean and median of every numeric column, split by gender.
# The numeric columns are selected explicitly: recent pandas versions no longer
# drop text columns (such as STRATUM) silently and raise a TypeError when asked
# to average them.
stat_columns = df.select_dtypes(include=["number", "bool"]).columns.tolist()
df.groupby("GENDER")[stat_columns].agg(["mean", "median"])

Unnamed: 0_level_0,CR_S11,CR_S11,CC_S11,CC_S11,ENG_S11,ENG_S11,CR_PRO,CR_PRO,CC_PRO,CC_PRO,ENG_PRO,ENG_PRO,COL_GRADE_AVG,COL_GRADE_AVG,HI_GRADE_AVG,HI_GRADE_AVG,GENDER_bin,GENDER_bin
Unnamed: 0_level_1,mean,median,mean,median,mean,median,mean,median,mean,median,mean,median,mean,median,mean,median,mean,median
GENDER,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2
F,60.8953,61,59.959944,60,61.45112,59,61.338687,66,58.573865,63,66.532421,72,62.148324,64.333333,60.768788,60.0,1,1
M,60.698426,61,61.215255,61,62.040581,59,62.788409,69,59.60627,65,68.159473,76,63.518051,66.333333,61.318087,60.666667,0,0
