# Scaling Numerical Data
## Demo

In [18]:
import pandas as pd

In [19]:
# Print the installed pandas version so the environment used for this run is recorded with the results.
print(pd.__version__)

1.2.1


### Load Data

In [20]:
# Download the exam dataset as CSV; the URL asks the generator for n=100 rows of the "exams" dataset.
# NOTE(review): the endpoint name ("generate_data") suggests rows are generated per request,
# so the values shown below may differ between runs -- confirm before relying on exact numbers.
exam_data = pd.read_csv(
    'http://roycekimmons.com/system/generate_data.php?dataset=exams&n=100',
    quotechar='"',
)
exam_data

Unnamed: 0,gender,race/ethnicity,parental level of education,lunch,test preparation course,math score,reading score,writing score
0,male,group E,bachelor's degree,standard,completed,49,52,52
1,male,group C,associate's degree,free/reduced,none,31,42,28
2,male,group D,some college,free/reduced,completed,76,70,69
3,female,group C,associate's degree,standard,completed,90,99,94
4,female,group A,associate's degree,standard,none,87,88,95
...,...,...,...,...,...,...,...,...
95,male,group E,high school,standard,completed,61,61,56
96,female,group D,master's degree,free/reduced,none,75,78,75
97,female,group D,some high school,standard,none,53,62,69
98,female,group B,high school,standard,completed,75,84,88


### Show current average

In [21]:
# Mean of each raw score column, taken before any scaling is applied,
# so it can be compared with the means of the scaled columns later on.
math_average = exam_data['math score'].mean()
reading_average = exam_data['reading score'].mean()
# A chained assignment here used to bind an extra, unused `average` name as well;
# only the per-subject variable is needed.
writing_average = exam_data['writing score'].mean()

print('Math Avg: ', math_average)
print('Reading Avg: ', reading_average)
print('Writing Avg: ', writing_average)

Math Avg:  66.32
Reading Avg:  68.2
Writing Avg:  67.75


### Scale data

In [22]:
from sklearn import preprocessing

# Standardize each score column in place, one column at a time.
# With its default arguments preprocessing.scale centers the values to zero mean
# and scales them to unit variance.
for score_column in ('math score', 'reading score', 'writing score'):
    exam_data[[score_column]] = preprocessing.scale(exam_data[[score_column]])

In [23]:
# Display the frame again; the three score columns now hold the scaled values.
exam_data

Unnamed: 0,gender,race/ethnicity,parental level of education,lunch,test preparation course,math score,reading score,writing score
0,male,group E,bachelor's degree,standard,completed,-1.167349,-1.065375,-0.976722
1,male,group C,associate's degree,free/reduced,none,-2.380530,-1.723013,-2.465060
2,male,group D,some college,free/reduced,completed,0.652421,0.118375,0.077518
3,female,group C,associate's degree,standard,completed,1.596006,2.025527,1.627870
4,female,group A,associate's degree,standard,none,1.393810,1.302124,1.689884
...,...,...,...,...,...,...,...,...
95,male,group E,high school,standard,completed,-0.358562,-0.473500,-0.728666
96,female,group D,master's degree,free/reduced,none,0.585023,0.644486,0.449602
97,female,group D,some high school,standard,none,-0.897754,-0.407736,0.077518
98,female,group B,high school,standard,completed,0.585023,1.039069,1.255785


In [24]:
# Recompute the column means after scaling. Each one should be zero up to
# floating-point rounding error (values on the order of 1e-16).
math_average = exam_data['math score'].mean()
reading_average = exam_data['reading score'].mean()
# A chained assignment here used to bind an extra, unused `average` name as well;
# only the per-subject variable is needed.
writing_average = exam_data['writing score'].mean()

print('Math Avg: ', math_average)
print('Reading Avg: ', reading_average)
print('Writing Avg: ', writing_average)

Math Avg:  4.804490139065365e-16
Reading Avg:  -2.2759572004815707e-16
Writing Avg:  -2.220446049250313e-18


### Label-encode gender

In [25]:
# Replace the text labels in 'gender' with integer codes.
# The fitted encoder stays in `le` so the code-to-label mapping can be read back
# from `le.classes_` afterwards.
le = preprocessing.LabelEncoder()
gender_labels = exam_data['gender'].astype(str)
exam_data['gender'] = le.fit_transform(gender_labels)

In [26]:
# Preview the first rows; 'gender' now holds integer codes instead of text labels.
exam_data.head()

Unnamed: 0,gender,race/ethnicity,parental level of education,lunch,test preparation course,math score,reading score,writing score
0,1,group E,bachelor's degree,standard,completed,-1.167349,-1.065375,-0.976722
1,1,group C,associate's degree,free/reduced,none,-2.38053,-1.723013,-2.46506
2,1,group D,some college,free/reduced,completed,0.652421,0.118375,0.077518
3,0,group C,associate's degree,standard,completed,1.596006,2.025527,1.62787
4,0,group A,associate's degree,standard,none,1.39381,1.302124,1.689884


### Display mapping

In [27]:
# The position of each label in classes_ is the integer code assigned to it (index 0 -> code 0, and so on).
le.classes_

array(['female', 'male'], dtype=object)

### One-hot representation

In [28]:
# Preview only: build one indicator column per 'race/ethnicity' value.
# The result is displayed but not assigned, so exam_data is left unchanged.
pd.get_dummies(exam_data['race/ethnicity'])

Unnamed: 0,group A,group B,group C,group D,group E
0,0,0,0,0,1
1,0,0,1,0,0
2,0,0,0,1,0
3,0,0,1,0,0
4,1,0,0,0,0
...,...,...,...,...,...
95,0,0,0,0,1
96,0,0,0,1,0
97,0,0,0,1,0
98,0,1,0,0,0


#### Add the race/ethnicity one-hot columns to the table

In [29]:
# Swap the 'race/ethnicity' column for one indicator column per group.
# Passing the whole frame with `columns=` keeps every other column as it is.
# Note: re-running this cell fails, because the source column no longer exists afterwards.
exam_data = pd.get_dummies(data=exam_data, columns=['race/ethnicity'])

In [30]:
# Display the frame; 'race/ethnicity' has been replaced by 'race/ethnicity_<group>' indicator columns.
exam_data

Unnamed: 0,gender,parental level of education,lunch,test preparation course,math score,reading score,writing score,race/ethnicity_group A,race/ethnicity_group B,race/ethnicity_group C,race/ethnicity_group D,race/ethnicity_group E
0,1,bachelor's degree,standard,completed,-1.167349,-1.065375,-0.976722,0,0,0,0,1
1,1,associate's degree,free/reduced,none,-2.380530,-1.723013,-2.465060,0,0,1,0,0
2,1,some college,free/reduced,completed,0.652421,0.118375,0.077518,0,0,0,1,0
3,0,associate's degree,standard,completed,1.596006,2.025527,1.627870,0,0,1,0,0
4,0,associate's degree,standard,none,1.393810,1.302124,1.689884,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...
95,1,high school,standard,completed,-0.358562,-0.473500,-0.728666,0,0,0,0,1
96,0,master's degree,free/reduced,none,0.585023,0.644486,0.449602,0,0,0,1,0
97,0,some high school,standard,none,-0.897754,-0.407736,0.077518,0,0,0,1,0
98,0,high school,standard,completed,0.585023,1.039069,1.255785,0,1,0,0,0


#### Multi-column one-hot encoding

In [31]:
# One-hot encode several categorical columns in a single call.
# Each listed column is replaced by '<column>_<value>' indicator columns.
categorical_columns = [
    'parental level of education',
    'lunch',
    'test preparation course',
]
exam_data = pd.get_dummies(exam_data, columns=categorical_columns)

In [32]:
# Preview the first rows of the fully encoded frame (all categorical columns are now numeric indicators).
exam_data.head()

Unnamed: 0,gender,math score,reading score,writing score,race/ethnicity_group A,race/ethnicity_group B,race/ethnicity_group C,race/ethnicity_group D,race/ethnicity_group E,parental level of education_associate's degree,parental level of education_bachelor's degree,parental level of education_high school,parental level of education_master's degree,parental level of education_some college,parental level of education_some high school,lunch_free/reduced,lunch_standard,test preparation course_completed,test preparation course_none
0,1,-1.167349,-1.065375,-0.976722,0,0,0,0,1,0,1,0,0,0,0,0,1,1,0
1,1,-2.38053,-1.723013,-2.46506,0,0,1,0,0,1,0,0,0,0,0,1,0,0,1
2,1,0.652421,0.118375,0.077518,0,0,0,1,0,0,0,0,0,1,0,1,0,1,0
3,0,1.596006,2.025527,1.62787,0,0,1,0,0,1,0,0,0,0,0,0,1,1,0
4,0,1.39381,1.302124,1.689884,1,0,0,0,0,1,0,0,0,0,0,0,1,0,1
