# Simple regression on dataset with categorical values

Dataset : http://roycekimmons.com/tools/generated_data/exams  

In [53]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [54]:
exam_df = pd.read_csv("./datasets/exams.csv")

In [55]:
exam_df.shape

(100, 8)

In [56]:
exam_df.sample(4)

Unnamed: 0,gender,race/ethnicity,parental level of education,lunch,test preparation course,math score,reading score,writing score
75,male,group C,bachelor's degree,standard,completed,80,78,77
24,female,group D,high school,standard,none,66,69,67
60,female,group C,bachelor's degree,free/reduced,none,47,71,62
86,female,group B,bachelor's degree,free/reduced,none,39,44,46


In [57]:
exam_df.dtypes

gender                         object
race/ethnicity                 object
parental level of education    object
lunch                          object
test preparation course        object
math score                      int64
reading score                   int64
writing score                   int64
dtype: object

In [58]:
exam_df.describe()

Unnamed: 0,math score,reading score,writing score
count,100.0,100.0,100.0
mean,66.73,69.98,69.14
std,15.631395,13.732642,14.886792
min,18.0,25.0,20.0
25%,58.0,61.0,62.0
50%,69.0,71.5,69.0
75%,78.25,80.0,81.0
max,96.0,94.0,93.0


## Data Pre processing

We first standardize the data. Standardizing a dataset means that the column will have 0 mean and unit variance. It will help to compare values across different distributions. It is done by subtracting the mean and dividing it by standard deviation

In [59]:
from sklearn import preprocessing

exam_df[['math score']] = preprocessing.scale(exam_df[['math score']]).astype('float64')
exam_df[['reading score']] = preprocessing.scale(exam_df[['reading score']]).astype('float64')
exam_df[['writing score']] = preprocessing.scale(exam_df[['writing score']]).astype('float64')


In [60]:
exam_df.sample(4)

Unnamed: 0,gender,race/ethnicity,parental level of education,lunch,test preparation course,math score,reading score,writing score
31,female,group C,high school,free/reduced,none,-0.239824,-0.071722,-0.144476
10,female,group D,associate's degree,standard,none,-0.046936,0.733324,0.665669
38,male,group E,high school,standard,none,1.110394,-0.437653,-0.414524
99,male,group B,bachelor's degree,standard,none,0.660321,-0.071722,-0.144476


here the negative score means that the values are below the mean and the positive score means that they are above the mean.

In [61]:
exam_df.describe()

Unnamed: 0,math score,reading score,writing score
count,100.0,100.0,100.0
mean,-2.642331e-16,-3.04895e-16,-4.4686480000000004e-17
std,1.005038,1.005038,1.005038
min,-3.133149,-3.291909,-3.317542
25%,-0.561305,-0.6572107,-0.482036
50%,0.1459522,0.1112428,-0.009451687
75%,0.7406911,0.7333242,0.8006929
max,1.881947,1.757929,1.610838


In [62]:
exam_df['parental level of education'].unique()

array(["associate's degree", 'some college', 'high school',
       "bachelor's degree", 'some high school', "master's degree"],
      dtype=object)

This column seems like there is a certain ordering in the values from lower to higher. So we can use label_encoding to convert the string categorical values to numeric form

In [63]:
parent_level_of_education = [
    'some high school',
    'high school',
    'some college',
    "associate's degree",
    "bachelor's degree",
    "master's degree"
]

In [64]:
label_encoding = preprocessing.LabelEncoder()
label_encoding = label_encoding.fit(parent_level_of_education)

In [65]:
exam_df['parental level of education'] = label_encoding.transform(exam_df['parental level of education'].astype(str))

In [66]:
exam_df.sample(4)

Unnamed: 0,gender,race/ethnicity,parental level of education,lunch,test preparation course,math score,reading score,writing score
39,female,group D,4,standard,none,0.531729,1.611557,1.340789
19,male,group E,4,standard,none,0.531729,-0.29128,-0.347012
70,female,group C,4,standard,none,1.110394,1.465185,1.543325
56,male,group D,1,standard,none,1.49617,0.586952,0.530645


In [67]:
label_encoding.classes_

array(["associate's degree", "bachelor's degree", 'high school',
       "master's degree", 'some college', 'some high school'],
      dtype='<U18')

Similarly, for column race/ethnicity we can convert this string column to numeric by using one_hot_encoding. We use one hot encoding but not label encoding because, this column does not really have a intrinsic ordering associated with it.

In [68]:
exam_df = pd.get_dummies(exam_df,columns=['race/ethnicity'])
exam_df.head()

Unnamed: 0,gender,parental level of education,lunch,test preparation course,math score,reading score,writing score,race/ethnicity_group A,race/ethnicity_group B,race/ethnicity_group C,race/ethnicity_group D,race/ethnicity_group E
0,female,0,standard,none,0.210248,0.44058,-0.009452,0,0,0,0,1
1,female,4,standard,none,-1.268562,-1.315885,-1.292181,0,0,1,0,0
2,male,2,standard,none,0.531729,0.147836,-0.076964,0,0,0,0,1
3,female,4,free/reduced,completed,-1.46145,-1.315885,-1.022132,0,1,0,0,0
4,female,1,standard,completed,0.01736,0.44058,0.733181,0,1,0,0,0


In [69]:
exam_df = pd.get_dummies(exam_df,columns=['gender','lunch','test preparation course'])
exam_df.head()

Unnamed: 0,parental level of education,math score,reading score,writing score,race/ethnicity_group A,race/ethnicity_group B,race/ethnicity_group C,race/ethnicity_group D,race/ethnicity_group E,gender_female,gender_male,lunch_free/reduced,lunch_standard,test preparation course_completed,test preparation course_none
0,0,0.210248,0.44058,-0.009452,0,0,0,0,1,1,0,0,1,0,1
1,4,-1.268562,-1.315885,-1.292181,0,0,1,0,0,1,0,0,1,0,1
2,2,0.531729,0.147836,-0.076964,0,0,0,0,1,0,1,0,1,0,1
3,4,-1.46145,-1.315885,-1.022132,0,1,0,0,0,1,0,1,0,1,0
4,1,0.01736,0.44058,0.733181,0,1,0,0,0,1,0,0,1,1,0


In [70]:
exam_df.head()

Unnamed: 0,parental level of education,math score,reading score,writing score,race/ethnicity_group A,race/ethnicity_group B,race/ethnicity_group C,race/ethnicity_group D,race/ethnicity_group E,gender_female,gender_male,lunch_free/reduced,lunch_standard,test preparation course_completed,test preparation course_none
0,0,0.210248,0.44058,-0.009452,0,0,0,0,1,1,0,0,1,0,1
1,4,-1.268562,-1.315885,-1.292181,0,0,1,0,0,1,0,0,1,0,1
2,2,0.531729,0.147836,-0.076964,0,0,0,0,1,0,1,0,1,0,1
3,4,-1.46145,-1.315885,-1.022132,0,1,0,0,0,1,0,1,0,1,0
4,1,0.01736,0.44058,0.733181,0,1,0,0,0,1,0,0,1,1,0


Hence, now we have a clean data.

## Data modelling

In [71]:
from sklearn.model_selection import train_test_split
X = exam_df.drop('math score',axis=1)
Y = exam_df['math score']

x_train,x_test, y_train,y_test = train_test_split(X,Y,test_size=0.2)

In [72]:
from sklearn.linear_model import LinearRegression
linear_model= LinearRegression(fit_intercept=True).fit(x_train,y_train)

In [78]:
print("Training score ",linear_model.score(x_train,y_train))

Training score  0.8830043920083953


In [79]:
from sklearn.metrics import r2_score
print("Test Score ",r2_score(y_pred,y_test))

Test Score  0.851959386902256


In [80]:
df_pred_actual = pd.DataFrame({'math score actual':y_test,'math score predicted':y_pred})

In [81]:
df_pred_actual.head()

Unnamed: 0,math score actual,math score predicted
46,-1.011378,-0.827447
75,0.853209,0.723177
15,1.17469,1.08731
47,1.881947,1.751685
84,0.596025,0.43357
