# Data

### Source

http://archive.ics.uci.edu/ml/machine-learning-databases/00320/
http://archive.ics.uci.edu/ml/datasets/Student+Performance

### Load and Understand Data

In [1]:
import numpy as np
import pandas as pd
import scipy as sp
import matplotlib.pyplot as plt
import matplotlib
from mpl_toolkits.mplot3d import Axes3D
%matplotlib inline

In [4]:
# Read the two semicolon-separated course files: Math and Portuguese language
student_mat = pd.read_csv('datasets/student-mat.csv', sep=';')
student_por = pd.read_csv('datasets/student-por.csv', sep=';')

#### Attributes for both student-mat.csv (Math course) and student-por.csv (Portuguese language course) datasets:
1. school - student's school (binary: "GP" - Gabriel Pereira or "MS" - Mousinho da Silveira)
2. sex - student's sex (binary: "F" - female or "M" - male)
3. age - student's age (numeric: from 15 to 22)
4. address - student's home address type (binary: "U" - urban or "R" - rural)
5. famsize - family size (binary: "LE3" - less or equal to 3 or "GT3" - greater than 3)
6. Pstatus - parent's cohabitation status (binary: "T" - living together or "A" - apart)
7. Medu - mother's education (numeric: 0 - none,  1 - primary education (4th grade), 2 – 5th to 9th grade, 3 – secondary education or 4 – higher education)
8. Fedu - father's education (numeric: 0 - none,  1 - primary education (4th grade), 2 – 5th to 9th grade, 3 – secondary education or 4 – higher education)
9. Mjob - mother's job (nominal: "teacher", "health" care related, civil "services" (e.g. administrative or police), "at_home" or "other")
10. Fjob - father's job (nominal: "teacher", "health" care related, civil "services" (e.g. administrative or police), "at_home" or "other")
11. reason - reason to choose this school (nominal: close to "home", school "reputation", "course" preference or "other")
12. guardian - student's guardian (nominal: "mother", "father" or "other")
13. traveltime - home to school travel time (numeric: 1 - <15 min., 2 - 15 to 30 min., 3 - 30 min. to 1 hour, or 4 - >1 hour)
14. studytime - weekly study time (numeric: 1 - <2 hours, 2 - 2 to 5 hours, 3 - 5 to 10 hours, or 4 - >10 hours)
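15. failures - number of past class failures (numeric: n if 1<=n<3, else 4, as stated in the source description; note that the loaded data actually contains the values 0 to 3)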
16. schoolsup - extra educational support (binary: yes or no)
17. famsup - family educational support (binary: yes or no)
18. paid - extra paid classes within the course subject (Math or Portuguese) (binary: yes or no)
19. activities - extra-curricular activities (binary: yes or no)
20. nursery - attended nursery school (binary: yes or no)
21. higher - wants to take higher education (binary: yes or no)
22. internet - Internet access at home (binary: yes or no)
23. romantic - in a romantic relationship (binary: yes or no)
24. famrel - quality of family relationships (numeric: from 1 - very bad to 5 - excellent)
25. freetime - free time after school (numeric: from 1 - very low to 5 - very high)
26. goout - going out with friends (numeric: from 1 - very low to 5 - very high)
27. Dalc - workday alcohol consumption (numeric: from 1 - very low to 5 - very high)
28. Walc - weekend alcohol consumption (numeric: from 1 - very low to 5 - very high)
29. health - current health status (numeric: from 1 - very bad to 5 - very good)
30. absences - number of school absences (numeric: from 0 to 93)

##### These grades are related to the course subject (Math or Portuguese):
31. G1 - first period grade (numeric: from 0 to 20)
32. G2 - second period grade (numeric: from 0 to 20)
33. G3 - final grade (numeric: from 0 to 20, output target)

Additional note: several students (382) belong to both datasets.
These students can be identified by searching for identical values of the attributes
that characterize each student, as shown in the annexed R file.


In [5]:
# Dimensions (rows, columns) of the Math course data, then a preview of its first rows.
# A single pre-formatted argument keeps the printed text identical on Python 2 and 3.
print("Shape Math course: {}".format(student_mat.shape))
student_mat.head()

Shape: (395, 33)


Unnamed: 0,school,sex,age,address,famsize,Pstatus,Medu,Fedu,Mjob,Fjob,...,famrel,freetime,goout,Dalc,Walc,health,absences,G1,G2,G3
0,GP,F,18,U,GT3,A,4,4,at_home,teacher,...,4,3,4,1,1,3,6,5,6,6
1,GP,F,17,U,GT3,T,1,1,at_home,other,...,5,3,3,1,1,3,4,5,5,6
2,GP,F,15,U,LE3,T,1,1,at_home,other,...,4,3,2,2,3,3,10,7,8,10
3,GP,F,15,U,GT3,T,4,2,health,services,...,3,2,2,1,1,5,2,15,14,15
4,GP,F,16,U,GT3,T,3,3,other,other,...,4,3,2,1,2,5,4,6,10,10


In [6]:
# Dimensions (rows, columns) of the Portuguese course data, then a preview of its first rows.
# A single pre-formatted argument keeps the printed text identical on Python 2 and 3.
print("Shape Portuguese language course: {}".format(student_por.shape))
student_por.head()

Shape: (649, 33)


Unnamed: 0,school,sex,age,address,famsize,Pstatus,Medu,Fedu,Mjob,Fjob,...,famrel,freetime,goout,Dalc,Walc,health,absences,G1,G2,G3
0,GP,F,18,U,GT3,A,4,4,at_home,teacher,...,4,3,4,1,1,3,4,0,11,11
1,GP,F,17,U,GT3,T,1,1,at_home,other,...,5,3,3,1,1,3,2,9,11,11
2,GP,F,15,U,LE3,T,1,1,at_home,other,...,4,3,2,2,3,3,6,12,13,12
3,GP,F,15,U,GT3,T,4,2,health,services,...,3,2,2,1,1,5,0,14,14,14
4,GP,F,16,U,GT3,T,3,3,other,other,...,4,3,2,1,2,5,0,11,13,13


In [7]:
# Utility functions to evaluate data
def evaluate_data(df, max_categories=15):
    """Print a one-line summary of the values found in each column of ``df``.

    A column with fewer than ``max_categories`` distinct values is reported as
    categorical, together with its sorted unique values. Any other column is
    reported as continuous: object-dtype columns show only the number of
    distinct values, all others also show their minimum and maximum.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to summarise. It is not modified.
    max_categories : int, optional
        Columns with fewer distinct values than this are treated as
        categorical. Defaults to 15.

    Returns
    -------
    None
        The summary is written to standard output, one line per column.
    """
    # Walk the columns by position so each one is always a single Series.
    for position, name in enumerate(df.columns):
        column = df.iloc[:, position]
        vals = np.unique(column)
        # Each print() gets one pre-formatted string so the text is the same
        # on Python 2 and Python 3.
        if len(vals) < max_categories:
            print('{} : (Categorical) {} unique value(s) - {}'.format(name, len(vals), vals))
            continue

        n_unique = column.unique().size
        if column.dtype == object:
            # Strings have no meaningful numeric range; report only the count.
            print('{} : (Continuous) range of values of type string {{ {}  values}}'.format(name, n_unique))
        else:
            print('{} : (Continuous) range of values -  [ {} to {}]  {{ {}  values}}'.format(
                name, column.min(), column.max(), n_unique))

def columns_with_null(df):
    """Print each column of ``df`` that contains null values, with its null count.

    Columns without nulls produce no output, so a call that prints nothing
    means no null values were found.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect. It is not modified.

    Returns
    -------
    None
        One line per affected column is written to standard output.
    """
    # Count nulls for all columns in one pass instead of filtering the frame
    # once per column.
    null_counts = df.isnull().sum()
    for column, count in null_counts.items():
        if count > 0:
            # One pre-formatted string keeps the text the same on Python 2 and 3.
            print("Column  {}  contain null values / Count =  {}".format(column, count))

In [8]:
# Evaluate the data: print a per-column value summary for the Math course frame
evaluate_data(student_mat)

school : (Categorical) 2 unique value(s) - ['GP' 'MS']
sex : (Categorical) 2 unique value(s) - ['F' 'M']
age : (Categorical) 8 unique value(s) - [15 16 17 18 19 20 21 22]
address : (Categorical) 2 unique value(s) - ['R' 'U']
famsize : (Categorical) 2 unique value(s) - ['GT3' 'LE3']
Pstatus : (Categorical) 2 unique value(s) - ['A' 'T']
Medu : (Categorical) 5 unique value(s) - [0 1 2 3 4]
Fedu : (Categorical) 5 unique value(s) - [0 1 2 3 4]
Mjob : (Categorical) 5 unique value(s) - ['at_home' 'health' 'other' 'services' 'teacher']
Fjob : (Categorical) 5 unique value(s) - ['at_home' 'health' 'other' 'services' 'teacher']
reason : (Categorical) 4 unique value(s) - ['course' 'home' 'other' 'reputation']
guardian : (Categorical) 3 unique value(s) - ['father' 'mother' 'other']
traveltime : (Categorical) 4 unique value(s) - [1 2 3 4]
studytime : (Categorical) 4 unique value(s) - [1 2 3 4]
failures : (Categorical) 4 unique value(s) - [0 1 2 3]
schoolsup : (Categorical) 2 unique value(s) - ['no' 

In [9]:
# Report the Math course columns that contain null values
# (the helper prints nothing when no column has nulls)
columns_with_null(student_mat)

In [10]:
# Evaluate the data: print a per-column value summary for the Portuguese course frame
evaluate_data(student_por)

school : (Categorical) 2 unique value(s) - ['GP' 'MS']
sex : (Categorical) 2 unique value(s) - ['F' 'M']
age : (Categorical) 8 unique value(s) - [15 16 17 18 19 20 21 22]
address : (Categorical) 2 unique value(s) - ['R' 'U']
famsize : (Categorical) 2 unique value(s) - ['GT3' 'LE3']
Pstatus : (Categorical) 2 unique value(s) - ['A' 'T']
Medu : (Categorical) 5 unique value(s) - [0 1 2 3 4]
Fedu : (Categorical) 5 unique value(s) - [0 1 2 3 4]
Mjob : (Categorical) 5 unique value(s) - ['at_home' 'health' 'other' 'services' 'teacher']
Fjob : (Categorical) 5 unique value(s) - ['at_home' 'health' 'other' 'services' 'teacher']
reason : (Categorical) 4 unique value(s) - ['course' 'home' 'other' 'reputation']
guardian : (Categorical) 3 unique value(s) - ['father' 'mother' 'other']
traveltime : (Categorical) 4 unique value(s) - [1 2 3 4]
studytime : (Categorical) 4 unique value(s) - [1 2 3 4]
failures : (Categorical) 4 unique value(s) - [0 1 2 3]
schoolsup : (Categorical) 2 unique value(s) - ['no' 

In [11]:
# Report the Portuguese course columns that contain null values
# (the helper prints nothing when no column has nulls)
columns_with_null(student_por)