## Building Supervised Predictive Models

### We train our model on a given dataset (training data) and evaluate its performance/generalizability on hold-out (testing) data

In [1]:
%matplotlib inline

import matplotlib
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
# Read the training split from the notebook's working directory,
# then display its dimensions as (rows, columns).
train = pd.read_csv("trainT.csv")
train.shape

(891, 12)

In [3]:
# Preview the first five rows (five is head()'s default row count).
train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [4]:
# Read the hold-out split from the notebook's working directory,
# then display its dimensions as (rows, columns).
test = pd.read_csv("testT.csv")
test.shape

(418, 11)

In [5]:
# Preview the first five rows (five is head()'s default row count).
test.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


## What happens when we don't have a separate hold-out test dataset?

### We take our dataset and split it into training data (80%) and testing data (20%)
### We will fit the model on 80% of the data and test its performance on the 20% data set

In [6]:
from sklearn import datasets, linear_model
from sklearn.model_selection import train_test_split

In [7]:
# Load the diabetes dataset that ships with scikit-learn.
diabetes = datasets.load_diabetes()

# Column labels to attach to the feature matrix.
columns = ["age", "sex", "bmi", "map", "tc", "ldl", "hdl", "tch", "ltg", "glu"]

# Features as a labelled DataFrame; y is the target (dependent variable).
df = pd.DataFrame(diabetes.data, columns=columns)
y = diabetes.target

In [8]:
# Preview the first six rows of the feature frame.
df.head(n=6)

Unnamed: 0,age,sex,bmi,map,tc,ldl,hdl,tch,ltg,glu
0,0.038076,0.05068,0.061696,0.021872,-0.044223,-0.034821,-0.043401,-0.002592,0.019908,-0.017646
1,-0.001882,-0.044642,-0.051474,-0.026328,-0.008449,-0.019163,0.074412,-0.039493,-0.06833,-0.092204
2,0.085299,0.05068,0.044451,-0.005671,-0.045599,-0.034194,-0.032356,-0.002592,0.002864,-0.02593
3,-0.089063,-0.044642,-0.011595,-0.036656,0.012191,0.024991,-0.036038,0.034309,0.022692,-0.009362
4,0.005383,-0.044642,-0.036385,0.021872,0.003935,0.015596,0.008142,-0.002592,-0.031991,-0.046641
5,-0.092695,-0.044642,-0.040696,-0.019442,-0.068991,-0.079288,0.041277,-0.076395,-0.04118,-0.096346


In [9]:
# Dimensions of the feature frame as (rows, columns).
df.shape

(442, 10)

In [10]:
# Create the training and testing variables: hold out 20% of the rows for
# testing and keep the remaining 80% for fitting.
# A fixed random_state makes the shuffled split reproducible when the
# notebook is re-run from a fresh kernel.
X_train, X_test, y_train, y_test = train_test_split(
    df, y, test_size=0.2, random_state=42
)
# Use print() calls rather than Python 2 print statements so the cell also
# runs on Python 3, matching the other print(...) calls in this notebook.
print(X_train.shape, y_train.shape)
print(X_test.shape, y_test.shape)

(353, 10) (353L,)
(89, 10) (89L,)


In [11]:
# Preview the first six rows of the training features.
X_train.head(n=6)

Unnamed: 0,age,sex,bmi,map,tc,ldl,hdl,tch,ltg,glu
301,-0.001882,0.05068,-0.024529,0.052858,0.027326,0.030001,0.030232,-0.002592,-0.021394,0.036201
33,0.030811,-0.044642,-0.050396,-0.002228,-0.044223,-0.089935,0.118591,-0.076395,-0.018118,0.003064
432,0.009016,-0.044642,0.055229,-0.005671,0.057597,0.044719,-0.002903,0.023239,0.055684,0.106617
40,0.005383,0.05068,-0.008362,0.021872,0.054845,0.073215,-0.024993,0.034309,0.012553,0.094191
156,-0.016412,-0.044642,-0.010517,0.001215,-0.037344,-0.03576,0.011824,-0.039493,-0.021394,-0.034215
75,-0.009147,0.05068,-0.030996,-0.026328,-0.011201,-0.001001,-0.021311,-0.002592,0.006209,0.027917


In [12]:
# Preview the first six rows of the testing features.
X_test.head(n=6)

Unnamed: 0,age,sex,bmi,map,tc,ldl,hdl,tch,ltg,glu
161,-0.045472,0.05068,0.063852,0.070073,0.133274,0.131461,-0.039719,0.108111,0.075738,0.085907
322,0.023546,0.05068,0.061696,0.062039,0.024574,-0.036073,-0.091262,0.155345,0.133396,0.081764
431,0.070769,0.05068,-0.030996,0.021872,-0.037344,-0.047034,0.033914,-0.039493,-0.014956,-0.001078
35,0.048974,0.05068,-0.030996,-0.04928,0.049341,-0.004132,0.133318,-0.053516,0.021311,0.019633
398,0.012648,-0.044642,0.01535,-0.033214,0.041086,0.032193,-0.002903,-0.002592,0.045066,-0.067351
302,0.012648,-0.044642,0.033673,0.033349,0.030078,0.027183,-0.002903,0.008847,0.031193,0.027917


### Cons: Splitting data can lead to unstable results when we have a small data set

## Cross-Validation

### 1. Divide your data into folds (each fold is a container that holds an even distribution of the cases), usually 5 or 10 (5-fold CV and 10-fold CV respectively)
### 2. Hold out one fold as the test set and use the remaining folds as the training set
### 3. Train on the training set and record the result on the test set
### 4. Repeat steps 2 and 3, using each fold in turn as the test set
### 5. Calculate the average and the standard deviation of all the folds’ test results

In [13]:
from sklearn import datasets

In [14]:
# cross_val_score lives in sklearn.model_selection; the older
# sklearn.cross_validation module was removed in scikit-learn 0.20.
from sklearn.model_selection import cross_val_score



In [15]:
# Load the iris dataset that ships with scikit-learn.
iris_data = datasets.load_iris()

In [16]:
# Print the loaded dataset object in full (every array it holds is written
# out, so the output is long).
print(iris_data)

{'target_names': array(['setosa', 'versicolor', 'virginica'], 
      dtype='|S10'), 'data': array([[ 5.1,  3.5,  1.4,  0.2],
       [ 4.9,  3. ,  1.4,  0.2],
       [ 4.7,  3.2,  1.3,  0.2],
       [ 4.6,  3.1,  1.5,  0.2],
       [ 5. ,  3.6,  1.4,  0.2],
       [ 5.4,  3.9,  1.7,  0.4],
       [ 4.6,  3.4,  1.4,  0.3],
       [ 5. ,  3.4,  1.5,  0.2],
       [ 4.4,  2.9,  1.4,  0.2],
       [ 4.9,  3.1,  1.5,  0.1],
       [ 5.4,  3.7,  1.5,  0.2],
       [ 4.8,  3.4,  1.6,  0.2],
       [ 4.8,  3. ,  1.4,  0.1],
       [ 4.3,  3. ,  1.1,  0.1],
       [ 5.8,  4. ,  1.2,  0.2],
       [ 5.7,  4.4,  1.5,  0.4],
       [ 5.4,  3.9,  1.3,  0.4],
       [ 5.1,  3.5,  1.4,  0.3],
       [ 5.7,  3.8,  1.7,  0.3],
       [ 5.1,  3.8,  1.5,  0.3],
       [ 5.4,  3.4,  1.7,  0.2],
       [ 5.1,  3.7,  1.5,  0.4],
       [ 4.6,  3.6,  1. ,  0.2],
       [ 5.1,  3.3,  1.7,  0.5],
       [ 4.8,  3.4,  1.9,  0.2],
       [ 5. ,  3. ,  1.6,  0.2],
       [ 5. ,  3.4,  1.6,  0.4],
       [ 5.2,  3.

In [17]:
# Split the loaded dataset into model inputs (features) and outputs (targets).
data_input, data_output = iris_data.data, iris_data.target

In [18]:
from sklearn.cross_validation import KFold

In [19]:
kf = KFold(10, n_folds = 5, shuffle=True) #5 fold CV 

In [20]:
for train_set,test_set in kf:
    print(train_set, test_set)

(array([0, 2, 3, 4, 5, 6, 8, 9]), array([1, 7]))
(array([0, 1, 4, 5, 6, 7, 8, 9]), array([2, 3]))
(array([0, 1, 2, 3, 4, 6, 7, 8]), array([5, 9]))
(array([0, 1, 2, 3, 4, 5, 7, 9]), array([6, 8]))
(array([1, 2, 3, 5, 6, 7, 8, 9]), array([0, 4]))
