# Splitting a Dataset

In [60]:
import os
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer

# for splitting a dataset into training and testing sets
from sklearn.model_selection import train_test_split

#for cross-validation
from sklearn.model_selection import cross_val_score

# randomised KFold in cross-validation
from sklearn.model_selection import KFold

#machine learning model for Cross-Validation
from sklearn.linear_model import LogisticRegression

In [61]:
# Locate data.csv inside the current working directory.
pwd = os.getcwd()
data = os.path.join(pwd, "data.csv")

# Read the CSV into a DataFrame and preview its first three rows.
df = pd.read_csv(filepath_or_buffer=data)
df.head(n=3)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S


### Feature and target selection

In [62]:
# Model inputs: the Pclass, Sex and Fare columns.
features = df.loc[:, ["Pclass", "Sex", "Fare"]]
# Label: the Survived column, kept as a single-column DataFrame.
target = df.loc[:, ["Survived"]]

# NumPy array version of the label frame.
target_np = target.to_numpy()

# Splitting Dataset

- There are two ways to split a dataset:
    - splitting a dataset into 2 (see Method 1)
    - splitting a dataset by k-fold (see Method 2)
- Both methods are used throughout machine learning; neither is better than the other, the choice is situational


## Method 1: train_test_split
- a simple splitting method to divide a dataset into 2 by proportion
- by default, a dataset is divided into 0.75 (X_train, y_train) and 0.25 (X_test, y_test)
- use argument test_size= *float or int* for changing the proportion
- use argument stratify=target to make sure the target classes are split proportionally across both datasets, especially when one class is much larger than the other
- remember that the test set (i.e. X_test, y_test) should always be reserved as an unseen dataset for the final evaluation of the model.

In [126]:
# Single split with the default proportions, stratified on the label.
X_train, X_test, y_train, y_test = train_test_split(
    features,
    target,
    stratify=target,
    random_state=42,
)

# Re-attach the labels to the training features by row index for inspection.
X_train_y_train = X_train.merge(y_train, left_index=True, right_index=True)
X_train_y_train.head()

Unnamed: 0,Pclass,Sex,Fare,Survived
486,1,female,90.0,1
238,2,male,10.5,0
722,2,male,13.0,0
184,3,female,22.025,1
56,2,female,10.5,1


- Keep in mind that a given dataset for machine learning should be divided into three datasets, i.e.:
    a. training set
    b. validation set
    c. test set
    
- reasons:
    - use a. training set to build the model
    - use b. validation set to select the parameters of the model
    - use c. test set to evaluate the performance of the selected parameters (confusion matrix and all the good stuff)
    - finally, retrain on a. training set plus b. validation set with the selected parameters, then evaluate on c. test set (uses as much data as possible)

- While some tools (e.g. GridSearchCV) remove the need to split a dataset three times manually, be aware that which dataset was used at each stage of model development can affect the final result.

- always reserve X_test and y_test for evaluation, because the next set of data passed to .predict will be truly unseen data, so the model had better be good!

- the following code explicitly divided a dataset into 3 sets. 

In [131]:
# split the whole dataset into trainval set and test set
# First cut: hold out the test set from the whole dataset, stratified on the label.
X_trainval, X_test, y_trainval, y_test = train_test_split(
    features, target, stratify=target, random_state=0
)

# Second cut: carve the validation set out of the remaining train+validation rows.
X_train, X_valid, y_train, y_valid = train_test_split(
    X_trainval, y_trainval, stratify=y_trainval, random_state=1
)

## Method 2: Cross-Validation (CV)
- regarding cv, read here: https://towardsdatascience.com/why-and-how-to-cross-validate-a-model-d6424b45261f
- a model is required, hence preprocessing is needed for non-numeric values
- splitting of the dataset may or may not be randomised (e.g. use KFold with shuffle=True to randomise)

### Some of the benefits:
- X and y are thoroughly tested across k folds, giving a more reliable picture of how well a model generalises.
- Utilisation of a dataset: train\_test\_split divides a dataset once (75% / 25% by default) for training and evaluation, whereas cross-validation divides a dataset into several subsets for training and testing (k-fold, as in cv=5).
- Cross-validation produces a range of scores, indicating the performance of a model in its best- and worst-case scenarios on new data.

### Drawback:
- Computational cost: k models need to be trained instead of a single model.
	- Personal opinion: given the speed and efficiency of modern CPUs and relatively manageable data sizes (thousands of rows, a few dozen columns at most), this drawback should be manageable.

### Caution:
- Cross-validation is not a way to build a model that can be applied to new data.
	- It does not return a model
	- Use it for evaluating how well a given algorithm will generalise when trained on a specific dataset.

In [117]:
# instantiate a model
logreg = LogisticRegression()

# transformer: one-hot encode the categorical columns and standardise Fare.
# OneHotEncoder's `sparse` keyword was renamed `sparse_output` in scikit-learn 1.2
# and removed in 1.4, so neither spelling works on every version. Dense output is
# instead forced at the ColumnTransformer level with sparse_threshold=0.
ct = ColumnTransformer(
    [
        ("onehot", OneHotEncoder(), ["Pclass", "Sex"]),
        ("scaling", StandardScaler(), ["Fare"]),
    ],
    sparse_threshold=0,
)

# .fit_transform on the full feature set (despite the variable name, not only X_train)
X_train_fit_trans = ct.fit_transform(features)

# NOTE: the full dataset is used here for simplicity. In a real-world situation a
# test set (X_test, y_test) should be reserved for testing and the transformer
# fitted on the training data only:
# X_train_fit_trans = ct.fit_transform(X_train)

## Standard KFold - cross_val_score



In [92]:
# 5-fold cross-validation scores of the logistic regression on the transformed features.
score = cross_val_score(
    estimator=logreg,
    X=X_train_fit_trans,
    y=np.ravel(target),
    cv=5,
)
score

array([0.79329609, 0.80337079, 0.76966292, 0.75842697, 0.78651685])

In [87]:
# Average of the fold scores.
np.mean(score)

0.7822547234950725

## Standard KFold - cross_validate

In [96]:
from sklearn.model_selection import cross_validate

# Same 5-fold evaluation, additionally returning the training-fold scores.
res = cross_validate(
    estimator=logreg,
    X=X_train_fit_trans,
    y=np.ravel(target),
    cv=5,
    return_train_score=True,
)
# Tabulate the per-fold results for display.
res_df = pd.DataFrame(data=res)
display(res_df)

Unnamed: 0,fit_time,score_time,test_score,train_score
0,0.006605,0.000365,0.793296,0.779494
1,0.006318,0.000294,0.803371,0.785414
2,0.004639,0.000372,0.769663,0.785414
3,0.003547,0.000294,0.758427,0.796634
4,0.006408,0.000403,0.786517,0.786816


In [99]:
# Average of the test-fold scores.
np.mean(res["test_score"])

0.7822547234950725

## Stratified k-Fold Cross-Validation

In [101]:
from sklearn.model_selection import StratifiedKFold

# Unshuffled, stratified 5-fold splitter.
kf = StratifiedKFold(n_splits=5, shuffle=False, random_state=None)

# Score the model with the explicit splitter and report the average.
score_kf = cross_val_score(
    estimator=logreg,
    X=X_train_fit_trans,
    y=np.ravel(target),
    cv=kf,
)
np.mean(score_kf)

0.7822547234950725

## Randomise CV with KFold

In [68]:
# Shuffled 5-fold splitter; the fixed seed keeps the fold assignment repeatable.
kfold = KFold(
    n_splits=5,
    random_state=42,
    shuffle=True,
)

In [69]:
# Score on the transformed feature matrix with the shuffled KFold splitter.
# The matrix is named X_train_fit_trans in this notebook; `X_fit_trans` is never
# defined and would raise NameError on a fresh kernel.
score_kfold = cross_val_score(logreg, X_train_fit_trans, np.ravel(target), cv=kfold)
score_kfold

array([0.78212291, 0.76404494, 0.83707865, 0.74719101, 0.80337079])

In [70]:
# Average of the shuffled-KFold scores.
np.mean(score_kfold)

0.786761659657272

## Leave-one-out cross-validation

In [106]:
from sklearn.model_selection import LeaveOneOut

# Leave-one-out splitter passed to cross_val_score as the cv strategy.
loo = LeaveOneOut()
score_loo = cross_val_score(
    estimator=logreg,
    X=X_train_fit_trans,
    y=np.ravel(target),
    cv=loo,
)

# Number of scores returned.
len(score_loo)

891

In [104]:
# Average of the leave-one-out scores.
np.mean(score_loo)

0.7833894500561167

## Shuffle-split cross-validation

In [110]:
from sklearn.model_selection import ShuffleSplit

# 10 random 50/50 splits. random_state pins the shuffling so that the scores are
# reproducible after a kernel restart instead of changing on every run.
shuffle_split = ShuffleSplit(n_splits=10, test_size=.5, train_size=.5, random_state=42)
scores = cross_val_score(logreg, X_train_fit_trans, np.ravel(target), cv=shuffle_split)
scores

array([0.76681614, 0.75784753, 0.77802691, 0.79596413, 0.80493274,
       0.76233184, 0.78699552, 0.77130045, 0.77578475, 0.78475336])

In [111]:
# Average of the shuffle-split scores.
np.mean(scores)

0.7784753363228699