# Train-validation-test split

In [7]:
# larger datasets (hundreds of thousands and millions of instances) should have a split ratio of 98%/1%/1% for each set,
# For a smaller dataset, the conventional split ratio is 60%/20%/20%

In [3]:
# importing dataset
import pandas as pd 

data = pd.read_csv(r'C:/Users/excel/Downloads/appliances+energy+prediction/energydata_complete.csv')
print(data.head)

<bound method NDFrame.head of                       date  Appliances  lights         T1       RH_1  \
0      2016-01-11 17:00:00          60      30  19.890000  47.596667   
1      2016-01-11 17:10:00          60      30  19.890000  46.693333   
2      2016-01-11 17:20:00          50      30  19.890000  46.300000   
3      2016-01-11 17:30:00          50      40  19.890000  46.066667   
4      2016-01-11 17:40:00          60      40  19.890000  46.333333   
...                    ...         ...     ...        ...        ...   
19730  2016-05-27 17:20:00         100       0  25.566667  46.560000   
19731  2016-05-27 17:30:00          90       0  25.500000  46.500000   
19732  2016-05-27 17:40:00         270      10  25.500000  46.596667   
19733  2016-05-27 17:50:00         420      10  25.500000  46.990000   
19734  2016-05-27 18:00:00         430      10  25.500000  46.600000   

              T2       RH_2         T3       RH_3         T4  ...         T9  \
0      19.200000  44.7900

# using indexing


In [5]:
# spliting dataset
X = data.iloc[:, 1:]
Y = data.iloc[:, 0]

In [6]:
## Print the shape of the dataset in order to determine the split ration to be used.

X.shape

## The output from this operation should be (19735, 28). This means that it is 
## possible to use a split ratio of 60%/20%/20% for the training, validation, and 
## testing sets.

(19735, 28)

In [8]:
## Get the value to use as the bottom bound of the training and validation sets. This will be used to split the dataset using indexing:

train_end = int(len(X) * 0.6) # This line calculates the index that marks the end of the training set.
# 600 samples

dev_end = int(len(X) * 0.8) # This line calculates the index that marks the end of the development (or validation) set.
# 601 - 800 samples

# the remaining 20% will be used for testing.

In [10]:
## Shuffle the dataset:
X_shuffle = X.sample(frac=1)
Y_shuffle = Y.sample(frac=1)

In [12]:
## Use indexing to split the shuffled dataset into the three sets, for both the features and the target data:

x_train = X_shuffle.iloc[:train_end,:]
y_train = Y_shuffle.iloc[:train_end]
x_dev = X_shuffle.iloc[train_end:dev_end,:]
y_dev = Y_shuffle.iloc[train_end:dev_end]
x_test = X_shuffle.iloc[dev_end:,:]
y_test = Y_shuffle.iloc[dev_end:]

In [13]:
## Print the shapes of all three sets:

print(x_train.shape, y_train.shape)
print(x_dev.shape, y_dev.shape)
print(x_test.shape, y_test.shape)

(11841, 28) (11841,)
(3947, 28) (3947,)
(3947, 28) (3947,)


# using scikit-learn's train_test_split() function

In [15]:
## Import the train_test_split() function from scikit-learn's model_selection module:

from sklearn.model_selection import train_test_split

In [17]:
## Split the shuffled dataset:

# splitting the total dataset into 80% and 20%(testing set)
x_new, x_test_2, y_new, y_test_2 = train_test_split(X_shuffle, Y_shuffle, 
test_size=0.2, random_state=0)

dev_per = x_test_2.shape[0]/x_new.shape[0]

# splitting the 80% of the total dataset into 80%(training set) and 20%(validation set)
x_train_2, x_dev_2, y_train_2, y_dev_2 = train_test_split(x_new, y_new, 
test_size=dev_per, random_state=0)

In [18]:
## Print the shape of all three sets:
 
print(x_train_2.shape, y_train_2.shape)
print(x_dev_2.shape, y_dev_2.shape)
print(x_test_2.shape, y_test_2.shape)

(11841, 28) (11841,)
(3947, 28) (3947,)
(3947, 28) (3947,)
