# The objective of this notebook is to learn how to split data into train/valid/test datasets

### Method 1. Splitting the data randomly
1. Using Sklearn --> train_test_split
2. Using Fast_ml --> train_valid_test_split

### Method 2. Splitting the data using the temporal component
1. Custom Code
2. Using Fast_ml --> train_valid_test_split

In [None]:
!pip install fast_ml --upgrade

In [None]:
import os

import numpy as np
import pandas as pd

from fast_ml import eda
from fast_ml.utilities import display_all

# Let pandas display up to 1000 columns before truncating wide frames.
pd.set_option('display.max_columns', 1000)

# Uncomment to list every file available under the Kaggle input directory.
# Kept as `#` comments rather than a bare triple-quoted string: a string that
# is the last expression of a cell is echoed back as the cell's output.
# for dirname, _, filenames in os.walk('/kaggle/input'):
#     for filename in filenames:
#         print(os.path.join(dirname, filename))

In [None]:
# Load the combined train+validation CSV from the Kaggle input directory.
# `saledate` is parsed into a datetime column at read time; low_memory=False
# is passed so the file is not type-inferred chunk by chunk.
df = pd.read_csv(
    '/kaggle/input/bluebook-for-bulldozers/TrainAndValid.csv',
    parse_dates=['saledate'],
    low_memory=False,
)

# Last expression of the cell: shows (n_rows, n_columns) of the loaded frame.
df.shape

In [None]:
# Run fast_ml's eda.df_info on the loaded frame and render the result with
# display_all.
# NOTE(review): presumably df_info returns a per-column overview and
# display_all shows it without truncation -- confirm against the fast_ml docs.
df_summary = eda.df_info(df)
display_all(df_summary)

In [None]:
# Preview the first rows of the loaded frame (rendered as the cell output).
df.head()

# Method 1. Splitting the data randomly

## i. Using Sklearn --> train_test_split

In [None]:
from sklearn.model_selection import train_test_split

# Goal: an 80:10:10 train:valid:test split of the whole dataset.
train_size = 0.8

# Fixed seed so both random splits below are reproducible across runs.
random_state = 42

X = df.drop(columns=['SalePrice']).copy()
y = df['SalePrice']

# Step 1: carve off the training set; everything else goes into a
# "remaining" pool that is split again below.
X_train, X_rem, y_train, y_rem = train_test_split(
    X, y, train_size=train_size, random_state=random_state
)

# Step 2: valid and test should each be 10% of the overall data, i.e. half of
# the remaining 20%, so the remaining pool is split with test_size=0.5.
test_size = 0.5
X_valid, X_test, y_valid, y_test = train_test_split(
    X_rem, y_rem, test_size=test_size, random_state=random_state
)

# Sanity check: the three parts together account for every row.
assert len(X_train) + len(X_valid) + len(X_test) == len(X)

# One print call per split (sep='\n' keeps each shape on its own line).
# Writing `print(a), print(b)` would build a tuple and make the cell render a
# spurious `(None, None)` as its output.
print(X_train.shape, y_train.shape, sep='\n')
print(X_valid.shape, y_valid.shape, sep='\n')
print(X_test.shape, y_test.shape, sep='\n')

## ii. Using Fast_ml --> train_valid_test_split

In [None]:
from fast_ml.model_development import train_valid_test_split

# A single call produces all three splits. Note the return order: the X/y pair
# for train, then valid, then test (unlike sklearn's X..., y... ordering).
# NOTE(review): no seed is passed here, so the random split differs from run
# to run -- check whether train_valid_test_split accepts a random_state.
X_train, y_train, X_valid, y_valid, X_test, y_test = train_valid_test_split(
    df,
    target='SalePrice',
    train_size=0.8,
    valid_size=0.1,
    test_size=0.1,
)

# One print call per split (sep='\n' keeps each shape on its own line).
# Writing `print(a), print(b)` would build a tuple and make the cell render a
# spurious `(None, None)` as its output.
print(X_train.shape, y_train.shape, sep='\n')
print(X_valid.shape, y_valid.shape, sep='\n')
print(X_test.shape, y_test.shape, sep='\n')

# Method 2. Splitting the data using the temporal component

Now suppose you want the split to respect the sale-date ordering. Putting the most recent sales in the test set mimics production behavior: the model is trained on historical data and then used to predict on future dates.

## i. Custom Code

In [None]:
train_size = 0.8
valid_size = 0.1

# Sort into a NEW frame instead of sorting `df` in place: the shared `df`
# stays untouched, so this cell is idempotent and other cells that read `df`
# see the same data regardless of execution order.
df_sorted = df.sort_values(by='saledate', ascending=True)

# Row counts for the train and valid parts; test receives whatever is left.
train_index = int(len(df_sorted) * train_size)
valid_index = int(len(df_sorted) * valid_size)

# Oldest rows -> train; the more recent remainder is split into valid/test.
df_train = df_sorted.iloc[:train_index]
df_rem = df_sorted.iloc[train_index:]

df_valid = df_rem.iloc[:valid_index]
df_test = df_rem.iloc[valid_index:]

# Sanity check: the three parts together account for every row.
assert len(df_train) + len(df_valid) + len(df_test) == len(df_sorted)

X_train, y_train = df_train.drop(columns='SalePrice').copy(), df_train['SalePrice'].copy()
X_valid, y_valid = df_valid.drop(columns='SalePrice').copy(), df_valid['SalePrice'].copy()
X_test, y_test = df_test.drop(columns='SalePrice').copy(), df_test['SalePrice'].copy()

# One print call per split (sep='\n' keeps each shape on its own line).
# Writing `print(a), print(b)` would build a tuple and make the cell render a
# spurious `(None, None)` as its output.
print(X_train.shape, y_train.shape, sep='\n')
print(X_valid.shape, y_valid.shape, sep='\n')
print(X_test.shape, y_test.shape, sep='\n')

## ii. Using Fast_ml --> train_valid_test_split

In [None]:
# Same helper as in the random split, now called with method='sorted' and
# sort_by_col='saledate' to request a split ordered by sale date.
# Relies on train_valid_test_split having been imported in an earlier cell.
X_train, y_train, X_valid, y_valid, X_test, y_test = train_valid_test_split(
    df,
    target='SalePrice',
    method='sorted',
    sort_by_col='saledate',
    train_size=0.8,
    valid_size=0.1,
    test_size=0.1,
)

# One print call per split (sep='\n' keeps each shape on its own line).
# Writing `print(a), print(b)` would build a tuple and make the cell render a
# spurious `(None, None)` as its output.
print(X_train.shape, y_train.shape, sep='\n')
print(X_valid.shape, y_valid.shape, sep='\n')
print(X_test.shape, y_test.shape, sep='\n')