# Train Test Split

Snippets to split a dataset into its training and testing components based on some ratio.

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split

## Read the CSV file into a DataFrame

In [2]:
# Load the phone-price dataset from the working directory into a DataFrame.
df = pd.read_csv(filepath_or_buffer="phoneprices.csv")

# Bare last expression so the notebook renders the frame as a table.
df

Unnamed: 0,battery_power,blue,clock_speed,dual_sim,fc,four_g,int_memory,m_dep,mobile_wt,n_cores,...,px_height,px_width,ram,sc_h,sc_w,talk_time,three_g,touch_screen,wifi,price_range
0,842,0,2.2,0,1,0,7,0.6,188,2,...,20,756,2549,9,7,19,0,0,1,1
1,1021,1,0.5,1,0,1,53,0.7,136,3,...,905,1988,2631,17,3,7,1,1,0,2
2,563,1,0.5,1,2,1,41,0.9,145,5,...,1263,1716,2603,11,2,9,1,1,0,2
3,615,1,2.5,0,0,0,10,0.8,131,6,...,1216,1786,2769,16,8,11,1,0,0,2
4,1821,1,1.2,0,13,1,44,0.6,141,2,...,1208,1212,1411,8,2,15,1,1,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1995,794,1,0.5,1,0,1,2,0.8,106,6,...,1222,1890,668,13,4,19,1,1,0,0
1996,1965,1,2.6,1,0,0,39,0.2,187,4,...,915,1965,2032,11,10,16,1,1,1,2
1997,1911,0,0.9,1,1,1,36,0.7,108,8,...,868,1632,3057,9,1,5,1,1,0,3
1998,1512,0,0.9,0,4,1,46,0.1,145,5,...,336,670,869,18,10,19,1,1,1,0


### Extracting a sub-data frame based on specific columns

Pass a list of column names inside `df[...]` to get a new data frame containing only those columns.

In [3]:
# Names of the columns to keep, in the order they should appear in the result.
specific_columns = [
    'battery_power',
    'dual_sim',
    'int_memory',
    'price_range',
]

# Indexing with a list of names selects just those columns.
df_sub = df[specific_columns]

df_sub

Unnamed: 0,battery_power,dual_sim,int_memory,price_range
0,842,0,7,1
1,1021,1,53,2
2,563,1,41,2
3,615,0,10,2
4,1821,0,44,1
...,...,...,...,...
1995,794,1,2,0
1996,1965,1,39,2
1997,1911,1,36,3
1998,1512,0,46,0


### Basic Filtering

Example: Getting all rows where `price_range == 1`

In [4]:
# Boolean mask: True for every row whose price_range equals 1.
condition = df['price_range'].eq(1)

# Indexing with the mask keeps only the rows flagged True.
df_phone_prices_1 = df[condition]

df_phone_prices_1

Unnamed: 0,battery_power,blue,clock_speed,dual_sim,fc,four_g,int_memory,m_dep,mobile_wt,n_cores,...,px_height,px_width,ram,sc_h,sc_w,talk_time,three_g,touch_screen,wifi,price_range
0,842,0,2.2,0,1,0,7,0.6,188,2,...,20,756,2549,9,7,19,0,0,1,1
4,1821,1,1.2,0,13,1,44,0.6,141,2,...,1208,1212,1411,8,2,15,1,1,0,1
5,1859,0,0.5,1,3,0,22,0.7,164,1,...,1004,1654,1067,17,1,10,1,0,0,1
12,1815,0,2.8,0,2,0,33,0.6,159,4,...,607,748,1482,18,0,2,1,0,0,1
18,1131,1,0.5,1,11,0,49,0.6,101,5,...,658,878,1835,19,13,16,1,1,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1981,1454,0,2.6,0,8,0,6,0.4,199,3,...,698,1018,1300,10,0,2,0,0,1,1
1984,797,0,2.2,1,0,0,37,0.9,144,7,...,206,1167,2216,9,5,6,1,0,0,1
1986,1139,1,0.9,1,6,1,58,0.5,161,2,...,742,999,1850,9,4,8,1,0,0,1
1988,1547,1,2.9,0,2,0,57,0.4,114,1,...,347,957,1620,9,2,19,0,1,1,1


## Extracting Data

### Split Data into `x` (data without labels) and `y` (labels)

* `y` is assumed to be a one-dimensional column of class labels. To turn it into one-hot encoded values, we can use the `get_dummies` function of pandas (https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.get_dummies.html)

In [5]:
# Features: every column except the label.
# The label is dropped by name rather than by position (`iloc[:, :-1]`), so the
# selection stays correct even if `price_range` is not the last column, and it
# matches how the label itself is selected (`df['price_range']`).
x = df.drop(columns=['price_range'])

x

Unnamed: 0,battery_power,blue,clock_speed,dual_sim,fc,four_g,int_memory,m_dep,mobile_wt,n_cores,pc,px_height,px_width,ram,sc_h,sc_w,talk_time,three_g,touch_screen,wifi
0,842,0,2.2,0,1,0,7,0.6,188,2,2,20,756,2549,9,7,19,0,0,1
1,1021,1,0.5,1,0,1,53,0.7,136,3,6,905,1988,2631,17,3,7,1,1,0
2,563,1,0.5,1,2,1,41,0.9,145,5,6,1263,1716,2603,11,2,9,1,1,0
3,615,1,2.5,0,0,0,10,0.8,131,6,9,1216,1786,2769,16,8,11,1,0,0
4,1821,1,1.2,0,13,1,44,0.6,141,2,14,1208,1212,1411,8,2,15,1,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1995,794,1,0.5,1,0,1,2,0.8,106,6,14,1222,1890,668,13,4,19,1,1,0
1996,1965,1,2.6,1,0,0,39,0.2,187,4,3,915,1965,2032,11,10,16,1,1,1
1997,1911,0,0.9,1,1,1,36,0.7,108,8,3,868,1632,3057,9,1,5,1,1,0
1998,1512,0,0.9,0,4,1,46,0.1,145,5,5,336,670,869,18,10,19,1,1,1


In [6]:
# Labels: one-hot encode `price_range`, producing one indicator column per
# distinct class value.
# The dtype is pinned because the default of `get_dummies` changed from uint8 to
# bool in pandas 2.0; without it, newer pandas would display and save
# True/False instead of 0/1.
y = pd.get_dummies(df['price_range'], dtype='uint8')

y

Unnamed: 0,0,1,2,3
0,0,1,0,0
1,0,0,1,0
2,0,0,1,0
3,0,0,1,0
4,0,1,0,0
...,...,...,...,...
1995,1,0,0,0
1996,0,0,1,0
1997,0,0,0,1
1998,1,0,0,0


### Perform Train Test Split where `ratio` is the fraction of rows used for training

Use `sklearn`'s `train_test_split` function. https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.train_test_split.html

In [7]:
# Fraction of the rows that goes into the training set.
ratio = 0.8
# Complementary fraction, kept for reference only. It is not passed to sklearn
# because the subtraction adds floating-point error (e.g. 1 - 0.7 evaluates to
# 0.30000000000000004), which can shift the test set by one row once the
# fraction is converted to a row count.
test_size = 1 - ratio

# Give sklearn the training fraction directly; the test set is the remainder.
# random_state fixes the shuffle so the split is reproducible across runs.
x_train, x_test, y_train, y_test = train_test_split(x, y, train_size=ratio, random_state=0)

### Save Train Test Output as CSV Files

Use pandas `to_csv('filename.csv', index=False)`

In [8]:
# Map each output file to the split it stores (written in this order).
split_files = {
    'x_train.csv': x_train,
    'x_test.csv': x_test,
    'y_train.csv': y_train,
    'y_test.csv': y_test,
}

# index=False leaves the row index out of the written files.
for csv_name, split in split_files.items():
    split.to_csv(csv_name, index=False)