# Create a sample of the data to validate scripts  

As suggested by Jeremy Howard in the MOOC at <http://course.fast.ai>, it is often useful to make a small sample of the data on which you can try your scripts to catch any obvious failures before running them on the full data. For this to work easily, preserve the file names of the data files but use a different base directory (sample) instead of (input) when validating scripts.

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os,shutil # File Ops

In [None]:
# Load the three order-related tables from the ../input directory.
# The reads are independent of each other, so their order does not matter.
orders_df = pd.read_csv("../input/orders.csv")
order_products_prior_df = pd.read_csv("../input/order_products__prior.csv")
order_products_train_df = pd.read_csv("../input/order_products__train.csv")

In [None]:
# Preview the first rows of the orders table (5 is the pandas default for head()).
orders_df.head(n=5)

### Filter the orders for User 1..10

In [None]:
# Keep only the orders whose user_id is at most 10, then renumber the rows
# from 0 so the sample frame has a clean index.
is_sample_user = orders_df["user_id"] <= 10
sample_orders_df = orders_df.loc[is_sample_user].reset_index(drop=True)

### Print Unique User Ids in the set of sample orders


In [None]:
# Show the distinct user ids present in the sampled orders.
sample_orders_df["user_id"].unique()

### Number of orders in Sample orders

In [None]:
# Row count of the sampled orders (one row per order).
len(sample_orders_df)

### Create Sample Order_Products for both prior and train

In [None]:
# Keep only the train order-product rows that belong to a sampled order,
# then renumber the rows from 0.
in_sample_train = order_products_train_df["order_id"].isin(sample_orders_df["order_id"])
sample_order_products_train_df = order_products_train_df.loc[in_sample_train].reset_index(drop=True)

In [None]:
# Keep only the prior order-product rows that belong to a sampled order,
# then renumber the rows from 0.
in_sample_prior = order_products_prior_df["order_id"].isin(sample_orders_df["order_id"])
sample_order_products_prior_df = order_products_prior_df.loc[in_sample_prior].reset_index(drop=True)

### Create Sample Directory, copy metadata and data for sample orders

In [None]:
# Create the output directory for the sample files.
# exist_ok=True makes this cell safe to re-run and removes the
# check-then-create race of testing os.path.exists() first. If 'sample'
# already exists but is not a directory, this raises FileExistsError here
# rather than failing later when the sample files are written.
os.makedirs('sample', exist_ok=True)

In [None]:
# Copy the aisles, departments and products files into sample/ unchanged
# (they are not filtered), keeping their original file names.
# shutil.copy2 also attempts to preserve file metadata such as timestamps.
shutil.copy2('../input/aisles.csv','sample')
shutil.copy2('../input/departments.csv','sample')
shutil.copy2('../input/products.csv','sample')

In [None]:
# Write each sampled frame under sample/ using the same file name as its
# counterpart in ../input; index=False keeps the row index out of the CSV.
sample_outputs = (
    ('sample/orders.csv', sample_orders_df),
    ('sample/order_products__train.csv', sample_order_products_train_df),
    ('sample/order_products__prior.csv', sample_order_products_prior_df),
)
for csv_path, frame in sample_outputs:
    frame.to_csv(csv_path, index=False)

Now, if you use a BASE_DIR variable in your scripts, you can set it to 'sample' instead of '../input' and validate the scripts quickly instead of running them on the entire dataset.