# ETL sample files

This sample ETL is meant to be run only once. Its purpose is to create smaller versions of the original files so that a copy can be persisted in the main repository. By design, 500 listings are randomly selected, and every sample file is restricted to the rows belonging to those listings.

In [1]:
### Import modules

import os
import warnings

import pandas as pd

# Silence every warning for the rest of the kernel session.
# NOTE(review): this blanket filter also hides any warnings pandas emits while
# reading the CSVs below — consider narrowing it to specific categories.
warnings.filterwarnings('ignore')

In [2]:
### Constants

# Sampling parameters: number of listings to draw and the seed that makes
# the draw reproducible.
SAMPLES = 500
RANDOM_SEED = 888

# Folder holding the full original CSV files (read from) and folder that
# receives the reduced sample CSV files (written to).
CSV_PATH_ORIGIN = "./OriginalData_csv/"
CSV_PATH_TARGET = "./OriginalData_csv_sample/"

In [3]:
### Load the full original files (sampling happens in a later step)

# Listing-level files
df_listings = pd.read_csv(f'{CSV_PATH_ORIGIN}listings.csv')
df_listings_2 = pd.read_csv(f'{CSV_PATH_ORIGIN}listings-2.csv')

# Files that reference listings through a 'listing_id' column
df_calendar = pd.read_csv(f'{CSV_PATH_ORIGIN}calendar.csv')
df_reviews = pd.read_csv(f'{CSV_PATH_ORIGIN}reviews.csv')
df_reviews_2 = pd.read_csv(f'{CSV_PATH_ORIGIN}reviews-2.csv')

In [4]:
### Extract random sample from listings and compute sample dataframes

# Cap the request at the number of available listings: DataFrame.sample raises
# a ValueError when asked for more rows than exist (sampling without
# replacement). With at least SAMPLES listings the draw is unchanged.
n_listings = min(SAMPLES, len(df_listings))

# Series with the 'id' of each randomly selected listing; the fixed seed makes
# the selection reproducible across runs.
srs_sample = df_listings.sample(n_listings, random_state = RANDOM_SEED)['id']

# Compute sample dataframes: keep only the rows that belong to the sampled
# listings. Listing files are filtered on 'id', calendar and review files on
# 'listing_id'.
df_calendar_sample = df_calendar[df_calendar['listing_id'].isin(srs_sample)]
df_listings_sample = df_listings[df_listings['id'].isin(srs_sample)]
df_listings_2_sample = df_listings_2[df_listings_2['id'].isin(srs_sample)]
df_reviews_sample = df_reviews[df_reviews['listing_id'].isin(srs_sample)]
df_reviews_2_sample = df_reviews_2[df_reviews_2['listing_id'].isin(srs_sample)]

In [5]:
### Export files

# Create the target folder first: to_csv fails with an OSError when the parent
# directory does not exist. exist_ok makes this a no-op on re-runs.
os.makedirs(CSV_PATH_TARGET, exist_ok = True)

# Output file name -> sample dataframe written under that name
samples_by_file = {
    'calendar_sample.csv': df_calendar_sample,
    'listings_sample.csv': df_listings_sample,
    'listings-2_sample.csv': df_listings_2_sample,
    'reviews_sample.csv': df_reviews_sample,
    'reviews-2_sample.csv': df_reviews_2_sample,
}

for file_name, df_sample in samples_by_file.items():
    # index = False keeps the pandas row index out of the exported CSV
    df_sample.to_csv(os.path.join(CSV_PATH_TARGET, file_name), index = False)