In [7]:
!pip install modin
!pip install ray
!pip install dask

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [10]:
#Download dataset from Kaggle
!pip install -q kaggle
!mkdir ~/.kaggle
!cp kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json
!kaggle datasets download -d hm-land-registry/uk-housing-prices-paid
!unzip uk-housing-prices-paid.zip

mkdir: cannot create directory ‘/root/.kaggle’: File exists
uk-housing-prices-paid.zip: Skipping, found more recently modified local copy (use --force to force download)
Archive:  uk-housing-prices-paid.zip
  inflating: price_paid_records.csv  


# **Read CSV with Pandas, Dask, Modin/Ray**

In [5]:
import pandas as pd
import dask.dataframe as dd
import modin.pandas as mpd
import ray
import time

In [11]:
# Pandas to read the CSV
# perf_counter() is a monotonic, high-resolution clock meant for measuring
# elapsed time; time.time() can jump if the system clock is adjusted.

start = time.perf_counter()
df = pd.read_csv('/content/price_paid_records.csv')
end = time.perf_counter()
print('Time to read the CSV (pandas): ', end - start, 'seconds')

Time to read the CSV (pandas):  49.24463200569153 seconds


In [None]:
# Modin/Ray to read the CSV

# Restart Ray *before* starting the clock so the measurement covers only the
# CSV read, not the shutdown/startup of the Ray runtime.
ray.shutdown()
ray.init()

start = time.perf_counter()
# The resulting frame is intentionally not kept; only the read time matters here.
mpd.read_csv('/content/price_paid_records.csv')
end = time.perf_counter()
print('Time to read the CSV (modin/ray): ', end - start, 'seconds')

[2m[36m(deploy_ray_func pid=1619)[0m tcmalloc: large alloc 1202847744 bytes == 0x4b12000 @  0x7f58c76331e7 0x4a3940 0x5b438c 0x5d0ccd 0x5939af 0x516337 0x549576 0x593fce 0x548ae9 0x51566f 0x549576 0x4bca8a 0x5134a6 0x549576 0x4bca8a 0x5134a6 0x4bc98a 0x7f58c430ce02 0x7f58c43a0db6 0x7f58c4313306 0x7f58c44876ab 0x7f58c43e888f 0x7f58c44af7d3 0x7f58c44b072a 0x7f58c44c218e 0x7f58c449b530 0x7f58c46bff06 0x7f58c466ca3e 0x7f58c466cc96 0x7f58c4b03cab 0x7f58c4b04ee1
[2m[36m(deploy_ray_func pid=1620)[0m tcmalloc: large alloc 1202847744 bytes == 0x44a2000 @  0x7f3db72661e7 0x4a3940 0x5b438c 0x5d0ccd 0x5939af 0x516337 0x549576 0x593fce 0x548ae9 0x51566f 0x549576 0x4bca8a 0x5134a6 0x549576 0x4bca8a 0x5134a6 0x4bc98a 0x7f3db3f3fe02 0x7f3db3fd3db6 0x7f3db3f46306 0x7f3db40ba6ab 0x7f3db401b88f 0x7f3db40e27d3 0x7f3db40e372a 0x7f3db40f518e 0x7f3db40ce530 0x7f3db42f2f06 0x7f3db429fa3e 0x7f3db429fc96 0x7f3db4736cab 0x7f3db4737ee1


In [None]:
# Dask to read the CSV

# dd.read_csv() is lazy: on its own it only builds a task graph and returns
# almost immediately. len() forces every partition to be read and parsed, so
# the measured time reflects an actual pass over the file.
start = time.perf_counter()
len(dd.read_csv('/content/price_paid_records.csv'))
end = time.perf_counter()
print('Time to read the CSV (dask): ', end - start, 'seconds')

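**Dask reported the least time for reading the CSV file. Note, however, that `dd.read_csv` is lazy: unless a computation is forced (for example with `len(...)` or `.compute()`), the measured time covers only building the task graph, not parsing the file.**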


# **Clean the Column Names**

In [None]:
# Remove spaces and special chars from the cols

# Define `data` here (a lazily-loaded Dask frame of the raw CSV) so that this
# cell runs on a fresh kernel instead of relying on a later cell.
data = dd.read_csv('/content/price_paid_records.csv')

data.columns = data.columns.str.lower()
data.columns = data.columns.str.replace(' ', '', regex=False)
# Raw string: '\w' in a normal string literal is an invalid escape sequence.
data.columns = data.columns.str.replace(r'[^\w]', '_', regex=True)
print(data.columns)

# **Validation**



In [None]:
%%writefile utility.py
import yaml
import logging
import os
import subprocess
import pandas as pd
import re
import gc
import datetime

def read_config_file(filepath):
  """Load a YAML configuration file.

  Args:
    filepath: Path of the YAML file to read.

  Returns:
    The parsed YAML content, or None when the file is not valid YAML
    (the parse error is logged).
  """
  with open(filepath, 'r') as stream:
    try:
      return yaml.safe_load(stream)
    except yaml.YAMLError as exc:
      # The exception class is `YAMLError`; the previous spelling
      # (`YAMLerror`) raised AttributeError instead of handling the error.
      logging.error(exc)
      return None

def replacer(string, char):
    """Collapse consecutive repetitions of ``char`` in ``string`` into one.

    Args:
        string: Text to clean.
        char: Character (or substring) whose repeated runs are collapsed.
            It is matched literally, so regex metacharacters such as ``.``
            or ``|`` are safe to pass.

    Returns:
        ``string`` with every run of two or more ``char`` replaced by a
        single ``char``. An empty ``char`` leaves ``string`` unchanged.
    """
    if not char:
        return string
    # Escape `char` so it is treated as literal text, and group it so the
    # quantifier applies to the whole of `char`.
    pattern = '(?:' + re.escape(char) + '){2,}'
    # A callable replacement prevents backslashes in `char` from being
    # interpreted as escapes in the replacement template.
    return re.sub(pattern, lambda _match: char, string)

def col_header_val(df, table_config):
  """Validate a DataFrame's column headers against the configured columns.

  The column names of ``df`` are normalised in place (lower-cased, non-word
  characters replaced by ``_``, leading/trailing ``_`` stripped, runs of
  ``_`` collapsed) and then compared, ignoring order, with the lower-cased
  names in ``table_config['columns']``.

  Args:
    df: DataFrame with string column names. Its ``columns`` attribute is
      overwritten with the normalised names.
    table_config: Mapping with a ``'columns'`` list of expected names.

  Returns:
    1 if the normalised names match the expected names exactly, 0 otherwise.
  """
  df.columns = (
      df.columns.str.lower()
      .str.replace(r'[^\w]', '_', regex=True)
      .str.strip('_')
      .str.replace(r'_{2,}', '_', regex=True)
  )
  expected_col = sorted(col.lower() for col in table_config['columns'])
  # Sort only the names. Reindexing the whole frame to sort its columns
  # copies the data and raises when two normalised names collide.
  actual_col = sorted(df.columns)
  if actual_col == expected_col:
    print("Column name and column length validation passed")
    return 1
  print("Column name and column length validation failed")
  mismatched_columns_file = sorted(set(actual_col).difference(expected_col))
  print("Following file columns are not in the YAML file", mismatched_columns_file)
  missing_YAML_file = sorted(set(expected_col).difference(actual_col))
  print("Following YAML columns are not in the file uploaded", missing_YAML_file)
  logging.info(f'df columns: {actual_col}')
  logging.info(f'expected columns: {expected_col}')
  return 0

In [None]:
%%writefile store.yaml
# Settings for the price-paid CSV; loaded in the notebook via read_config_file().
# file_name + "." + file_type forms the name of the source file that is read.
file_type: csv
dataset_name: testfile
file_name: price_paid_records
table_name: endsurv
# Field separator used when reading the source file.
inbound_delimiter: ','
# NOTE(review): outbound_delimiter and skip_leading_rows are not read by any
# code visible in this notebook - confirm whether they are still needed.
outbound_delimiter: '|'
skip_leading_rows: 1
# Expected column names (after normalisation); checked by col_header_val().
columns:
  - transaction_unique_identifier
  - price
  - date_of_transfer
  - property_type
  - old_new
  - duration
  - town_city
  - district
  - county
  - ppdcategory_type
  - record_status_monthly_file_only

In [None]:
# Read config file
# Parses store.yaml with the helper from utility.py and keeps the result in
# `config_data`, which the following cells index by key.

import utility as util
config_data = util.read_config_file("store.yaml")

In [None]:
# Show the delimiter configured for the inbound file
config_data['inbound_delimiter']

In [None]:
# Display the full parsed contents of the config file
config_data

In [None]:
# Read the file using the settings from the config file
file_type = config_data['file_type']
# Absolute path, matching the location the CSV is read from earlier in the
# notebook; the relative "content/..." form only resolves when the working
# directory is the filesystem root.
source_file = "/content/" + config_data['file_name'] + f'.{file_type}'
# The delimiter is passed by keyword: pandas 2 no longer accepts it positionally.
df = pd.read_csv(source_file, sep=config_data['inbound_delimiter'])
df.head()

In [None]:
# Validate the header of the file against the columns listed in the config.
# Note: col_header_val also overwrites df.columns with the normalised names.
util.col_header_val(df,config_data)

In [None]:
# Show the file's column names next to the names expected by the YAML config
print("Columns of files are:" , df.columns)
print("Columns of YAML are:" , config_data['columns'])

In [None]:
# Re-run the header validation and report the outcome.
# col_header_val returns 0 when the columns do not match the config.
if util.col_header_val(df, config_data) != 0:
    print("Column validation passed")
    print('Preview of the data\n', df.head())
else:
    print("Validation failed")
    print("Columns of the file does not match the YAML")

# **Save pipe separated file as .gz**

In [None]:
#csv to gz
# Write the frame as a gzip-compressed, pipe-delimited text file with every
# field quoted.

import gzip
import csv

# `lineterminator` is the current keyword name; the old spelling
# `line_terminator` was deprecated in pandas 1.5 and removed in pandas 2.0.
df.to_csv('dfgz.gz',
      sep='|',
      header=True,
      index=False,
      quoting=csv.QUOTE_ALL,
      compression='gzip',
      quotechar='"',
      doublequote=True,
      lineterminator='\n')

In [None]:
# Number of cols in the gz file
# Read only the header row (nrows=0) of the compressed, pipe-delimited file
# 'dfgz.gz' written by the previous cell. The file is a single gzip stream,
# not a directory, so it cannot be inspected with os.listdir().
entries = pd.read_csv('dfgz.gz', sep='|', compression='gzip', nrows=0).columns.tolist()
print('Number of columns in the gz file:', len(entries))
for entry in entries:
    print(entry)

# **Summary of the File**

In [None]:
#Size of the CSV file

import os
# Same location the CSV is read from in the earlier cells (no machine-specific path).
csvfile = '/content/price_paid_records.csv'
data = dd.read_csv(csvfile)
# getsize() returns an int; pass the unit to print() as a separate argument.
print('Size of the CSV file: ', os.path.getsize(csvfile), 'Bytes')

In [None]:
# Dimensions of the loaded frame: shape is (rows, columns)
print('Number of Rows: ', df.shape[0])
print('Number of columns:', df.shape[1])