In [None]:
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [None]:
from google.colab import files
files.upload()

In [None]:
!ls -lha kaggle.json

-rw-r--r-- 1 root root 64 Jul 14 04:48 kaggle.json


In [None]:
!pip install -q kaggle

In [None]:
!mkdir -p ~/.kaggle
!cp kaggle.json ~/.kaggle/

In [None]:
!chmod 600 /root/.kaggle/kaggle.json

In [None]:
!pwd

/content


In [None]:
!kaggle datasets list

ref                                                                 title                                                size  lastUpdated          downloadCount  voteCount  usabilityRating  
------------------------------------------------------------------  --------------------------------------------------  -----  -------------------  -------------  ---------  ---------------  
rabieelkharoua/students-performance-dataset                         📚 Students Performance Dataset 📚                     66KB  2024-06-12 23:09:20          13710        284  1.0              
nelgiriyewithana/most-streamed-spotify-songs-2024                   Most Streamed Spotify Songs 2024                    496KB  2024-06-15 18:50:51          10365        213  1.0              
ihelon/coffee-sales                                                 Coffee Sales                                         10KB  2024-07-03 20:04:43           2673         55  1.0              
tarktunataalt/2023-global-country-develo

In [None]:
!kaggle datasets download -d mohamedbakhet/amazon-books-reviews

Dataset URL: https://www.kaggle.com/datasets/mohamedbakhet/amazon-books-reviews
License(s): CC0-1.0
Downloading amazon-books-reviews.zip to /content
 99% 1.06G/1.06G [00:17<00:00, 39.2MB/s]
100% 1.06G/1.06G [00:17<00:00, 63.9MB/s]


In [None]:
!unzip amazon-books-reviews.zip

Archive:  amazon-books-reviews.zip
  inflating: Books_rating.csv        
  inflating: books_data.csv          


In [None]:
!rm amazon-books-reviews.zip

## **File Reading Approaches**

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

#### **Pandas:**

In [None]:
df_ratings = pd.read_csv('Books_rating.csv')
df_books = pd.read_csv('books_data.csv')

> Pandas took about 1 minute to load the data.

In [None]:
# Drop the pandas DataFrames so their memory can be reclaimed before the next
# reading approach is tried. Deleting the names is sufficient; no helper list
# holding extra references is needed.
del df_ratings, df_books

### **Files:**

In [None]:
# Read each CSV into one large string. No CSV parsing happens here, so despite
# the df_ prefix these variables hold plain `str` objects, not DataFrames.
# The encoding is pinned to UTF-8 (the default pandas.read_csv uses) instead of
# depending on the platform's default text encoding.
with open('Books_rating.csv', 'r', encoding='utf-8') as f:
  df_ratings = f.read()

with open('books_data.csv', 'r', encoding='utf-8') as f:
  df_books = f.read()

INFO:distributed.core:Event loop was unresponsive in Scheduler for 4.70s.  This is often caused by long-running GIL-holding functions or moving large chunks of data. This can cause timeouts and instability.
INFO:distributed.core:Event loop was unresponsive in Worker for 5.14s.  This is often caused by long-running GIL-holding functions or moving large chunks of data. This can cause timeouts and instability.


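> Reading the files with open() and read() takes ~15 seconds, which is faster than loading them with pandas. Note, however, that read() only returns each file's raw text as a single string; unlike pandas, it does not parse the CSV into rows and columns.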

In [None]:
# Read each CSV into a list with one string per line. As with read(), nothing
# is parsed: despite the df_ prefix these variables hold lists of `str`.
# The encoding is pinned to UTF-8 (the default pandas.read_csv uses) instead of
# depending on the platform's default text encoding.
with open('Books_rating.csv', 'r', encoding='utf-8') as f:
  df_ratings = f.readlines()

with open('books_data.csv', 'r', encoding='utf-8') as f:
  df_books = f.readlines()

> - Reading the files with open() and readlines() takes ~5 seconds, which is much faster than both the read() function and the pandas library.
> - However, readlines() returns a list in which each item is one complete line of the file. Because it appends every line to the list and only then returns the whole list, it becomes time-consuming when the file is very large.

### **Dask File Reading:**

In [None]:
#from dask.distributed import Client

#client = Client(n_workers=1, threads_per_worker=4, processes=False, memory_limit='2GB')
#client

In [None]:
import dask.dataframe as dd
import warnings

# Installs a filter that ignores every warning category from this point on,
# not only warnings raised by Dask.
warnings.filterwarnings('ignore')

# The 'Id' column is read with an explicit object dtype instead of an inferred one.
df_ratings = dd.read_csv('Books_rating.csv', dtype = {'Id':'object'})
df_books = dd.read_csv('books_data.csv')

# Row-count check, currently disabled.
#len(df_ratings), len(df_books)

> The data loaded faster with Dask, but a memory leak occurred and the length of the resulting dataframe was not the same as the original.

### **Modin API:**

In [None]:
!pip install -U ipykernel
!pip install modin[all]

In [None]:
import modin.pandas as mpd

df_ratings = mpd.read_csv('Books_rating.csv')
df_books = mpd.read_csv('books_data.csv')

2024-07-13 19:27:51,074	INFO worker.py:1788 -- Started a local Ray instance.


In [None]:
len(df_ratings), len(df_books)

(3000000, 212404)

> With Modin the data loaded slowly, a memory leak occurred as well, and the length of the dataframe was not the same as the original.

### **Creating Utility File for data ingestion:**

In [None]:
!rm utility.py
!rm file.yaml

In [None]:
%%writefile file.yaml
file_type: csv
dataset_name: df_rating
file_name: Books_rating
table_name: books_rating
inbound_delimiter: ","
outbound_delimiter: "|"
skip_leading_rows: 1
columns:
    - id
    - title
    - price
    - user_id
    - profile_name
    - review_helpfulness
    - review_score
    - review_time
    - review_summary
    - review_text

Overwriting file.yaml


In [None]:
%%writefile utility.py
import logging
import os
import subprocess
import yaml
import datetime
import gc
import re


def read_config_file(filepath):
  """Load a YAML configuration file.

  Args:
    filepath: Path of the YAML file to read.

  Returns:
    The parsed document as produced by ``yaml.safe_load``, or ``None`` when
    the content is not valid YAML (the parse error is logged).

  Raises:
    OSError: If the file cannot be opened.
    UnicodeDecodeError: If the file is not valid UTF-8.
  """
  # Pin the encoding instead of inheriting the platform's default.
  with open(filepath, 'r', encoding='utf-8') as stream:
    try:
      return yaml.safe_load(stream)
    except yaml.YAMLError as exc:
      # Include the file name so the log entry is actionable on its own.
      logging.error('Could not parse YAML file %s: %s', filepath, exc)
      return None

def replacer(string, char):
  """Collapse every run of two or more consecutive ``char`` into a single one.

  Args:
    string: Text to clean up.
    char: The character (or character sequence) whose repeats are collapsed.
      An empty value leaves ``string`` unchanged.

  Returns:
    ``string`` with each run of repeated ``char`` replaced by one ``char``.
  """
  if not char:
    # An empty pattern has nothing to repeat; there is nothing to collapse.
    return string
  # Group the escaped text so the quantifier applies to all of ``char`` and
  # not only to its last character.
  pattern = '(?:' + re.escape(char) + '){2,}'
  # A callable replacement inserts ``char`` literally; passing the string
  # itself would make re.sub interpret backslashes in it as escapes.
  return re.sub(pattern, lambda _match: char, string)

def col_header_val(df, config):
  """Validate a DataFrame's column headers against the expected column list.

  Each header is normalised before the comparison: converted to lower case,
  the characters ``$%^&*@!`` removed, runs of underscores collapsed to a
  single one, and leading/trailing underscores stripped. ``df`` itself is
  not modified.

  Args:
    df: Object exposing a ``columns`` attribute (e.g. a pandas DataFrame).
    config: Iterable of the expected column names, in the expected order.

  Returns:
    1 when the normalised headers equal the expected list (same names, same
    order), otherwise 0. The outcome is printed, and on failure the
    differences are printed and both lists are logged.
  """
  df_columns = []
  for name in df.columns:
    # str() lets non-string labels (e.g. integer headers) be validated too.
    name = str(name).lower()
    # Remove special characters before collapsing underscores; otherwise a
    # header such as 'a_$_b' would end up as 'a__b'.
    name = re.sub('[$%^&*@!]', '', name)
    name = replacer(name, '_')
    df_columns.append(name.strip('_'))

  expected_col = list(config)

  # List equality already implies equal length, so one comparison suffices.
  if expected_col == df_columns:
    print("column name and column length validation passed")
    return 1

  print("column name and column length validation failed")
  mismatched_columns_file = set(df_columns).difference(expected_col)
  print("Columns not in YAML files", mismatched_columns_file)
  mismatched_YAML_file = set(expected_col).difference(df_columns)
  print("Columns not in uploaded file", mismatched_YAML_file)
  logging.info(f'df columns: {df_columns}')
  logging.info(f'expected columns: {expected_col}')
  return 0

Overwriting utility.py


### **Parameterizing process using YAML file:**

In [None]:
import utility as util

In [None]:
config_data = util.read_config_file('file.yaml')

# Build the input path from the config instead of hard-coding it, so pointing
# the notebook at another file only requires editing file.yaml.
source_path = f"{config_data['file_name']}.{config_data['file_type']}"
df_rating = pd.read_csv(source_path, delimiter = config_data['inbound_delimiter'])

# Replace the headers with variants carrying stray/repeated underscores --
# presumably on purpose, to exercise the clean-up done by col_header_val.
df_rating.columns = ['id', '_title', 'price', 'user__id', 'profile__name', 'review_helpfulness', 'review_score', 'review__time', 'review_summary', 'review_text_']

# Returns 1 when the normalised headers match the YAML column list, else 0.
util.col_header_val(df_rating, config_data['columns'])

In [58]:
import gzip

# Write the frame as a gzip-compressed, delimiter-separated text file.
# The separator is taken from file.yaml (outbound_delimiter) instead of being
# hard-coded, and index=False keeps pandas' positional row index from being
# written as an extra unnamed leading column.
with gzip.open('books_rating_zip.gz', 'wb') as f:
  df_rating.to_csv(f, sep = config_data['outbound_delimiter'], index = False)