In [1]:
import numpy as np
import pandas as pd

# Create our Dataset

In [2]:
# Creating Function to generate dataset
def get_dataset(size, seed=None):
    """
    Generate a fake dataset with specified size.

    Parameters:
    size (int): The number of rows to generate.
    seed (int, optional): Seed for the random number generator. Passing the
        same seed yields the same DataFrame, which makes size and timing
        comparisons repeatable. The default (None) draws fresh random data
        on every call.

    Returns:
    DataFrame: A DataFrame containing fake data with the columns
        'size', 'age', 'team', 'win', 'date' and 'prob'.
    """
    # A local generator keeps the function self-contained: results depend only
    # on `seed`, not on whatever state NumPy's global RNG happens to be in.
    rng = np.random.default_rng(seed)

    # Every calendar day from 2020-01-01 through 2022-12-31 (both inclusive).
    dates = pd.date_range('2020-01-01', '2022-12-31').to_numpy()

    # Build all columns in a single constructor call instead of growing an
    # empty frame one column at a time.
    return pd.DataFrame({
        'size': rng.choice(['big', 'medium', 'small'], size),
        # Upper bound is exclusive, so ages fall in 1..9.
        'age': rng.integers(1, 10, size),
        'team': rng.choice(['red', 'blue', 'yellow', 'green'], size),
        'win': rng.choice(['yes', 'no'], size),
        'date': rng.choice(dates, size),
        'prob': rng.uniform(0, 1, size),
    })


In [3]:
# Creating Function to set data types
def set_dtypes(df):
    """
    Set appropriate data types for DataFrame columns.

    The input frame is left untouched; a new DataFrame is returned. This keeps
    the function idempotent when a notebook cell is re-run.

    Parameters:
    df (DataFrame): Input DataFrame with the columns 'size', 'team', 'age',
        'win' and 'prob'.

    Returns:
    DataFrame: A new DataFrame in which 'size' and 'team' are categorical,
        'age' is int16, 'prob' is float16, and an additional column 'dq'
        holds 'win' mapped to booleans ('yes' -> True, 'no' -> False).
        The original 'win' column is kept as-is.
    """
    # `assign` returns a copy with the derived column appended at the end;
    # 'win' values other than 'yes'/'no' have no entry in the mapping.
    with_flag = df.assign(dq=df['win'].map({'yes': True, 'no': False}))

    # One astype call with a mapping converts all remaining columns at once
    # and again returns a new frame rather than modifying in place.
    return with_flag.astype({
        'size': 'category',
        'team': 'category',
        'age': 'int16',
        'prob': 'float16',
    })


# CSV Format Read and Write

### Example 1

In [4]:
# Generate a dataset with 1,000,000 records
df = get_dataset(1_000_000)

# Save the DataFrame to a CSV file without including the index
# (index=False keeps the row labels out of the file, so reading it back yields only the data columns)
df.to_csv('test_csv.csv', index=False)

# Display information about the saved CSV file
# (the listing shows the on-disk block count and a human-readable size)
!ls -GFlash test_csv.csv

92176 -rw-r--r--@ 1 praveensingh  staff    45M Mar 17 20:55 test_csv.csv


In [5]:
# Read the saved CSV file back into a DataFrame and summarise its columns, dtypes and memory usage.
# No dtype hints are passed, so every column gets whatever type read_csv infers.
df = pd.read_csv('test_csv.csv')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000000 entries, 0 to 999999
Data columns (total 6 columns):
 #   Column  Non-Null Count    Dtype  
---  ------  --------------    -----  
 0   size    1000000 non-null  object 
 1   age     1000000 non-null  int64  
 2   team    1000000 non-null  object 
 3   win     1000000 non-null  object 
 4   date    1000000 non-null  object 
 5   prob    1000000 non-null  float64
dtypes: float64(1), int64(1), object(4)
memory usage: 45.8+ MB


### Example 2

In [6]:
# Generate a dataset with 1,000,000 records and save it to a CSV file with the index included
# (index=True writes the row labels as an extra column, which is why this file is larger than the index-free one)
df = get_dataset(1_000_000)
df.to_csv('test_csv.csv', index=True)

# Display information about the saved CSV file
!ls -GFlash test_csv.csv

106512 -rw-r--r--@ 1 praveensingh  staff    51M Mar 17 20:55 test_csv.csv


In [7]:
# Read the saved CSV file back into a DataFrame, using the first column as the index,
# and summarise its columns, dtypes and memory usage.
df = pd.read_csv('test_csv.csv', index_col=[0])
df.info()


<class 'pandas.core.frame.DataFrame'>
Index: 1000000 entries, 0 to 999999
Data columns (total 6 columns):
 #   Column  Non-Null Count    Dtype  
---  ------  --------------    -----  
 0   size    1000000 non-null  object 
 1   age     1000000 non-null  int64  
 2   team    1000000 non-null  object 
 3   win     1000000 non-null  object 
 4   date    1000000 non-null  object 
 5   prob    1000000 non-null  float64
dtypes: float64(1), int64(1), object(4)
memory usage: 53.4+ MB


### Example 3

In [8]:
# Save DataFrame to CSV without index
df.to_csv('test_csv.csv', index=False)

# Read CSV file back into DataFrame with specified data types.
# Only columns that exist in the file are listed; a key for a column the CSV
# does not contain would have no effect. 'win' and 'date' get no hint here
# and therefore keep the type read_csv infers for them.
dtype = {'size': 'category', 'team': 'category', 'age': 'int16', 'prob': 'float16'}
df = pd.read_csv('test_csv.csv', dtype=dtype)

# Check DataFrame info to confirm the data types
df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000000 entries, 0 to 999999
Data columns (total 6 columns):
 #   Column  Non-Null Count    Dtype   
---  ------  --------------    -----   
 0   size    1000000 non-null  category
 1   age     1000000 non-null  int16   
 2   team    1000000 non-null  category
 3   win     1000000 non-null  object  
 4   date    1000000 non-null  object  
 5   prob    1000000 non-null  float16 
dtypes: category(2), float16(1), int16(1), object(2)
memory usage: 21.0+ MB


### Example 4

In [9]:
# Partial dtype specification: only 'size' and 'age' match columns in the file,
# every other column keeps its inferred type.
# NOTE(review): 'test' is not a column of this CSV, so that entry has no effect and
# 'team' is still read as object (see the info output) — presumably 'team' was intended; confirm.
df = pd.read_csv('test_csv.csv',
                dtype= {
                    'size': 'category',
                    'age': 'int16',  
                    'test': 'category'
                })
df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000000 entries, 0 to 999999
Data columns (total 6 columns):
 #   Column  Non-Null Count    Dtype   
---  ------  --------------    -----   
 0   size    1000000 non-null  category
 1   age     1000000 non-null  int16   
 2   team    1000000 non-null  object  
 3   win     1000000 non-null  object  
 4   date    1000000 non-null  object  
 5   prob    1000000 non-null  float64 
dtypes: category(1), float64(1), int16(1), object(3)
memory usage: 33.4+ MB


# *****************************************************************

# SPEED TEST OF { 'CSV', 'PICKLE', 'PARQUET', 'FEATHER' }

# 1. CSV Format

### Example 1. Without Changing Dtypes
- 45 MB >> size
- 1.39 s  >> to save
- 286 ms >> to read

In [10]:
# Baseline CSV benchmark: 1,000,000 rows with the default dtypes from get_dataset.
df = get_dataset(1_000_000)
# Time the write first; the read below times loading the file the write produced.
# NOTE(review): `df_csv` is assigned inside the %timeit statement and is never used
# afterwards — later cells only inspect `df`; confirm it is not needed.
%timeit df.to_csv('test_csv.csv', index=False)
%timeit df_csv = pd.read_csv('test_csv.csv')

1.43 s ± 31.1 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
316 ms ± 14.8 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [11]:
# On-disk size of the CSV file written by the timing run.
! ls -GFlash test_csv.csv

92176 -rw-r--r--@ 1 praveensingh  staff    45M Mar 17 20:55 test_csv.csv


In [12]:
# Summary of the in-memory frame `df` that was written to CSV.
# NOTE(review): this is not the frame read back from disk, so it does not show what a CSV round trip preserves.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000000 entries, 0 to 999999
Data columns (total 6 columns):
 #   Column  Non-Null Count    Dtype         
---  ------  --------------    -----         
 0   size    1000000 non-null  object        
 1   age     1000000 non-null  int64         
 2   team    1000000 non-null  object        
 3   win     1000000 non-null  object        
 4   date    1000000 non-null  datetime64[ns]
 5   prob    1000000 non-null  float64       
dtypes: datetime64[ns](1), float64(1), int64(1), object(3)
memory usage: 45.8+ MB


### Example 2. With Changing Dtypes

- 38 MB >> size
- 1.17 s  >> to save
- 285 ms >> to read

In [13]:
# CSV benchmark on a frame whose columns were converted by set_dtypes
# (categories, int16, float16, plus the extra boolean 'dq' column).
df = get_dataset(1_000_000)
df = set_dtypes(df)
# Time the write first; the read below times loading the file the write produced.
# The read passes no dtype hints.
%timeit df.to_csv('test_csv.csv', index=False)
%timeit df_csv = pd.read_csv('test_csv.csv')

1.19 s ± 28.5 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
300 ms ± 9.74 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [14]:
# On-disk size of the CSV file written from the dtype-converted frame.
! ls -GFlash test_csv.csv

79888 -rw-r--r--@ 1 praveensingh  staff    38M Mar 17 20:55 test_csv.csv


In [15]:
# Summary of the in-memory, dtype-converted frame `df` that was written to CSV.
# NOTE(review): these are the dtypes before writing; the dtypes of the frame read back from the CSV are not shown here.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000000 entries, 0 to 999999
Data columns (total 7 columns):
 #   Column  Non-Null Count    Dtype         
---  ------  --------------    -----         
 0   size    1000000 non-null  category      
 1   age     1000000 non-null  int16         
 2   team    1000000 non-null  category      
 3   win     1000000 non-null  object        
 4   date    1000000 non-null  datetime64[ns]
 5   prob    1000000 non-null  float16       
 6   dq      1000000 non-null  bool          
dtypes: bool(1), category(2), datetime64[ns](1), float16(1), int16(1), object(1)
memory usage: 21.9+ MB


# *****************************************************************

# 2. Pickle Format

### Example 1. Without Changing Dtypes
- 43 MB Size
- 395 ms >> to save
- 128 ms >> to read

In [16]:
# Baseline pickle benchmark: 1,000,000 rows with the default dtypes from get_dataset.
df = get_dataset(1_000_000)
# Time the write first; the read below times loading the file the write produced.
%timeit df.to_pickle('test.pickle')
%timeit df_pickle = pd.read_pickle('test.pickle') 

511 ms ± 37.7 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
134 ms ± 5.13 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [17]:
# On-disk size of the pickle file written by the timing run.
! ls -GFlash test.pickle

87264 -rw-r--r--@ 1 praveensingh  staff    43M Mar 17 20:56 test.pickle


### Example 2. With Changing Dtypes 

- 20 MB Size
- 88 ms >> to save
- 36 ms >> to read

In [18]:
# Pickle benchmark on a frame whose columns were converted by set_dtypes.
df = get_dataset(1_000_000)
df = set_dtypes(df)
# Time the write first; the read below times loading the file the write produced.
%timeit df.to_pickle('test.pickle')
%timeit df_pickle = pd.read_pickle('test.pickle') 

104 ms ± 6.38 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
40.2 ms ± 2.99 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [19]:
# On-disk size of the pickle file written from the dtype-converted frame.
! ls -GFlash test.pickle

40144 -rw-r--r--@ 1 praveensingh  staff    20M Mar 17 20:56 test.pickle


In [20]:
# Summary of the in-memory, dtype-converted frame `df` that was pickled
# (the frame as written, not one loaded back from the pickle file).
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000000 entries, 0 to 999999
Data columns (total 7 columns):
 #   Column  Non-Null Count    Dtype         
---  ------  --------------    -----         
 0   size    1000000 non-null  category      
 1   age     1000000 non-null  int16         
 2   team    1000000 non-null  category      
 3   win     1000000 non-null  object        
 4   date    1000000 non-null  datetime64[ns]
 5   prob    1000000 non-null  float16       
 6   dq      1000000 non-null  bool          
dtypes: bool(1), category(2), datetime64[ns](1), float16(1), int16(1), object(1)
memory usage: 21.9+ MB


# *****************************************************************

# 3. Parquet Format

- Parquet support in pandas needs an engine; install one of these in the kernel's environment:
  - `%pip install pyarrow`
  - `%pip install fastparquet`

### Example 1. Without Changing Dtypes

- 10 MB Size
- 164 ms >> to save
- 62 ms >> to read

In [21]:
# Baseline Parquet benchmark: 1,000,000 rows with the default dtypes from get_dataset.
df = get_dataset(1_000_000)
# Time the write first; the read below times loading the file the write produced.
%timeit df.to_parquet('test.parquet')
%timeit df_parquet = pd.read_parquet('test.parquet') 

201 ms ± 27.2 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
64.5 ms ± 1.85 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [22]:
# On-disk size of the Parquet file written by the timing run.
! ls -GFlash test.parquet

22704 -rw-r--r--@ 1 praveensingh  staff    10M Mar 17 20:56 test.parquet


In [23]:
# Summary of the in-memory frame `df` that was written to Parquet
# (the frame as written, not one loaded back from the Parquet file).
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000000 entries, 0 to 999999
Data columns (total 6 columns):
 #   Column  Non-Null Count    Dtype         
---  ------  --------------    -----         
 0   size    1000000 non-null  object        
 1   age     1000000 non-null  int64         
 2   team    1000000 non-null  object        
 3   win     1000000 non-null  object        
 4   date    1000000 non-null  datetime64[ns]
 5   prob    1000000 non-null  float64       
dtypes: datetime64[ns](1), float64(1), int64(1), object(3)
memory usage: 45.8+ MB


### Example 2. With Changing Dtypes

- 4.2 MB Size
- 103 ms >> to save
- 27.5 ms >> to read

In [24]:
# Parquet benchmark on a frame whose columns were converted by set_dtypes.
df = get_dataset(1_000_000)
df = set_dtypes(df)
# Time the write first; the read below times loading the file the write produced.
%timeit df.to_parquet('test.parquet')
%timeit df_parquet = pd.read_parquet('test.parquet') 

103 ms ± 1.32 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
29 ms ± 581 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [25]:
# On-disk size of the Parquet file written from the dtype-converted frame.
! ls -GFlash test.parquet

10416 -rw-r--r--@ 1 praveensingh  staff   4.2M Mar 17 20:56 test.parquet


In [26]:
# Summary of the in-memory, dtype-converted frame `df` that was written to Parquet
# (the frame as written, not one loaded back from the Parquet file).
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000000 entries, 0 to 999999
Data columns (total 7 columns):
 #   Column  Non-Null Count    Dtype         
---  ------  --------------    -----         
 0   size    1000000 non-null  category      
 1   age     1000000 non-null  int16         
 2   team    1000000 non-null  category      
 3   win     1000000 non-null  object        
 4   date    1000000 non-null  datetime64[ns]
 5   prob    1000000 non-null  float16       
 6   dq      1000000 non-null  bool          
dtypes: bool(1), category(2), datetime64[ns](1), float16(1), int16(1), object(1)
memory usage: 21.9+ MB


In [27]:
# Load only the 'date' and 'win' columns from the Parquet file rather than every column it contains.
pd.read_parquet('test.parquet', columns = ['date', 'win'])

Unnamed: 0,date,win
0,2021-08-08,yes
1,2022-04-30,no
2,2021-10-07,yes
3,2022-02-18,yes
4,2021-07-07,yes
...,...,...
999995,2020-10-26,yes
999996,2022-12-06,no
999997,2020-08-11,yes
999998,2022-03-27,no


# *****************************************************************

# 4. Feather Format

### Example 1. Without Changing Dtypes

- 29 MB Size
- 84.5 ms >> to save
- 56.7 ms >> to read

In [28]:
# Baseline Feather benchmark: 1,000,000 rows with the default dtypes from get_dataset.
df = get_dataset(1_000_000)
# Time the write first; the read below times loading the file the write produced.
%timeit df.to_feather('test.feather')
%timeit df_feather = pd.read_feather('test.feather') 

82.7 ms ± 1.37 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
58.2 ms ± 1.73 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [29]:
# On-disk size of the Feather file written by the timing run.
! ls -GFlash test.feather

59656 -rw-r--r--@ 1 praveensingh  staff    29M Mar 17 20:56 test.feather


In [30]:
# Summary of the in-memory frame `df` that was written to Feather
# (the frame as written, not one loaded back from the Feather file).
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000000 entries, 0 to 999999
Data columns (total 6 columns):
 #   Column  Non-Null Count    Dtype         
---  ------  --------------    -----         
 0   size    1000000 non-null  object        
 1   age     1000000 non-null  int64         
 2   team    1000000 non-null  object        
 3   win     1000000 non-null  object        
 4   date    1000000 non-null  datetime64[ns]
 5   prob    1000000 non-null  float64       
dtypes: datetime64[ns](1), float64(1), int64(1), object(3)
memory usage: 45.8+ MB


### Example 2. With Changing Dtypes

- 12 MB Size
- 41.9 ms >> to save
- 20.9 ms >> to read

In [31]:
# Feather benchmark on a frame whose columns were converted by set_dtypes.
df = get_dataset(1_000_000)
df = set_dtypes(df)
# Time the write first; the read below times loading the file the write produced.
%timeit df.to_feather('test.feather')
%timeit df_feather = pd.read_feather('test.feather') 

43.1 ms ± 3.32 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
22.2 ms ± 441 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [32]:
# On-disk size of the Feather file written from the dtype-converted frame.
! ls -GFlash test.feather

24968 -rw-r--r--@ 1 praveensingh  staff    12M Mar 17 20:57 test.feather


In [33]:
# Summary of the in-memory, dtype-converted frame `df` that was written to Feather
# (the frame as written, not one loaded back from the Feather file).
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000000 entries, 0 to 999999
Data columns (total 7 columns):
 #   Column  Non-Null Count    Dtype         
---  ------  --------------    -----         
 0   size    1000000 non-null  category      
 1   age     1000000 non-null  int16         
 2   team    1000000 non-null  category      
 3   win     1000000 non-null  object        
 4   date    1000000 non-null  datetime64[ns]
 5   prob    1000000 non-null  float16       
 6   dq      1000000 non-null  bool          
dtypes: bool(1), category(2), datetime64[ns](1), float16(1), int16(1), object(1)
memory usage: 21.9+ MB


# 5. CSV vs Pickle vs Parquet vs Feather

In [34]:
print("Reading and Writing CSV:")
# Generate a dataset with 5,000,000 records
# (each format's cell draws its own random frame, so the four formats are timed on different draws from the same generator)
df = get_dataset(5_000_000)

# Set the data types for the DataFrame columns
df = set_dtypes(df)

# Measure the time taken to write the DataFrame to a CSV file
# (written to 'test.csv', a different file from the 'test_csv.csv' used in the earlier examples)
%time df.to_csv('test.csv', index=False)

# Measure the time taken to read the CSV file into a DataFrame
%time df_csv = pd.read_csv('test.csv')


Reading and Writing CSV:
CPU times: user 5.73 s, sys: 131 ms, total: 5.86 s
Wall time: 5.9 s
CPU times: user 1.33 s, sys: 152 ms, total: 1.48 s
Wall time: 1.59 s


In [35]:
print("Reading and Writing Pickle:")
# Generate a dataset with 5,000,000 records
# (each format's cell draws its own random frame, so the four formats are timed on different draws from the same generator)
df = get_dataset(5_000_000)

# Set the data types for the DataFrame columns
df = set_dtypes(df)

# Measure the time taken to write the DataFrame to a pickle file
# (this overwrites the 'test.pickle' produced by the 1,000,000-row examples)
%time df.to_pickle('test.pickle')

# Measure the time taken to read the pickle file into a DataFrame
%time df_pickle = pd.read_pickle('test.pickle')


Reading and Writing Pickle:
CPU times: user 563 ms, sys: 30.6 ms, total: 594 ms
Wall time: 606 ms
CPU times: user 135 ms, sys: 41.7 ms, total: 176 ms
Wall time: 182 ms


In [36]:
print("Reading and Writing Parquet:")
# Generate a dataset with 5,000,000 records
# (each format's cell draws its own random frame, so the four formats are timed on different draws from the same generator)
df = get_dataset(5_000_000)

# Set the data types for the DataFrame columns
df = set_dtypes(df)

# Measure the time taken to write the DataFrame to a Parquet file
# (this overwrites the 'test.parquet' produced by the 1,000,000-row examples)
%time df.to_parquet('test.parquet')

# Measure the time taken to read the Parquet file into a DataFrame
%time df_parquet = pd.read_parquet('test.parquet')


Reading and Writing Parquet:
CPU times: user 539 ms, sys: 14.6 ms, total: 553 ms
Wall time: 518 ms
CPU times: user 249 ms, sys: 75.5 ms, total: 325 ms
Wall time: 122 ms


In [37]:
print("Reading and Writing Feather:")
# Generate a dataset with 5,000,000 records
# (each format's cell draws its own random frame, so the four formats are timed on different draws from the same generator)
df = get_dataset(5_000_000)

# Set the data types for the DataFrame columns
df = set_dtypes(df)

# Measure the time taken to write the DataFrame to a Feather file
# (this overwrites the 'test.feather' produced by the 1,000,000-row examples)
%time df.to_feather('test.feather')

# Measure the time taken to read the Feather file into a DataFrame
%time df_feather = pd.read_feather('test.feather')


Reading and Writing Feather:
CPU times: user 278 ms, sys: 26.3 ms, total: 304 ms
Wall time: 198 ms
CPU times: user 138 ms, sys: 27.8 ms, total: 165 ms
Wall time: 92 ms


In [38]:
# Check the file sizes of the generated files
# (these are the four files written by the 5,000,000-row cells; all hold dtype-converted frames from set_dtypes)
!ls -GFlash test.csv test.pickle test.parquet test.feather

393232 -rw-r--r--@ 1 praveensingh  staff   190M Mar 17 20:57 test.csv
123264 -rw-r--r--@ 1 praveensingh  staff    60M Mar 17 20:57 test.feather
 45072 -rw-r--r--@ 1 praveensingh  staff    21M Mar 17 20:57 test.parquet
203704 -rw-r--r--@ 1 praveensingh  staff    98M Mar 17 20:57 test.pickle


# Done !!!