In [32]:
import pandas as pd
import numpy as np

In [33]:
# Example 1: Generate a large DataFrame with random data
np.random.seed(42)
num_rows = 10000
data = {
    'A': np.random.randint(1, 100, num_rows),
    'B': np.random.rand(num_rows),
    'C': np.random.choice(['Category1', 'Category2', 'Category3'], num_rows),
    'D': pd.date_range(start='2023-01-01', periods=num_rows, freq='D')
}
df = pd.DataFrame(data)
print("Example 1:")
print(df.memory_usage(deep=True).sum() / (1024 ** 2))  # Memory usage in MB

Example 1:
0.8202857971191406


In [34]:
# Example 2: Convert data types to reduce memory usage
df['A'] = df['A'].astype('int32')
df['B'] = df['B'].astype('int16')
print("\nExample 2:")
print(df.memory_usage(deep=True).sum() / (1024 ** 2))  # Memory usage in MB



Example 2:
0.7630653381347656


In [35]:
# Example 3: Use 'parse_dates' parameter when reading CSV
df.to_csv('large_data.csv', index=False)
df_csv = pd.read_csv('large_data.csv', parse_dates=['D'])
print("\nExample 3:")
print(df_csv.memory_usage(deep=True).sum() / (1024 ** 2))  # Memory usage in MB



Example 3:
0.8584327697753906


In [36]:
# Example 4: Reduce memory usage with chunking
chunk_size = 1000000
df_chunk = pd.read_csv('large_data.csv', parse_dates=['D'], chunksize=chunk_size)
total_memory_usage = 0
for chunk in df_chunk:
    total_memory_usage += chunk.memory_usage(deep=True).sum()
print("\nExample 4:")
print(total_memory_usage / (1024 ** 2))  # Memory usage in MB



Example 4:
0.8584327697753906


In [37]:
# Example 5: Convert text columns to Categorical
df['C'] = df['C'].astype('category')
print("\nExample 6:")
print(df.memory_usage(deep=True).sum() / (1024 ** 2))  # Memory usage in MB


Example 6:
0.14346885681152344


```python
# Example 6: Remove unused columns
df.drop(columns=['C'], inplace=True)
print("\nExample 5:")
print(df.memory_usage(deep=True).sum() / (1024 ** 2))  # Memory usage in MB
```

In [38]:
# Example 7: Downsizing DataFrame - Splitting into smaller DataFrames
df_list = [df.iloc[i:i+1000000] for i in range(0, len(df), 1000000)]
print("\nExample 7:")
print(df_list[0].memory_usage(deep=True).sum() / (1024 ** 2))  # Memory usage in MB



Example 7:
0.14346885681152344


In [39]:
# Example 8: Use SparseDataFrame for mostly zero or missing values
df_sparse = df.copy()
df_sparse['A'] = pd.arrays.SparseArray(df_sparse['A'])
print("\nExample 8:")
print(df_sparse.memory_usage(deep=True).sum() / (1024 ** 2))  # Memory usage in MB



Example 8:
0.18161582946777344


In [40]:
# Example 9: Encoding categorical variables with integer encoding
df['C'] = df['C'].cat.codes
print("\nExample 9:")
print(df.memory_usage(deep=True).sum() / (1024 ** 2))  # Memory usage in MB




Example 9:
0.14317703247070312


In [41]:
# Example 10: Encoding categorical variables with one-hot encoding
df_one_hot = pd.get_dummies(df, columns=['C'])
print("\nExample 10:")
print(df_one_hot.memory_usage(deep=True).sum() / (1024 ** 2))  # Memory usage in MB


Example 10:
0.16225051879882812


In [42]:
# Example 11: Use pd.to_numeric for numeric columns
df_numeric = df.copy()
df_numeric['A'] = pd.to_numeric(df_numeric['A'])
df_numeric['B'] = pd.to_numeric(df_numeric['B'], downcast='integer')
print("\nExample 11:")
print(df_numeric.memory_usage(deep=True).sum() / (1024 ** 2))  # Memory usage in MB



Example 11:
0.13364028930664062


In [43]:
# Example 12: Use astype to optimize memory usage
df['A'] = df['A'].astype('int8')
df['B'] = df['B'].astype('int16')
df['D'] = df['D'].astype('datetime64')
print("\nExample 12:")
print(df.memory_usage(deep=True).sum() / (1024 ** 2))  # Memory usage in MB



Example 12:
0.11456680297851562


  df['D'] = df['D'].astype('datetime64')


In [44]:
# Example 13: Memory reduction during aggregation
grouped_mean = df.groupby('B')['A'].mean()
print("\nExample 13:")
print(grouped_mean.memory_usage(deep=True) / (1024 ** 2))  # Memory usage in MB



Example 13:
1.52587890625e-05


In [46]:
# Example 14: Store DataFrames in HDF5 format
df.to_hdf('large_data.h5', key='df', mode='w')
df_hdf = pd.read_hdf('large_data.h5', key='df')
print("\nExample 14:")
print(df_hdf.memory_usage(deep=True).sum() / (1024 ** 2))  # Memory usage in MB



Example 14:
0.19073486328125


In [47]:
# Example 15: Compress DataFrames using gzip or bzip2
df.to_csv('large_data.csv.gz', index=False, compression='gzip')
df_gzip = pd.read_csv('large_data.csv.gz', parse_dates=['D'])
print("\nExample 15:")
print(df_gzip.memory_usage(deep=True).sum() / (1024 ** 2))  # Memory usage in MB



Example 15:
0.3053016662597656


In [48]:
# Example 16: Remove duplicate values
df.drop_duplicates(inplace=True)
print("\nExample 16:")
print(df.memory_usage(deep=True).sum() / (1024 ** 2))  # Memory usage in MB



Example 16:
0.19073486328125


In [49]:
# Example 17: Reusing DataFrame's memory using copy
df_copy = df.copy()
print("\nExample 17:")
print(df_copy.memory_usage(deep=True).sum() / (1024 ** 2))  # Memory usage in MB



Example 17:
0.19073486328125


In [51]:
# Example 18: Using external memory storage with Dask DataFrame
import dask.dataframe as dd
df_dask = dd.read_csv('large_data.csv')
print("\nExample 18:")
print(df_dask.memory_usage(deep=True).sum().compute() / (1024 ** 2))  # Memory usage in MB



Example 18:
1.4211006164550781


In [52]:
# Example 19: Avoid using 'object' data type
df_no_object = df.copy()
df_no_object['C'] = df_no_object['C'].astype('string')
print("\nExample 19:")
print(df_no_object.memory_usage(deep=True).sum() / (1024 ** 2))  # Memory usage in MB



Example 19:
0.7343292236328125


In [53]:
# Example 20: Using custom data types
df_custom = df.copy()
df_custom['A'] = df_custom['A'].astype(pd.UInt16Dtype())
print("\nExample 20:")
print(df_custom.memory_usage(deep=True).sum() / (1024 ** 2))  # Memory usage in MB



Example 20:
0.209808349609375
