In [1]:
import pandas as pd
import numpy as np

# Create a reproducible sample dataset of synthetic employee records.
N_ROWS = 100_000  # single source of truth for the length of every column

np.random.seed(0)  # fixed seed: the same values are generated on every run
# NOTE: the random draws below are evaluated in dict order (name, age, salary,
# department); reordering the keys would change the generated values.
data = {
    'id': np.arange(1, N_ROWS + 1),  # sequential ids 1..N_ROWS
    'name': np.random.choice(['Alice', 'Bob', 'Charlie', 'David', 'Eva'], N_ROWS),
    'age': np.random.randint(18, 80, size=N_ROWS),  # upper bound is exclusive -> 18..79
    'salary': np.random.uniform(30000, 120000, size=N_ROWS),
    'department': np.random.choice(['HR', 'Engineering', 'Marketing', 'Sales'], N_ROWS)
}
df = pd.DataFrame(data)


**1. Measure the execution time of a DataFrame operation.**

In [2]:
import time

# Measure execution time of a DataFrame operation.
# time.perf_counter() is a monotonic, high-resolution clock meant for timing
# short intervals; time.time() follows the wall clock, which can be coarse and
# can jump if the system clock is adjusted.
start_time = time.perf_counter()
df['salary_increase'] = df['salary'] * 1.05  # vectorized 5% raise on every row
end_time = time.perf_counter()
print(f"Execution time: {end_time - start_time} seconds")


Execution time: 0.020416736602783203 seconds


**2. Use vectorized operations instead of loops.**

In [3]:
# Vectorized column arithmetic: one call scales the whole 'salary' column by
# 1.05, with no explicit Python-level loop over the rows.
df['new_salary'] = df['salary'].mul(1.05)


**3. Optimize memory usage by changing data types.**

In [4]:
# Optimize memory usage by changing data types.
# - id:     int32 is wide enough for ids up to 100000
# - age:    int8 (max 127) covers the generated 18..79 range
# - salary: float32 keeps roughly 7 significant digits, so values are rounded
#           slightly in exchange for half the storage
df = df.astype({'id': 'int32', 'age': 'int8', 'salary': 'float32'})

# DataFrame.info() prints the report itself and returns None, so it is called
# directly; wrapping it in print() would append a stray "None" line.
df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 7 columns):
 #   Column           Non-Null Count   Dtype  
---  ------           --------------   -----  
 0   id               100000 non-null  int32  
 1   name             100000 non-null  object 
 2   age              100000 non-null  int8   
 3   salary           100000 non-null  float32
 4   department       100000 non-null  object 
 5   salary_increase  100000 non-null  float64
 6   new_salary       100000 non-null  float64
dtypes: float32(1), float64(2), int32(1), int8(1), object(2)
memory usage: 3.9+ MB
None


**4. Load data in chunks to save memory.**

In [None]:
import pandas as pd

# Number of rows parsed per chunk; read_csv(chunksize=...) yields one DataFrame
# of at most this many rows per iteration instead of parsing the file at once.
chunk_size = 20000


def downcast_integer_columns(chunk):
    """Store every integer column of `chunk` in the smallest integer dtype that fits.

    The conversion is lossless: only the storage width changes, not the values.
    Non-integer columns are left as they are.

    Parameters
    ----------
    chunk : pandas.DataFrame
        One freshly parsed chunk; it is modified in place and returned.

    Returns
    -------
    pandas.DataFrame
        The same frame with its integer columns downcast.
    """
    for col in chunk.columns:
        if pd.api.types.is_integer_dtype(chunk[col]):
            chunk[col] = pd.to_numeric(chunk[col], downcast='integer')
    return chunk


# Reduced chunks are collected here. Keeping the raw chunks would need as much
# memory as a plain read_csv, so each chunk is shrunk before it is stored.
chunks = []

# Load the CSV file in chunks
for chunk in pd.read_csv('large_dataset.csv', chunksize=chunk_size):
    # Process each chunk (e.g., filtering, aggregation) before keeping it
    chunks.append(downcast_integer_columns(chunk))

# Concatenate the reduced chunks into a single DataFrame. If a column ended up
# with different integer widths in different chunks, concat promotes it to a
# dtype that can hold all of them.
df_large = pd.concat(chunks)

# Display the first few rows of the concatenated DataFrame
print(df_large.head())


**5. Use categorical data types to save memory.**

In [6]:
# Use categorical data types to save memory.
# 'name' and 'department' each repeat a handful of distinct labels, so a
# category column can store small integer codes plus one copy of each label.
df['name'] = df['name'].astype('category')
df['department'] = df['department'].astype('category')

# DataFrame.info() prints the report itself and returns None, so it is called
# directly; wrapping it in print() would append a stray "None" line.
df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 7 columns):
 #   Column           Non-Null Count   Dtype   
---  ------           --------------   -----   
 0   id               100000 non-null  int32   
 1   name             100000 non-null  category
 2   age              100000 non-null  int8    
 3   salary           100000 non-null  float32 
 4   department       100000 non-null  category
 5   salary_increase  100000 non-null  float64 
 6   new_salary       100000 non-null  float64 
dtypes: category(2), float32(1), float64(2), int32(1), int8(1)
memory usage: 2.6 MB
None


**6. Apply parallel processing for DataFrame operations.**

In [14]:
import pandas as pd
import numpy as np
from joblib import Parallel, delayed

# Sample data for this demo. It is kept under its own names so that the shared
# `df` (and the dtype optimizations earlier cells applied to it) stays intact
# for the cells that follow.
n_rows = 100_000
np.random.seed(0)  # same seed as the main dataset -> same generated values
# NOTE: the random draws are evaluated in dict order; reordering the keys would
# change the generated values.
parallel_data = {
    'id': np.arange(1, n_rows + 1),
    'name': np.random.choice(['Alice', 'Bob', 'Charlie', 'David', 'Eva'], n_rows),
    'age': np.random.randint(18, 80, size=n_rows),
    'salary': np.random.uniform(30000, 120000, size=n_rows),
    'department': np.random.choice(['HR', 'Engineering', 'Marketing', 'Sales'], n_rows)
}
df_parallel = pd.DataFrame(parallel_data)


# Define the function to be applied in parallel
def increase_salary(salary):
    """Return `salary` raised by 5%.

    Accepts a single number or a NumPy array / pandas Series, in which case the
    raise is applied element-wise.
    """
    return salary * 1.05


# Apply the function in parallel.
# Each task receives a whole slice of the column rather than a single value:
# dispatching one task per row would create 100,000 tiny jobs whose scheduling
# and serialization overhead far outweighs one multiplication each.
num_cores = -1  # Use all available cores
rows_per_task = 10_000
num_tasks = max(1, len(df_parallel) // rows_per_task)  # at least one task, even for a tiny frame
salary_chunks = np.array_split(df_parallel['salary'].to_numpy(), num_tasks)
raised_chunks = Parallel(n_jobs=num_cores)(
    delayed(increase_salary)(chunk) for chunk in salary_chunks
)
# Results come back in submission order, so concatenating restores row order.
df_parallel['parallel_new_salary'] = np.concatenate(raised_chunks)

# Display the first few rows
print(df_parallel.head())


   id   name  age         salary   department  parallel_new_salary
0   1    Eva   40   76893.998807    Marketing         80738.698748
1   2  Alice   21   49386.795868        Sales         51856.135662
2   3  David   73  119799.220312  Engineering        125789.181327
3   4  David   64   72567.683648    Marketing         76196.067830
4   5  David   20   50129.425674    Marketing         52635.896958


**7. Profile memory usage of a DataFrame.**

In [None]:
import pandas as pd
import numpy as np
from memory_profiler import profile

# Sample data generation, wrapped in a function decorated with
# memory_profiler's `profile`.
# NOTE(review): memory_profiler reads the decorated function's source to build
# its line-by-line report; for a function defined in a notebook cell the report
# may not be produced - confirm, or use the %memit / %mprun magics instead.
@profile
def load_data():
    """Build and return the synthetic employee DataFrame.

    Reseeds NumPy's global random generator with 0, then draws 100000 rows with
    the columns id, name, age, salary and department.
    """
    np.random.seed(0)
    # The draws below must stay in this order (name, age, salary, department):
    # they all consume the same seeded random stream.
    ids = np.arange(1, 100001)
    names = np.random.choice(['Alice', 'Bob', 'Charlie', 'David', 'Eva'], 100000)
    ages = np.random.randint(18, 80, size=100000)
    salaries = np.random.uniform(30000, 120000, size=100000)
    departments = np.random.choice(['HR', 'Engineering', 'Marketing', 'Sales'], 100000)
    return pd.DataFrame({
        'id': ids,
        'name': names,
        'age': ages,
        'salary': salaries,
        'department': departments,
    })

# Load data and profile memory usage.
# The result is kept under its own name: rebinding `df` here would replace the
# frame that earlier cells downcast and extended with extra columns, and the
# merge, sparse and export cells further down operate on that `df`.
df_profiled = load_data()

# Display the first few rows
print(df_profiled.head())


**8. Optimize large DataFrame merging.**

In [9]:
# Create another sample DataFrame to merge.
# The key column is created with the same dtype as df['id']: np.arange would
# otherwise use its default integer dtype, and joining keys of different
# integer widths requires converting them to a common type first.
df2 = pd.DataFrame({
    'id': np.arange(1, 100001, dtype=df['id'].dtype),
    'bonus': np.random.uniform(1000, 5000, size=100000)
})

# Optimize large DataFrame merging: a left join keeps every row of `df` and
# attaches the matching 'bonus' through the shared 'id' key.
df_merged = df.merge(df2, on='id', how='left')
print(df_merged.head())


   id   name  age         salary   department  salary_increase     new_salary  \
0   1    Eva   40   76894.000000    Marketing     80738.698748   80738.698748   
1   2  Alice   21   49386.796875        Sales     51856.135662   51856.135662   
2   3  David   73  119799.218750  Engineering    125789.181327  125789.181327   
3   4  David   64   72567.687500    Marketing     76196.067830   76196.067830   
4   5  David   20   50129.425781    Marketing     52635.896958   52635.896958   

         bonus  
0  2855.663384  
1  1735.434926  
2  2158.926010  
3  4796.768889  
4  3944.317652  


**9. Use sparse data structures for memory efficiency.**

In [10]:
# Use sparse data structures for memory efficiency.
# Work on a copy so the dense `df` is left unchanged.
df_sparse = df.copy()
# A SparseArray stores only the entries that differ from its fill value (NaN
# for floats) together with their positions.
# NOTE: 'salary' is a dense column, so little or nothing is omitted here and
# the position index is extra overhead; sparse storage pays off only when most
# entries equal the fill value.
df_sparse['salary'] = pd.arrays.SparseArray(df_sparse['salary'])

# DataFrame.info() prints the report itself and returns None, so it is called
# directly; wrapping it in print() would append a stray "None" line.
df_sparse.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 7 columns):
 #   Column           Non-Null Count   Dtype               
---  ------           --------------   -----               
 0   id               100000 non-null  int32               
 1   name             100000 non-null  category            
 2   age              100000 non-null  int8                
 3   salary           100000 non-null  Sparse[float32, nan]
 4   department       100000 non-null  category            
 5   salary_increase  100000 non-null  float64             
 6   new_salary       100000 non-null  float64             
dtypes: Sparse[float32, nan](1), category(2), float64(2), int32(1), int8(1)
memory usage: 3.0 MB
None


**10. Use efficient file formats for saving large DataFrames.**

In [12]:
# Persist the DataFrame in two binary columnar formats. Both files are written
# to the current working directory.
df.to_parquet(path='data.parquet')
df.to_feather(path='data.feather')
