In [1]:
import pandas as pd
import numpy as np
import time

In [5]:
# Example 1: Build a large synthetic DataFrame used by the examples that follow
np.random.seed(42)  # fixed seed: every run draws the same pseudo-random values
num_rows = 10000

# The columns are drawn in the order A, B, C from the one seeded global stream,
# so reordering these draws would change the generated values.
categories = ['Category1', 'Category2', 'Category3']
data = dict(
    A=np.random.randint(1, 100, num_rows),       # integers in [1, 99]
    B=np.random.rand(num_rows),                  # floats in [0, 1)
    C=np.random.choice(categories, num_rows),    # one label per row
    D=pd.date_range(start='2023-01-01', periods=num_rows, freq='D'),  # consecutive days
)
df = pd.DataFrame(data)

print("Example 1:")
print(df.head())

Example 1:
    A         B          C          D
0  52  0.702325  Category3 2023-01-01
1  93  0.851969  Category3 2023-01-02
2  15  0.356609  Category1 2023-01-03
3  72  0.261397  Category3 2023-01-04
4  61  0.946599  Category3 2023-01-05


In [6]:
# Example 2: Use vectorized operations for calculations
# The addition is expressed on whole columns, so pandas evaluates it as a single
# array operation instead of a Python-level loop over rows.
# Timing uses time.perf_counter(): it is the clock intended for measuring short
# durations, whereas time.time() is wall-clock time that can be coarse-grained
# and is affected by system clock adjustments.
start_time = time.perf_counter()
df['E'] = df['A'] + df['B']
end_time = time.perf_counter()
print("\nExample 2:")
print(f"Time taken: {end_time - start_time} seconds")
print(df.head())


Example 2:
Time taken: 0.0009992122650146484 seconds
    A         B          C          D          E
0  52  0.702325  Category3 2023-01-01  52.702325
1  93  0.851969  Category3 2023-01-02  93.851969
2  15  0.356609  Category1 2023-01-03  15.356609
3  72  0.261397  Category3 2023-01-04  72.261397
4  61  0.946599  Category3 2023-01-05  61.946599


In [7]:
# Example 3: Avoid using iterrows() for iteration
# This cell intentionally shows the row-by-row pattern the heading advises
# against, so its timing can be compared with the other examples:
# iterrows() yields an (index, Series) pair for every row, and the product is
# computed one row at a time in Python.
# Timing uses time.perf_counter(), the clock intended for measuring elapsed
# durations (time.time() is wall-clock time and can be coarse or adjusted).
start_time = time.perf_counter()
result = []
for index, row in df.iterrows():
    result.append(row['A'] * row['B'])
df['F'] = result  # the column assignment is part of the timed region
end_time = time.perf_counter()
print("\nExample 3:")
print(f"Time taken: {end_time - start_time} seconds")
print(df.head())


Example 3:
Time taken: 0.22836971282958984 seconds
    A         B          C          D          E          F
0  52  0.702325  Category3 2023-01-01  52.702325  36.520899
1  93  0.851969  Category3 2023-01-02  93.851969  79.233141
2  15  0.356609  Category1 2023-01-03  15.356609   5.349134
3  72  0.261397  Category3 2023-01-04  72.261397  18.820607
4  61  0.946599  Category3 2023-01-05  61.946599  57.742513


In [8]:
# Example 4: Use apply() with a lambda function for faster calculation
# "Faster" here is relative to the iterrows() loop: apply(axis=1) still invokes
# the lambda once per row in Python, so a column-wise expression such as
# df['A'] * df['B'] remains the quicker way to get the same values.
# Timing uses time.perf_counter(), the clock intended for measuring elapsed
# durations (time.time() is wall-clock time and can be coarse or adjusted).
start_time = time.perf_counter()
df['G'] = df.apply(lambda row: row['A'] * row['B'], axis=1)
end_time = time.perf_counter()
print("\nExample 4:")
print(f"Time taken: {end_time - start_time} seconds")
print(df.head())


Example 4:
Time taken: 0.05038094520568848 seconds
    A         B          C          D          E          F          G
0  52  0.702325  Category3 2023-01-01  52.702325  36.520899  36.520899
1  93  0.851969  Category3 2023-01-02  93.851969  79.233141  79.233141
2  15  0.356609  Category1 2023-01-03  15.356609   5.349134   5.349134
3  72  0.261397  Category3 2023-01-04  72.261397  18.820607  18.820607
4  61  0.946599  Category3 2023-01-05  61.946599  57.742513  57.742513


In [9]:
# Example 5: Use pd.eval() for faster arithmetic operations
# pd.eval() parses the expression string and evaluates it as a whole; the name
# `df` inside the string is resolved from the calling scope.
# NOTE: parsing adds a fixed overhead, so the benefit shows up mainly on large
# frames and compound expressions; on small frames a plain column-wise
# df['A'] * df['B'] can be just as fast or faster.
# Timing uses time.perf_counter(), the clock intended for measuring elapsed
# durations (time.time() is wall-clock time and can be coarse or adjusted).
start_time = time.perf_counter()
df['H'] = pd.eval('df.A * df.B')
end_time = time.perf_counter()
print("\nExample 5:")
print(f"Time taken: {end_time - start_time} seconds")
print(df.head())


Example 5:
Time taken: 0.011702537536621094 seconds
    A         B          C          D          E          F          G  \
0  52  0.702325  Category3 2023-01-01  52.702325  36.520899  36.520899   
1  93  0.851969  Category3 2023-01-02  93.851969  79.233141  79.233141   
2  15  0.356609  Category1 2023-01-03  15.356609   5.349134   5.349134   
3  72  0.261397  Category3 2023-01-04  72.261397  18.820607  18.820607   
4  61  0.946599  Category3 2023-01-05  61.946599  57.742513  57.742513   

           H  
0  36.520899  
1  79.233141  
2   5.349134  
3  18.820607  
4  57.742513  
