In [1]:
import pandas as pd
import numpy as np
import time

In [5]:
# Example 1: Generate a large DataFrame with random data
np.random.seed(42)
num_rows = 10000
data = {
    'A': np.random.randint(1, 100, num_rows),
    'B': np.random.rand(num_rows),
    'C': np.random.choice(['Category1', 'Category2', 'Category3'], num_rows),
    'D': pd.date_range(start='2023-01-01', periods=num_rows, freq='D')
}
df = pd.DataFrame(data)
print("Example 1:")
print(df.head())

Example 1:
    A         B          C          D
0  52  0.702325  Category3 2023-01-01
1  93  0.851969  Category3 2023-01-02
2  15  0.356609  Category1 2023-01-03
3  72  0.261397  Category3 2023-01-04
4  61  0.946599  Category3 2023-01-05


In [6]:
# Example 2: Use vectorized operations for calculations
start_time = time.time()
df['E'] = df['A'] + df['B']
end_time = time.time()
print("\nExample 2:")
print(f"Time taken: {end_time - start_time} seconds")
print(df.head())


Example 2:
Time taken: 0.0009992122650146484 seconds
    A         B          C          D          E
0  52  0.702325  Category3 2023-01-01  52.702325
1  93  0.851969  Category3 2023-01-02  93.851969
2  15  0.356609  Category1 2023-01-03  15.356609
3  72  0.261397  Category3 2023-01-04  72.261397
4  61  0.946599  Category3 2023-01-05  61.946599


In [7]:
# Example 3: Avoid using iterrows() for iteration
start_time = time.time()
result = []
for index, row in df.iterrows():
    result.append(row['A'] * row['B'])
df['F'] = result
end_time = time.time()
print("\nExample 3:")
print(f"Time taken: {end_time - start_time} seconds")
print(df.head())


Example 3:
Time taken: 0.22836971282958984 seconds
    A         B          C          D          E          F
0  52  0.702325  Category3 2023-01-01  52.702325  36.520899
1  93  0.851969  Category3 2023-01-02  93.851969  79.233141
2  15  0.356609  Category1 2023-01-03  15.356609   5.349134
3  72  0.261397  Category3 2023-01-04  72.261397  18.820607
4  61  0.946599  Category3 2023-01-05  61.946599  57.742513


In [8]:
# Example 4: Use apply() with a lambda function for faster calculation
start_time = time.time()
df['G'] = df.apply(lambda row: row['A'] * row['B'], axis=1)
end_time = time.time()
print("\nExample 4:")
print(f"Time taken: {end_time - start_time} seconds")
print(df.head())


Example 4:
Time taken: 0.05038094520568848 seconds
    A         B          C          D          E          F          G
0  52  0.702325  Category3 2023-01-01  52.702325  36.520899  36.520899
1  93  0.851969  Category3 2023-01-02  93.851969  79.233141  79.233141
2  15  0.356609  Category1 2023-01-03  15.356609   5.349134   5.349134
3  72  0.261397  Category3 2023-01-04  72.261397  18.820607  18.820607
4  61  0.946599  Category3 2023-01-05  61.946599  57.742513  57.742513


In [9]:
# Example 5: Use pd.eval() for faster arithmetic operations
start_time = time.time()
df['H'] = pd.eval('df.A * df.B')
end_time = time.time()
print("\nExample 5:")
print(f"Time taken: {end_time - start_time} seconds")
print(df.head())


Example 5:
Time taken: 0.011702537536621094 seconds
    A         B          C          D          E          F          G  \
0  52  0.702325  Category3 2023-01-01  52.702325  36.520899  36.520899   
1  93  0.851969  Category3 2023-01-02  93.851969  79.233141  79.233141   
2  15  0.356609  Category1 2023-01-03  15.356609   5.349134   5.349134   
3  72  0.261397  Category3 2023-01-04  72.261397  18.820607  18.820607   
4  61  0.946599  Category3 2023-01-05  61.946599  57.742513  57.742513   

           H  
0  36.520899  
1  79.233141  
2   5.349134  
3  18.820607  
4  57.742513  


In [10]:
# Example 6: Use eval() with column-wise expression
start_time = time.time()
df['I'] = df.eval('A * B')
end_time = time.time()
print("\nExample 6:")
print(f"Time taken: {end_time - start_time} seconds")
print(df.head())


Example 6:
Time taken: 0.002666473388671875 seconds
    A         B          C          D          E          F          G  \
0  52  0.702325  Category3 2023-01-01  52.702325  36.520899  36.520899   
1  93  0.851969  Category3 2023-01-02  93.851969  79.233141  79.233141   
2  15  0.356609  Category1 2023-01-03  15.356609   5.349134   5.349134   
3  72  0.261397  Category3 2023-01-04  72.261397  18.820607  18.820607   
4  61  0.946599  Category3 2023-01-05  61.946599  57.742513  57.742513   

           H          I  
0  36.520899  36.520899  
1  79.233141  79.233141  
2   5.349134   5.349134  
3  18.820607  18.820607  
4  57.742513  57.742513  


In [11]:
# Example 7: Use numpy functions for complex calculations
start_time = time.time()
df['J'] = np.where(df['A'] > 50, df['A'] * df['B'], df['A'] / df['B'])
end_time = time.time()
print("\nExample 7:")
print(f"Time taken: {end_time - start_time} seconds")
print(df.head())


Example 7:
Time taken: 0.0013043880462646484 seconds
    A         B          C          D          E          F          G  \
0  52  0.702325  Category3 2023-01-01  52.702325  36.520899  36.520899   
1  93  0.851969  Category3 2023-01-02  93.851969  79.233141  79.233141   
2  15  0.356609  Category1 2023-01-03  15.356609   5.349134   5.349134   
3  72  0.261397  Category3 2023-01-04  72.261397  18.820607  18.820607   
4  61  0.946599  Category3 2023-01-05  61.946599  57.742513  57.742513   

           H          I          J  
0  36.520899  36.520899  36.520899  
1  79.233141  79.233141  79.233141  
2   5.349134   5.349134  42.062886  
3  18.820607  18.820607  18.820607  
4  57.742513  57.742513  57.742513  


In [12]:
# Example 8: Select columns using loc[] instead of chained indexing
start_time = time.time()
df['K'] = df.loc[df['A'] > 50, 'A'] * df.loc[df['A'] > 50, 'B']
end_time = time.time()
print("\nExample 8:")
print(f"Time taken: {end_time - start_time} seconds")
print(df.head())


Example 8:
Time taken: 0.002252340316772461 seconds
    A         B          C          D          E          F          G  \
0  52  0.702325  Category3 2023-01-01  52.702325  36.520899  36.520899   
1  93  0.851969  Category3 2023-01-02  93.851969  79.233141  79.233141   
2  15  0.356609  Category1 2023-01-03  15.356609   5.349134   5.349134   
3  72  0.261397  Category3 2023-01-04  72.261397  18.820607  18.820607   
4  61  0.946599  Category3 2023-01-05  61.946599  57.742513  57.742513   

           H          I          J          K  
0  36.520899  36.520899  36.520899  36.520899  
1  79.233141  79.233141  79.233141  79.233141  
2   5.349134   5.349134  42.062886        NaN  
3  18.820607  18.820607  18.820607  18.820607  
4  57.742513  57.742513  57.742513  57.742513  


In [13]:
# Example 9: Use DataFrame.query() for better performance
start_time = time.time()
df['L'] = df.query('A > 50')['A'] * df.query('A > 50')['B']
end_time = time.time()
print("\nExample 9:")
print(f"Time taken: {end_time - start_time} seconds")
print(df.head())


Example 9:
Time taken: 0.005530595779418945 seconds
    A         B          C          D          E          F          G  \
0  52  0.702325  Category3 2023-01-01  52.702325  36.520899  36.520899   
1  93  0.851969  Category3 2023-01-02  93.851969  79.233141  79.233141   
2  15  0.356609  Category1 2023-01-03  15.356609   5.349134   5.349134   
3  72  0.261397  Category3 2023-01-04  72.261397  18.820607  18.820607   
4  61  0.946599  Category3 2023-01-05  61.946599  57.742513  57.742513   

           H          I          J          K          L  
0  36.520899  36.520899  36.520899  36.520899  36.520899  
1  79.233141  79.233141  79.233141  79.233141  79.233141  
2   5.349134   5.349134  42.062886        NaN        NaN  
3  18.820607  18.820607  18.820607  18.820607  18.820607  
4  57.742513  57.742513  57.742513  57.742513  57.742513  


In [14]:
# Example 10: Sort DataFrame for faster merging
start_time = time.time()
df1 = df.sample(frac=0.5).sort_values(by='A')
df2 = df.sample(frac=0.5).sort_values(by='A')
merged_df = pd.merge(df1, df2, on='A')
end_time = time.time()
print("\nExample 10:")
print(f"Time taken: {end_time - start_time} seconds")
print(merged_df.head())


Example 10:
Time taken: 0.04434347152709961 seconds
   A       B_x        C_x        D_x       E_x       F_x       G_x       H_x  \
0  1  0.781933  Category2 2023-05-25  1.781933  0.781933  0.781933  0.781933   
1  1  0.781933  Category2 2023-05-25  1.781933  0.781933  0.781933  0.781933   
2  1  0.781933  Category2 2023-05-25  1.781933  0.781933  0.781933  0.781933   
3  1  0.781933  Category2 2023-05-25  1.781933  0.781933  0.781933  0.781933   
4  1  0.781933  Category2 2023-05-25  1.781933  0.781933  0.781933  0.781933   

        I_x       J_x  ...        C_y        D_y       E_y       F_y  \
0  0.781933  1.278883  ...  Category1 2025-08-03  1.089520  0.089520   
1  0.781933  1.278883  ...  Category3 2029-07-28  1.723609  0.723609   
2  0.781933  1.278883  ...  Category3 2026-08-18  1.391968  0.391968   
3  0.781933  1.278883  ...  Category3 2026-07-29  1.338750  0.338750   
4  0.781933  1.278883  ...  Category3 2041-02-17  1.099019  0.099019   

        G_y       H_y       I_y  

In [15]:
# Example 11: Use DataFrame.merge() with sorted DataFrames
start_time = time.time()
df1 = df.sample(frac=0.5).sort_values(by='A')
df2 = df.sample(frac=0.5).sort_values(by='A')
merged_df = df1.merge(df2, on='A')
end_time = time.time()
print("\nExample 11:")
print(f"Time taken: {end_time - start_time} seconds")
print(merged_df.head())


Example 11:
Time taken: 0.03684854507446289 seconds
   A      B_x        C_x        D_x      E_x      F_x      G_x      H_x  \
0  1  0.26936  Category2 2026-08-28  1.26936  0.26936  0.26936  0.26936   
1  1  0.26936  Category2 2026-08-28  1.26936  0.26936  0.26936  0.26936   
2  1  0.26936  Category2 2026-08-28  1.26936  0.26936  0.26936  0.26936   
3  1  0.26936  Category2 2026-08-28  1.26936  0.26936  0.26936  0.26936   
4  1  0.26936  Category2 2026-08-28  1.26936  0.26936  0.26936  0.26936   

       I_x       J_x  ...        C_y        D_y       E_y       F_y       G_y  \
0  0.26936  3.712508  ...  Category2 2043-06-26  1.022902  0.022902  0.022902   
1  0.26936  3.712508  ...  Category1 2043-05-16  1.450944  0.450944  0.450944   
2  0.26936  3.712508  ...  Category2 2026-01-09  1.733153  0.733153  0.733153   
3  0.26936  3.712508  ...  Category1 2023-07-10  1.585561  0.585561  0.585561   
4  0.26936  3.712508  ...  Category2 2038-12-08  1.213896  0.213896  0.213896   

        H

In [16]:
# Example 12: Set DataFrame index for faster lookups
df_indexed = df.set_index('A')
start_time = time.time()
result = df_indexed.loc[75, 'B']
end_time = time.time()
print("\nExample 12:")
print(f"Time taken: {end_time - start_time} seconds")
print(result)


Example 12:
Time taken: 0.0010249614715576172 seconds
A
75    0.796452
75    0.547063
75    0.430941
75    0.406406
75    0.287241
        ...   
75    0.457294
75    0.414650
75    0.581860
75    0.313325
75    0.310237
Name: B, Length: 104, dtype: float64


In [17]:
# Example 13: Use DataFrame.loc for label-based indexing
start_time = time.time()
result = df.loc[df['A'] == 75, 'B'].values[0]
end_time = time.time()
print("\nExample 13:")
print(f"Time taken: {end_time - start_time} seconds")
print(result)



Example 13:
Time taken: 0.0 seconds
0.7964517695378094


In [18]:
# Example 14: Use DataFrame.at for single value assignment
start_time = time.time()
df.at[0, 'A'] = 99
end_time = time.time()
print("\nExample 14:")
print(f"Time taken: {end_time - start_time} seconds")
print(df.head())


Example 14:
Time taken: 0.0 seconds
    A         B          C          D          E          F          G  \
0  99  0.702325  Category3 2023-01-01  52.702325  36.520899  36.520899   
1  93  0.851969  Category3 2023-01-02  93.851969  79.233141  79.233141   
2  15  0.356609  Category1 2023-01-03  15.356609   5.349134   5.349134   
3  72  0.261397  Category3 2023-01-04  72.261397  18.820607  18.820607   
4  61  0.946599  Category3 2023-01-05  61.946599  57.742513  57.742513   

           H          I          J          K          L  
0  36.520899  36.520899  36.520899  36.520899  36.520899  
1  79.233141  79.233141  79.233141  79.233141  79.233141  
2   5.349134   5.349134  42.062886        NaN        NaN  
3  18.820607  18.820607  18.820607  18.820607  18.820607  
4  57.742513  57.742513  57.742513  57.742513  57.742513  


In [19]:
# Example 15: Use DataFrame.iat for single value access
start_time = time.time()
result = df.iat[0, 0]
end_time = time.time()
print("\nExample 15:")
print(f"Time taken: {end_time - start_time} seconds")
print(result)


Example 15:
Time taken: 0.0 seconds
99


In [20]:
# Example 16: Avoid using append() in a loop
start_time = time.time()
dfs = [df.sample(frac=0.1) for _ in range(10)]
concat_df = pd.concat(dfs)
end_time = time.time()
print("\nExample 16:")
print(f"Time taken: {end_time - start_time} seconds")
print(concat_df.head())


Example 16:
Time taken: 0.006987571716308594 seconds
       A         B          C          D          E          F          G  \
3258  60  0.779068  Category1 2031-12-03  60.779068  46.744051  46.744051   
4575  93  0.968431  Category3 2035-07-12  93.968431  90.064109  90.064109   
490   61  0.887542  Category1 2024-05-05  61.887542  54.140056  54.140056   
6531  42  0.155313  Category3 2040-11-18  42.155313   6.523147   6.523147   
2087  31  0.357470  Category3 2028-09-18  31.357470  11.081559  11.081559   

              H          I           J          K          L  
3258  46.744051  46.744051   46.744051  46.744051  46.744051  
4575  90.064109  90.064109   90.064109  90.064109  90.064109  
490   54.140056  54.140056   54.140056  54.140056  54.140056  
6531   6.523147   6.523147  270.421630        NaN        NaN  
2087  11.081559  11.081559   86.720650        NaN        NaN  


In [21]:
# Example 17: Use DataFrame constructor with a list of DataFrames
start_time = time.time()
dfs = [df.sample(frac=0.1) for _ in range(10)]
concat_df = pd.DataFrame(pd.concat(dfs))
end_time = time.time()
print("\nExample 17:")
print(f"Time taken: {end_time - start_time} seconds")
print(concat_df.head())


Example 17:
Time taken: 0.006115913391113281 seconds
       A         B          C          D          E          F          G  \
2836  78  0.791197  Category1 2030-10-07  78.791197  61.713379  61.713379   
2526  31  0.642320  Category1 2029-12-01  31.642320  19.911909  19.911909   
4408  78  0.990266  Category1 2035-01-26  78.990266  77.240720  77.240720   
3972  34  0.388656  Category1 2033-11-16  34.388656  13.214304  13.214304   
9208  80  0.650773  Category1 2048-03-18  80.650773  52.061871  52.061871   

              H          I          J          K          L  
2836  61.713379  61.713379  61.713379  61.713379  61.713379  
2526  19.911909  19.911909  48.262574        NaN        NaN  
4408  77.240720  77.240720  77.240720  77.240720  77.240720  
3972  13.214304  13.214304  87.480960        NaN        NaN  
9208  52.061871  52.061871  52.061871  52.061871  52.061871  


In [22]:
# Example 18: Avoid using inplace=True with DataFrame methods
start_time = time.time()
df_copy = df.copy()
df_copy['M'] = df_copy['A'] * df_copy['B']
end_time = time.time()
print("\nExample 18:")
print(f"Time taken: {end_time - start_time} seconds")
print(df_copy.head())


Example 18:
Time taken: 0.001247406005859375 seconds
    A         B          C          D          E          F          G  \
0  99  0.702325  Category3 2023-01-01  52.702325  36.520899  36.520899   
1  93  0.851969  Category3 2023-01-02  93.851969  79.233141  79.233141   
2  15  0.356609  Category1 2023-01-03  15.356609   5.349134   5.349134   
3  72  0.261397  Category3 2023-01-04  72.261397  18.820607  18.820607   
4  61  0.946599  Category3 2023-01-05  61.946599  57.742513  57.742513   

           H          I          J          K          L          M  
0  36.520899  36.520899  36.520899  36.520899  36.520899  69.530173  
1  79.233141  79.233141  79.233141  79.233141  79.233141  79.233141  
2   5.349134   5.349134  42.062886        NaN        NaN   5.349134  
3  18.820607  18.820607  18.820607  18.820607  18.820607  18.820607  
4  57.742513  57.742513  57.742513  57.742513  57.742513  57.742513  


In [23]:
# Example 19: Use a dictionary to replace values in a column
start_time = time.time()
replace_dict = {1: 'One', 2: 'Two', 3: 'Three'}
df['C'] = df['C'].replace(replace_dict)
end_time = time.time()
print("\nExample 19:")
print(f"Time taken: {end_time - start_time} seconds")
print(df.head())


Example 19:
Time taken: 0.0020051002502441406 seconds
    A         B          C          D          E          F          G  \
0  99  0.702325  Category3 2023-01-01  52.702325  36.520899  36.520899   
1  93  0.851969  Category3 2023-01-02  93.851969  79.233141  79.233141   
2  15  0.356609  Category1 2023-01-03  15.356609   5.349134   5.349134   
3  72  0.261397  Category3 2023-01-04  72.261397  18.820607  18.820607   
4  61  0.946599  Category3 2023-01-05  61.946599  57.742513  57.742513   

           H          I          J          K          L  
0  36.520899  36.520899  36.520899  36.520899  36.520899  
1  79.233141  79.233141  79.233141  79.233141  79.233141  
2   5.349134   5.349134  42.062886        NaN        NaN  
3  18.820607  18.820607  18.820607  18.820607  18.820607  
4  57.742513  57.742513  57.742513  57.742513  57.742513  


In [24]:
# Example 20: Avoid using loc for element-wise replacements
start_time = time.time()
df.loc[df['A'] % 2 == 0, 'C'] = 'Even'
end_time = time.time()
print("\nExample 20:")
print(f"Time taken: {end_time - start_time} seconds")
print(df.head())


Example 20:
Time taken: 0.0015075206756591797 seconds
    A         B          C          D          E          F          G  \
0  99  0.702325  Category3 2023-01-01  52.702325  36.520899  36.520899   
1  93  0.851969  Category3 2023-01-02  93.851969  79.233141  79.233141   
2  15  0.356609  Category1 2023-01-03  15.356609   5.349134   5.349134   
3  72  0.261397       Even 2023-01-04  72.261397  18.820607  18.820607   
4  61  0.946599  Category3 2023-01-05  61.946599  57.742513  57.742513   

           H          I          J          K          L  
0  36.520899  36.520899  36.520899  36.520899  36.520899  
1  79.233141  79.233141  79.233141  79.233141  79.233141  
2   5.349134   5.349134  42.062886        NaN        NaN  
3  18.820607  18.820607  18.820607  18.820607  18.820607  
4  57.742513  57.742513  57.742513  57.742513  57.742513  
