# Performance Optimization
### 1. Optimizing Memory Usage (dtype selection, category type)
Choosing appropriate data types and converting columns to category type can save memory.


In [1]:
import pandas as pd
import numpy as np

In [2]:
# Creating a DataFrame
df_memory = pd.DataFrame({
    'id': range(1, 10001),
    'name': np.random.choice(['Alice', 'Bob', 'Charlie'], size=10000),
    'age': np.random.randint(18, 65, size=10000)
})

# Checking initial memory usage
print("Initial memory usage:")
print(df_memory.memory_usage(deep=True))

# Optimizing memory by using categorical dtype
df_memory['name'] = df_memory['name'].astype('category')

print("\nOptimized memory usage:")
print(df_memory.memory_usage(deep=True))

Initial memory usage:
Index       132
id        80000
name     539602
age       40000
dtype: int64

Optimized memory usage:
Index      132
id       80000
name     10270
age      40000
dtype: int64


### 2. Vectorization vs. Loops
Vectorization allows applying operations over entire arrays, leading to performance gains over traditional loops.


In [3]:
import time

# Using a loop to add 10 to each value
df_vector = pd.DataFrame({'values': range(1, 10001)})

# Loop-based approach
start_time = time.time()
df_vector['values_plus_10_loop'] = [x + 10 for x in df_vector['values']]
end_time = time.time()
print(f"Loop execution time: {end_time - start_time} seconds")

# Using vectorization to achieve the same result
start_time = time.time()
df_vector['values_plus_10_vector'] = df_vector['values'] + 10
end_time = time.time()
print(f"Vectorized execution time: {end_time - start_time} seconds")

Loop execution time: 0.007524013519287109 seconds
Vectorized execution time: 0.0 seconds


### 3. Efficient Iteration with iterrows() vs. itertuples()
`itertuples()` is generally faster than `iterrows()` when iterating over rows in a DataFrame.

In [4]:
# Sample DataFrame
df_iteration = pd.DataFrame({
    'A': np.random.randint(1, 100, size=1000),
    'B': np.random.randint(1, 100, size=1000)
})

# Iteration using iterrows()
start_time = time.time()
for index, row in df_iteration.iterrows():
    _ = row['A'] + row['B']
end_time = time.time()
print(f"Iterrows execution time: {end_time - start_time} seconds")

# Iteration using itertuples()
start_time = time.time()
for row in df_iteration.itertuples():
    _ = row.A + row.B
end_time = time.time()
print(f"Itetuples execution time: {end_time - start_time} seconds")

Iterrows execution time: 0.0274050235748291 seconds
Itetuples execution time: 0.0 seconds


### 4. Parallel Processing with Pandas
Using parallel processing to speed up data operations.


In [5]:
from pandarallel import pandarallel

# Initializing pandarallel
pandarallel.initialize()

# Sample DataFrame for parallel processing
df_parallel = pd.DataFrame({
    'A': np.random.randint(1, 100, size=10000)
})

# Function to apply
def compute_square(x):
    return x ** 2

# Using parallel apply
df_parallel['A_squared'] = df_parallel['A'].parallel_apply(compute_square)
df_parallel.head()

INFO: Pandarallel will run on 10 workers.
INFO: Pandarallel will use standard multiprocessing data transfer (pipe) to transfer data between the main process and workers.

https://nalepae.github.io/pandarallel/troubleshooting/


Unnamed: 0,A,A_squared
0,33,1089
1,63,3969
2,57,3249
3,60,3600
4,43,1849


### 5. Optimizing File I/O (chunksize, compression)
Reading and writing large files in chunks and with compression can optimize file operations.

In [6]:
# Writing a large CSV file
df_large = pd.DataFrame(np.random.randn(100000, 5), columns=list('ABCDE'))
df_large.to_csv('data/local/large_file.csv', index=False)

# Reading in chunks
chunk_size = 10000
chunks = pd.read_csv('data/local/large_file.csv', chunksize=chunk_size)

# Processing each chunk
for chunk in chunks:
    print(chunk.head(1))  # Process each chunk as needed

# Writing with compression
df_large.to_csv('data/local/large_file_compressed.csv.gz', index=False, compression='gzip')

          A         B         C         D         E
0 -0.722498  0.538341 -0.095738  1.398888  0.062597
              A         B         C         D         E
10000  0.752074 -0.802763  1.104975 -1.570371 -0.212752
              A         B         C         D         E
20000 -0.676919  0.460245 -2.231921  0.553191  0.256725
              A         B        C         D         E
30000 -1.056872 -0.927621 -3.36556 -1.283336  0.587232
              A         B         C         D         E
40000  0.457738 -1.858737  1.298233  0.475711  0.624194
             A        B         C         D         E
50000 -1.20249  0.42562 -0.497545  0.651716 -3.196837
              A       B         C         D         E
60000 -0.627103 -0.3621  1.088399 -0.946004  0.825941
              A         B         C         D        E
70000  0.541732 -0.972174 -0.559216  1.085184 -0.98544
              A         B         C         D         E
80000  0.244615  0.029862  0.968331  0.536014 -0.202492
            

### 6. Working with Large Datasets in Pandas
Strategies for handling large datasets include using `chunksize`, `category` dtype, and filtering data early.

In [7]:
# Efficient data filtering before loading entire DataFrame
df_filtered = pd.read_csv('data/local/large_file.csv', usecols=['A', 'B'], nrows=50000)
df_filtered.head()

Unnamed: 0,A,B
0,-0.722498,0.538341
1,0.770041,-0.102221
2,-1.999852,1.081321
3,-0.112298,-0.921257
4,-0.845955,0.351747


### 7. Using dask for Scalable DataFrames
`dask` is a parallel computing library that can handle larger-than-memory DataFrames.


In [8]:
import dask.dataframe as dd

# Converting a Pandas DataFrame to a Dask DataFrame
ddf = dd.from_pandas(df_large, npartitions=10)

# Performing operations in parallel with Dask
ddf['sum'] = ddf['A'] + ddf['B']
ddf.compute().head()

Unnamed: 0,A,B,C,D,E,sum
0,-0.722498,0.538341,-0.095738,1.398888,0.062597,-0.184157
1,0.770041,-0.102221,1.012833,0.002186,-0.727072,0.66782
2,-1.999852,1.081321,0.544163,-0.058429,-0.349761,-0.91853
3,-0.112298,-0.921257,-1.053475,-0.882685,-1.336251,-1.033556
4,-0.845955,0.351747,-0.728274,-0.562573,-1.311634,-0.494207


### 8. Speeding Up with NumPy Operations
Using NumPy arrays directly for numerical computations can be faster than Pandas operations.

In [9]:
# Using NumPy for mathematical operations
values = np.random.rand(1000000)

# Calculating square roots using Pandas
start_time = time.time()
df_values = pd.DataFrame(values, columns=['Values'])
df_values['sqrt'] = df_values['Values'] ** 0.5
end_time = time.time()
print(f"Pandas execution time: {end_time - start_time} seconds")

# Calculating square roots using NumPy
start_time = time.time()
sqrt_values = np.sqrt(values)
end_time = time.time()
print(f"NumPy execution time: {end_time - start_time} seconds")

Pandas execution time: 0.011649608612060547 seconds
NumPy execution time: 0.01202249526977539 seconds
