# Enhancing performance with Python

## Libraries and settings

In [1]:
# Libraries
import os
import csv
import random
import numpy as np
import pandas as pd

# Ignore warnings
import warnings
warnings.filterwarnings('ignore')

# Show current working directory
print(os.getcwd())

/Users/sivanujanselvarajah/Documents/zhaw/4.Semester/Scientific programming/scientific_programming/Week_10/exercises


## Create data set with simulated apartment data

In [2]:
def apmt_sim(n_records):
    """Simulates an apartment data set"""

    rooms = np.random.randint(1, 8, size=n_records).astype(int)
    area = (rooms * np.random.randint(25, 45, size=n_records)).astype(int)
    price = (area * np.random.randint(25, 35, size=n_records)).astype(int)

    df = pd.DataFrame(
        {
            "id": list(range(1, n_records+1, 1)),
            "price": price,
            "area": area,
            "rooms": rooms
        }
    )

    return df

# Create data frame with 100'000 records
df = apmt_sim(n_records=10**5)

# Save data to file
df.to_csv('apartment_data_simulated.csv', sep=';', encoding='utf-8')

# Statistics to check values 
print(df.iloc[:,:].describe().round(2))

# Show data frame
df

              id      price       area     rooms
count  100000.00  100000.00  100000.00  100000.0
mean    50000.50    4067.06     137.92       4.0
std     28867.66    2211.97      73.46       2.0
min         1.00     625.00      25.00       1.0
25%     25000.75    2232.00      78.00       2.0
50%     50000.50    3915.00     132.00       4.0
75%     75000.25    5670.00     192.00       6.0
max    100000.00   10472.00     308.00       7.0


Unnamed: 0,id,price,area,rooms
0,1,2706,82,2
1,2,6708,258,6
2,3,1850,74,2
3,4,4080,136,4
4,5,3500,125,5
...,...,...,...,...
99995,99996,5676,172,4
99996,99997,4785,165,5
99997,99998,8064,252,6
99998,99999,1287,39,1


## Function to calculate the price per m2

In [3]:
# Function
def ppm2(price, area):
    """Calculates the price per m2"""
    
    return price / area

## Compare the performance
Note that, in the code below, the magic command <b style="color:blue">%%timeit</b> measures the mean execution time of a cell (-r 1 means 1 run)

### Using a for loop

In [15]:
%%timeit -r 1

# Foor loop with df.iterrows()
price_m2 = []
for index, row in df.iterrows():
    price_m2.append(ppm2(row['price'], row['area']))

# Write new variable to df
df['price_per_m2'] = price_m2

1.89 s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)


### Using column division

In [5]:
%%timeit -r 1

df['price_per_m2'] = df['price'] / df['area']

197 µs ± 0 ns per loop (mean ± std. dev. of 1 run, 1,000 loops each)


### Using .apply() and lambda

In [6]:
%%timeit -r 1

df['price_per_m2'] = df.apply(lambda row: ppm2(row['price'], row['area']), axis=1)

507 ms ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)


### Using .map() and lambda

In [7]:
%%timeit -r 1

df['price_per_m2'] = list(map(ppm2, df['price'], df['area']))

21.7 ms ± 0 ns per loop (mean ± std. dev. of 1 run, 10 loops each)


## Enhancing performance when reading and writing data from/to a file

### Reading data frame from a file using pd.read_csv()

In [8]:
%%timeit -r 1

df_in = pd.read_csv('apartment_data_simulated.csv', sep=';', encoding='utf-8')

22.1 ms ± 0 ns per loop (mean ± std. dev. of 1 run, 10 loops each)


### Reading data frame from a file using open() and csv.reader()

In [9]:
%%timeit -r 1

# Open the file for reading
with open("apartment_data_simulated.csv", 'r') as file:
    csv_reader = csv.reader(file, delimiter=";")

    # This requires that your process rows as you produce them, e.g.:
    #line_count = 0
    #for row in csv_reader:
    #    if line_count == 0:
    #        print(f'Column names are {", ".join(row)}')
    #        line_count += 1
    #    line_count += 1
    #print(f'Number of lines processed: {line_count}.')


19.5 µs ± 0 ns per loop (mean ± std. dev. of 1 run, 100,000 loops each)


### Writing the data frame to a .csv file using pd.to_csv()

In [10]:
%%timeit -r 1

df.to_csv('apartment_data_exported.csv', sep=';', encoding='utf-8')

138 ms ± 0 ns per loop (mean ± std. dev. of 1 run, 10 loops each)


### Writing data to a .csv file using open() and csv.writer()

In [11]:
# Convert data frame to list
lst = df.values.tolist()
lst[:5]

[[1.0, 2706.0, 82.0, 2.0, 33.0],
 [2.0, 6708.0, 258.0, 6.0, 26.0],
 [3.0, 1850.0, 74.0, 2.0, 25.0],
 [4.0, 4080.0, 136.0, 4.0, 30.0],
 [5.0, 3500.0, 125.0, 5.0, 28.0]]

In [12]:
%%timeit -r 1

# Column names
column_names = ['id', 'price', 'area', 'rooms', 'price_per_m2']

# Open a file for writing
with open('apartment_data_exported.csv', 'w', newline='') as file:

    # Create a CSV writer object with a semicolon delimiter
    writer = csv.writer(file, delimiter=';')

    # Write the column names as the first row in the CSV file
    writer.writerow(column_names)

    # Write each sub-list as a row to the CSV file
    for row in lst:
        writer.writerow(row)

121 ms ± 0 ns per loop (mean ± std. dev. of 1 run, 10 loops each)


### Jupyter notebook --footer info-- (please always provide this at the end of each notebook)

In [13]:
import os
import platform
import socket
from platform import python_version
from datetime import datetime

print('-----------------------------------')
print(os.name.upper())
print(platform.system(), '|', platform.release())
print('Datetime:', datetime.now().strftime("%Y-%m-%d %H:%M:%S"))
print('Python Version:', python_version())
print('-----------------------------------')

-----------------------------------
POSIX
Darwin | 23.3.0
Datetime: 2024-04-24 16:43:32
Python Version: 3.10.13
-----------------------------------
