# EX05
## read the fines.csv that you saved in the previous exercise

In [42]:
import pandas as pd
import gc
import timeit

# Загрузка данных
df = pd.read_csv('..\ex04\\fines.csv')
df.head()

Unnamed: 0,CarNumber,Refund,Fines,Make,Model,Year
0,Y163O8161RUS,2,3200.0,Ford,Focus,1989
1,E432XX77RUS,1,6500.0,Toyota,Camry,1995
2,7184TT36RUS,1,2100.0,Ford,Focus,1984
3,X582HE161RUS,2,2000.0,Ford,Focus,2015
4,92918M178RUS,1,5700.0,Ford,Focus,2014


## iterations: in all the following subtasks, you need to calculate fines/refund*year for each row and create a new column with the calculated data and measure the time using the magic command %%timeit in the cell

In [43]:
%%timeit
# 1. Через for + iloc + append
result = []
for i in range(len(df)):
    val = df.iloc[i]['Fines'] / df.iloc[i]['Refund'] * df.iloc[i]['Year']
    result.append(val)
df['loop_calc'] = result

179 ms ± 4.59 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [44]:
%%timeit
# 2. Через iterrows
result = []
for idx, row in df.iterrows():
    val = row['Fines'] / row['Refund'] * row['Year']
    result.append(val)
df['iterrows_calc'] = result

60.4 ms ± 1.21 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [45]:
%%timeit
#  3. Через apply и lambda
df['apply_calc'] = df.apply(lambda row: row['Fines'] / row['Refund'] * row['Year'], axis=1)

12.9 ms ± 402 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [46]:
%%timeit
# 4. Через Series напрямую
df['series_calc'] = df['Fines'] / df['Refund'] * df['Year']

345 µs ± 22.5 µs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)


In [47]:
%%timeit
# 5. Через .values (NumPy)
df['values_calc'] = (df['Fines'].values / df['Refund'].values) * df['Year'].values

130 µs ± 2.17 µs per loop (mean ± std. dev. of 7 runs, 10,000 loops each)


## indexing: measure the time using the magic command %%timeit in the cell

In [48]:
car_number = 'O136HO197RUS'

In [49]:
%%timeit
# Получение строки без индекса
df[df['CarNumber'] == car_number]

407 µs ± 7.74 µs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)


In [50]:
# Установка индекса
df_indexed = df.set_index('CarNumber')

In [51]:
%%timeit
# Получение строки с индексом
df_indexed.loc[car_number]

198 µs ± 2.98 µs per loop (mean ± std. dev. of 7 runs, 10,000 loops each)


## downcasting

In [52]:
# Проверка исходной памяти
df.info(memory_usage='deep')

# Копия датафрейма
df_optimized = df.copy()

# Downcast чисел
for col in df_optimized.select_dtypes(include='float64'):
    df_optimized[col] = pd.to_numeric(df_optimized[col], downcast='float')

for col in df_optimized.select_dtypes(include='int64'):
    df_optimized[col] = pd.to_numeric(df_optimized[col], downcast='integer')

# Проверка новой памяти
df_optimized.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 930 entries, 0 to 929
Data columns (total 11 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   CarNumber      930 non-null    object 
 1   Refund         930 non-null    int64  
 2   Fines          930 non-null    float64
 3   Make           930 non-null    object 
 4   Model          919 non-null    object 
 5   Year           930 non-null    int64  
 6   loop_calc      930 non-null    float64
 7   iterrows_calc  930 non-null    float64
 8   apply_calc     930 non-null    float64
 9   series_calc    930 non-null    float64
 10  values_calc    930 non-null    float64
dtypes: float64(6), int64(2), object(3)
memory usage: 232.9 KB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 930 entries, 0 to 929
Data columns (total 11 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   CarNumber      930 non-null    object 
 1   Refund         930

## categories

In [53]:
for col in df_optimized.select_dtypes(include='object'):
    df_optimized[col] = df_optimized[col].astype('category')

# Снова проверим память
df_optimized.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 930 entries, 0 to 929
Data columns (total 11 columns):
 #   Column         Non-Null Count  Dtype   
---  ------         --------------  -----   
 0   CarNumber      930 non-null    category
 1   Refund         930 non-null    int16   
 2   Fines          930 non-null    float32 
 3   Make           930 non-null    category
 4   Model          919 non-null    category
 5   Year           930 non-null    int16   
 6   loop_calc      930 non-null    float64 
 7   iterrows_calc  930 non-null    float64 
 8   apply_calc     930 non-null    float64 
 9   series_calc    930 non-null    float64 
 10  values_calc    930 non-null    float64 
dtypes: category(3), float32(1), float64(5), int16(2)
memory usage: 101.3 KB


## memory clean

In [55]:
# Удалим исходный df
%reset_selective -f df
gc.collect()

509