# EX04
## read the JSON file that you saved in ex02

In [7]:
import pandas as pd  
import numpy as np  
import requests

# Render every float in displayed frames with exactly two decimal places
pd.set_option('display.float_format', '{:.2f}'.format)

In [16]:
# Load the source dataset that was saved as JSON in ex02
df = pd.read_json('../ex02/auto.json')

# Structure check: the first rows, then the number of missing values per column
print(df.head(), df.isna().sum(), sep='\n')


      CarNumber  Refund   Fines    Make  Model
0  Y163O8161RUS       2 3200.00    Ford  Focus
1   E432XX77RUS       1 6500.00  Toyota  Camry
2   7184TT36RUS       1 2100.00    Ford  Focus
3  X582HE161RUS       2 2000.00    Ford  Focus
4  92918M178RUS       1 5700.00    Ford  Focus
CarNumber    0
Refund       0
Fines        0
Make         0
Model        9
dtype: int64


## enrich the dataframe using a sample from that dataframe

In [17]:
# Draw 200 rows from the existing data; the fixed random_state makes the draw repeatable
sample = df.sample(random_state=21, n=200)

# Append the sampled rows below the original ones and renumber the index from 0
concat_rows = pd.concat(objs=[df, sample], ignore_index=True)

## enrich the dataframe concat_rows by a new column with the data generated

In [18]:
# Seed the global NumPy RNG so the generated years are the same on every full run
np.random.seed(21)

# One random year per row; `high` is exclusive, so the values fall in 1980..2019
years = np.random.randint(low=1980, high=2020, size=concat_rows.shape[0])
concat_rows['Year'] = years

# Independent copy of the enriched frame; later cells work with `fines`
fines = concat_rows.copy()

## enrich the dataframe with the data from another dataframe

In [22]:
import json

# Read the raw surname file; the code below treats its first entry as the
# column names and every following entry as one record
with open('../../datasets/surname.json', encoding='utf-8') as surnames_file:
    raw_data = json.load(surnames_file)

# Turn the header + records layout into a DataFrame
surnames_data = pd.DataFrame(data=raw_data[1:], columns=raw_data[0])

# Pull the surnames out as a plain Python list
surname_list = surnames_data['NAME'].tolist()

# Strip commas, periods and parentheses from every surname in a single pass
strip_punctuation = str.maketrans('', '', ',.()')
clean_surnames = [name.translate(strip_punctuation) for name in surname_list]

# Preview the first five cleaned surnames
print(clean_surnames[:5])

['ADAMS', 'ALLEN', 'ALVAREZ', 'ANDERSON', 'BAILEY']


In [None]:
# Distinct car numbers that occur in the sampled rows
unique_car_numbers = sample['CarNumber'].unique()

# Draw one surname per car number (with replacement) from a dedicated RNG
# seeded with 21, leaving the global NumPy random state untouched
surname_rng = np.random.RandomState(21)
owners_surnames = pd.Series(
    surname_rng.choice(clean_surnames, size=len(unique_car_numbers), replace=True)
)

# Pair every car number with its drawn surname
owners = pd.DataFrame({'CarNumber': unique_car_numbers, 'SURNAME': owners_surnames})



In [25]:
# Five synthetic fine records for car numbers that do not occur in the source data.
# The numeric columns are drawn from the global NumPy RNG, so re-running this cell
# on its own (without re-seeding) yields different numbers.
additional_fines = pd.DataFrame({
    'CarNumber': ['NEW001', 'NEW002', 'NEW003', 'NEW004', 'NEW005'],
    'Make': ['Toyota', 'Ford', 'BMW', 'Audi', 'Honda'],
    'Model': ['Corolla', 'Focus', 'X5', 'A4', 'Civic'],
    'Refund': np.random.randint(100, 1000, size=5),
    'Fines': np.random.randint(10, 300, size=5),
    'Year': np.random.randint(1980, 2020, size=5)
})

# Append to the enriched base frame `concat_rows` (which `fines` was copied from)
# instead of to `fines` itself: the result is the same on a fresh run, but
# re-running this cell no longer stacks another five NEW00x rows onto `fines`.
fines = pd.concat([concat_rows, additional_fines], ignore_index=True)

In [26]:
# Rebuild the untouched owners table from its inputs so that this cell is
# idempotent: trimming `owners` in place would drop 20 more rows and append
# duplicate EXTRA00x rows every time the cell is re-run.
owners_base = pd.DataFrame({
    'CarNumber': unique_car_numbers,
    'SURNAME': owners_surnames
})

# Three new owners whose car numbers are not among the sampled ones
additional_owners = pd.DataFrame({
    'CarNumber': ['EXTRA001', 'EXTRA002', 'EXTRA003'],
    'SURNAME': ['Walker', 'Barnes', 'Murphy']
})

# Drop the last 20 original owners, then append the new ones
owners = pd.concat([owners_base.iloc[:-20], additional_owners], ignore_index=True)


In [27]:
# Option 1 (inner join): only the car numbers present in both dataframes
df_inner = fines.merge(owners, how='inner', on='CarNumber')

# Option 2 (outer join): every car number present in at least one dataframe
df_outer = fines.merge(owners, how='outer', on='CarNumber')

# Option 3 (left join): every car number from fines
df_left = fines.merge(owners, how='left', on='CarNumber')

# Option 4 (right join): every car number from owners
df_right = fines.merge(owners, how='right', on='CarNumber')

In [30]:
# Total fines per (Make, Model) row, with one column per year
pivot_table = pd.pivot_table(
    fines,
    index=['Make', 'Model'],
    columns='Year',
    values='Fines',
    aggfunc='sum',
)

display(pivot_table)


Unnamed: 0_level_0,Year,1980,1981,1982,1983,1984,1985,1986,1987,1988,1989,...,2010,2011,2012,2013,2014,2015,2016,2017,2018,2019
Make,Model,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
Audi,A4,,206.0,,,,,,,,,...,,,,,,,,,,
BMW,X5,,,,,,,,,,,...,,,,,,,,,,
Ford,Focus,110294.59,408983.76,165883.76,64800.0,96989.17,162683.76,96589.17,125700.0,111789.17,176094.59,...,142678.35,103478.35,97100.0,139674.59,122678.35,209100.0,98089.17,263000.0,274089.17,78889.17
Ford,Mondeo,,,,,,,,,,8600.0,...,,,34400.0,,,,46200.0,,,
Honda,Civic,,,,,,,,214.0,,,...,,,,,,,,,,
Skoda,Octavia,2400.0,,7300.0,11594.59,,10294.59,600.0,5200.0,5200.0,91400.0,...,3100.0,500.0,500.0,12594.59,300.0,46394.59,300.0,8594.59,156200.0,9500.0
Toyota,Camry,12000.0,8594.59,,7200.0,,,,,,22400.0,...,,,8594.59,,1000.0,,,,21594.59,18100.0
Toyota,Corolla,,,2257.0,,,,,14900.0,,4000.0,...,24000.0,8594.59,30300.0,,,,900.0,9600.0,7600.0,
Volkswagen,Golf,30900.0,,,8594.59,300.0,24000.0,,44800.0,,5800.0,...,,300.0,,20800.0,,2300.0,,,1000.0,
Volkswagen,Jetta,,,,,,,,,,,...,,,,,,,,,,


## save both the fines and owners dataframes to CSV files without an index

In [31]:
# Write each dataframe to its own CSV file, leaving out the index column
for frame, csv_name in ((fines, 'fines.csv'), (owners, 'owners.csv')):
    frame.to_csv(csv_name, index=False)