In [48]:
import pandas as pd
import numpy as np
import requests

# Exercise 04. Enrichment and transformations

## Task 1. Read the JSON file that you saved in ex02

In [49]:
# Render floats with two decimals in every frame displayed below.
pd.set_option('display.float_format', '{:.2f}'.format)

# Load the records-oriented JSON file into the base dataframe.
auto_json_path = '../data/auto.json'
df = pd.read_json(auto_json_path, orient='records')
df

Unnamed: 0,CarNumber,Refund,Fines,Make,Model
0,Y163O8161RUS,2,3200.00,Ford,Focus
1,E432XX77RUS,1,6500.00,Toyota,Camry
2,7184TT36RUS,1,2100.00,Ford,Focus
3,X582HE161RUS,2,2000.00,Ford,Focus
4,92918M178RUS,1,5700.00,Ford,Focus
...,...,...,...,...,...
720,Y163O8161RUS,2,1600.00,Ford,Focus
721,M0309X197RUS,1,22300.00,Ford,Focus
722,O673E8197RUS,2,600.00,Ford,Focus
723,8610T8154RUS,1,2000.00,Ford,Focus


## Task 2. Enrich the dataframe using a sample from that dataframe

In [50]:
# Draw 200 existing rows (with replacement) so the extra observations reuse
# whole rows of df and introduce no new CarNumber/Make/Model combinations.
#
# The previous version of this cell additionally re-assigned 'Refund' and
# 'Fines' from df[col].sample(n=200, replace=True, random_state=21). With the
# same n/replace/random_state on an object of the same length, those draws
# select exactly the rows already picked by df.sample(...), so the assignments
# changed nothing (and would have failed on duplicate index labels had the
# indexes ever differed). They are dropped here; the result is unchanged.
new_df = df.sample(n=200, replace=True, random_state=21)

# Stack the sample under the original rows and renumber the index 0..n-1.
concat_rows = pd.concat([df, new_df], ignore_index=True)
concat_rows.count()

CarNumber    925
Refund       925
Fines        925
Make         925
Model        914
dtype: int64

## Task 3. Enrich the concat_rows dataframe with a new column containing generated data

In [51]:
# Seed NumPy's global RNG so the generated years are reproducible.
np.random.seed(42)

# One random year per row. np.random.randint excludes the upper bound, so the
# generated values fall in 1980..2018.
year_values = np.random.randint(1980, 2019, size=len(concat_rows))
years = pd.Series(year_values, name='Year')

# join() aligns on the index; concat_rows was built with ignore_index=True and
# the new Series has a default index, so rows pair up positionally.
fines = concat_rows.join(years)
fines.count()

CarNumber    925
Refund       925
Fines        925
Make         925
Model        914
Year         925
dtype: int64

## Task 4. Enrich the dataframe with data from another dataframe

In [52]:
# Load the surname data; the first row of the loaded frame is used as the
# header, and the remaining rows are the actual records.
surnames_raw = pd.read_json('../../datasets/surname.json')
header_row = surnames_raw.iloc[0]
surnames = surnames_raw.iloc[1:].reset_index(drop=True)
surnames.columns = header_row

# Coerce the count/rank columns to numbers; unparseable values become NaN.
for numeric_col in ('COUNT', 'RANK'):
    surnames[numeric_col] = pd.to_numeric(surnames[numeric_col], errors='coerce')
surnames

Unnamed: 0,NAME,COUNT,RANK
0,ADAMS,427865,42
1,ALLEN,482607,33
2,ALVAREZ,233983,92
3,ANDERSON,784404,15
4,BAILEY,277845,72
...,...,...,...
95,WILLIAMS,1625252,3
96,WILSON,801882,14
97,WOOD,250715,84
98,WRIGHT,458980,35


In [53]:
# Build one owner record per distinct car number found in fines, assigning
# each a surname drawn at random (with replacement) from the surnames table.
np.random.seed(21)  # fixed seed keeps the surname assignment reproducible
unique_car_numbers = fines['CarNumber'].unique()
random_surnames = np.random.choice(
    surnames['NAME'],
    size=len(unique_car_numbers),
    replace=True,
)
owners = pd.DataFrame({'CarNumber': unique_car_numbers, 'SURNAME': random_surnames})
owners

Unnamed: 0,CarNumber,SURNAME
0,Y163O8161RUS,RICHARDSON
1,E432XX77RUS,ROSS
2,7184TT36RUS,MORGAN
3,X582HE161RUS,BAILEY
4,92918M178RUS,LOPEZ
...,...,...
526,O136HO197RUS,CAMPBELL
527,O22097197RUS,HALL
528,M0309X197RUS,BAKER
529,O673E8197RUS,DIAZ


In [54]:
# Five hand-written observations, one tuple per row, appended to fines.
new_fines = pd.DataFrame(
    [
        ('2727271RUS', 2.0, 8500.0, 'Lada', 'Vesta', 2015),
        ('2627271RUS', 2.0, 1800.0, 'Kia', 'Rio', 2018),
        ('2527271RUS', 1.0, 3200.0, 'Hyundai', 'Solaris', 2019),
        ('2427271RUS', 2.0, 1500.0, 'Skoda', 'Octavia', 2016),
        ('2327271RUS', 1.0, 1300.0, 'Renault', 'Logan', 2017),
    ],
    columns=['CarNumber', 'Refund', 'Fines', 'Make', 'Model', 'Year'],
)

# NOTE: this rebinds `fines`, so re-running the cell appends the rows again.
fines = pd.concat([fines, new_fines], ignore_index=True)
fines

Unnamed: 0,CarNumber,Refund,Fines,Make,Model,Year
0,Y163O8161RUS,2.00,3200.00,Ford,Focus,2018
1,E432XX77RUS,1.00,6500.00,Toyota,Camry,2008
2,7184TT36RUS,1.00,2100.00,Ford,Focus,1994
3,X582HE161RUS,2.00,2000.00,Ford,Focus,1987
4,92918M178RUS,1.00,5700.00,Ford,Focus,2000
...,...,...,...,...,...,...
925,2727271RUS,2.00,8500.00,Lada,Vesta,2015
926,2627271RUS,2.00,1800.00,Kia,Rio,2018
927,2527271RUS,1.00,3200.00,Hyundai,Solaris,2019
928,2427271RUS,2.00,1500.00,Skoda,Octavia,2016


In [None]:
# Drop the last 20 owners so that some car numbers in fines have no owner.
# NOTE: this rebinds `owners`, so the cell is not idempotent: re-running it
# removes another 20 rows (including the three appended below). Re-run the
# cell that builds `owners` first if this cell has to be executed again.
owners = owners.iloc[:-20]

# Three owners whose car numbers are not taken from fines.
# Fix: the 'CarNumber' key was missing its closing quote, which made the whole
# cell a SyntaxError.
new_owners = pd.DataFrame({
    'CarNumber': ['66666666RUS', '00MEOW00RUS', '123O505RUS'],
    'SURNAME': ['Surname1', 'Surname2', 'Surname3']
})

owners = pd.concat([owners, new_owners], ignore_index=True)
owners

Unnamed: 0,CarNumber,SURNAME
0,Y163O8161RUS,RICHARDSON
1,E432XX77RUS,ROSS
2,7184TT36RUS,MORGAN
3,X582HE161RUS,BAILEY
4,92918M178RUS,LOPEZ
...,...,...
509,O50197197RUS,WRIGHT
510,7608EE777RUS,HILL
511,66666666RUS,Surname1
512,00MEOW00RUS,Surname2


In [56]:
# Inner join: keep only the car numbers present in both fines and owners.
fines.merge(owners, on='CarNumber', how='inner')

Unnamed: 0,CarNumber,Refund,Fines,Make,Model,Year,SURNAME
0,Y163O8161RUS,2.00,3200.00,Ford,Focus,2018,RICHARDSON
1,E432XX77RUS,1.00,6500.00,Toyota,Camry,2008,ROSS
2,7184TT36RUS,1.00,2100.00,Ford,Focus,1994,MORGAN
3,X582HE161RUS,2.00,2000.00,Ford,Focus,1987,BAILEY
4,92918M178RUS,1.00,5700.00,Ford,Focus,2000,LOPEZ
...,...,...,...,...,...,...,...
898,M942OT152RUS,1.00,2000.00,Ford,Focus,2016,ALVAREZ
899,Y187O8161RUS,2.00,400.00,Ford,Focus,2006,COOK
900,7064C8197RUS,1.00,12800.00,Volkswagen,Passat,2012,DAVIS
901,8437XX154RUS,2.00,800.00,Ford,Focus,1983,HALL


In [57]:
# Outer join: keep car numbers from either frame; the columns of the side
# without a match are filled with NaN.
fines.merge(owners, on='CarNumber', how='outer')

Unnamed: 0,CarNumber,Refund,Fines,Make,Model,Year,SURNAME
0,00MEOW00RUS,,,,,,Surname2
1,123O505RUS,,,,,,Surname3
2,2327271RUS,1.00,1300.00,Renault,Logan,2017.00,
3,2427271RUS,2.00,1500.00,Skoda,Octavia,2016.00,
4,2527271RUS,1.00,3200.00,Hyundai,Solaris,2019.00,
...,...,...,...,...,...,...,...
928,Y969O8197RUS,2.00,7800.00,Ford,Focus,2013.00,LOPEZ
929,Y973O8197RUS,2.00,8594.59,Ford,Focus,2014.00,YOUNG
930,Y973O8197RUS,1.00,34800.00,Ford,Focus,1981.00,YOUNG
931,Y973O8197RUS,1.00,69600.00,Ford,Focus,2008.00,YOUNG


In [58]:
# Left join: keep every row of fines; SURNAME is NaN where no owner matches.
fines.merge(owners, on='CarNumber', how='left')

Unnamed: 0,CarNumber,Refund,Fines,Make,Model,Year,SURNAME
0,Y163O8161RUS,2.00,3200.00,Ford,Focus,2018,RICHARDSON
1,E432XX77RUS,1.00,6500.00,Toyota,Camry,2008,ROSS
2,7184TT36RUS,1.00,2100.00,Ford,Focus,1994,MORGAN
3,X582HE161RUS,2.00,2000.00,Ford,Focus,1987,BAILEY
4,92918M178RUS,1.00,5700.00,Ford,Focus,2000,LOPEZ
...,...,...,...,...,...,...,...
925,2727271RUS,2.00,8500.00,Lada,Vesta,2015,
926,2627271RUS,2.00,1800.00,Kia,Rio,2018,
927,2527271RUS,1.00,3200.00,Hyundai,Solaris,2019,
928,2427271RUS,2.00,1500.00,Skoda,Octavia,2016,


In [59]:
# Right join: keep every owner; the fines columns are NaN for owners whose
# car number does not appear in fines.
fines.merge(owners, on='CarNumber', how='right')

Unnamed: 0,CarNumber,Refund,Fines,Make,Model,Year,SURNAME
0,Y163O8161RUS,2.00,3200.00,Ford,Focus,2018.00,RICHARDSON
1,Y163O8161RUS,2.00,1600.00,Ford,Focus,1997.00,RICHARDSON
2,Y163O8161RUS,2.00,3200.00,Ford,Focus,2010.00,RICHARDSON
3,Y163O8161RUS,2.00,3200.00,Ford,Focus,1985.00,RICHARDSON
4,Y163O8161RUS,2.00,1600.00,Ford,Focus,2009.00,RICHARDSON
...,...,...,...,...,...,...,...
901,O50197197RUS,2.00,7800.00,Ford,Focus,2009.00,WRIGHT
902,7608EE777RUS,1.00,4000.00,Skoda,Octavia,1996.00,HILL
903,66666666RUS,,,,,,Surname1
904,00MEOW00RUS,,,,,,Surname2


## Task 5. Create a pivot table from the fines dataframe

In [60]:
# Total fines per (Make, Model) row and per Year column; combinations with no
# observations show up as NaN.
pd.pivot_table(
    fines,
    values='Fines',
    index=['Make', 'Model'],
    columns='Year',
    aggfunc='sum',
)

Unnamed: 0_level_0,Year,1980,1981,1982,1983,1984,1985,1986,1987,1988,1989,...,2010,2011,2012,2013,2014,2015,2016,2017,2018,2019
Make,Model,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
Ford,Focus,217789.17,134189.17,110189.17,82794.59,239100.0,200900.0,105900.0,116800.0,75994.59,183694.59,...,75794.59,247694.59,156989.17,114900.0,177483.76,133494.59,351794.59,156083.76,186794.59,
Ford,Mondeo,,,,,,,,,,,...,,34400.0,,,,8600.0,,,,
Hyundai,Solaris,,,,,,,,,,,...,,,,,,,,,,3200.0
Kia,Rio,,,,,,,,,,,...,,,,,,,,,1800.0,
Lada,Vesta,,,,,,,,,,,...,,,,,,8500.0,,,,
Renault,Logan,,,,,,,,,,,...,,,,,,,,1300.0,,
Skoda,Octavia,3200.0,,212600.0,5700.0,,10200.0,8500.0,1900.0,500.0,,...,,1000.0,13000.0,,21594.59,,8900.0,,82800.0,
Toyota,Camry,8594.59,,,6500.0,,1000.0,,,,,...,15000.0,,22400.0,,8594.59,,,20200.0,19800.0,
Toyota,Corolla,34300.0,14400.0,3400.0,,3200.0,24594.59,,,8000.0,,...,,22800.0,,4000.0,11300.0,,900.0,24000.0,,
Volkswagen,Golf,,1000.0,8594.59,9300.0,,500.0,,,,,...,,5800.0,10600.0,,18400.0,300.0,,,,


## Task 6. Save both the fines and owners dataframes to CSV files without an index

In [61]:
# Write both frames to CSV without the index column (owners first, then fines).
for frame, csv_path in ((owners, '../data/owners.csv'), (fines, '../data/fines.csv')):
    frame.to_csv(csv_path, index=False)

In [62]:
# Number of non-missing values in each column of the final fines frame.
fines.notna().sum()

CarNumber    930
Refund       930
Fines        930
Make         930
Model        919
Year         930
dtype: int64