In [228]:
import pandas as pd
import numpy as np
import requests

## read the JSON file that you saved in ex02

In [229]:
# Render floats with two decimals in every frame displayed below.
pd.set_option("display.float_format", '{:.2f}'.format)
try:
    data = pd.read_json("../data/auto.json", orient='records')
except (OSError, ValueError) as e:
    # OSError: the file is missing or unreadable; ValueError: pandas could not
    # parse the contents as JSON. Report the message, then re-raise so the
    # cell fails visibly and a "Run All" stops here instead of carrying on
    # without a usable `data` frame.
    print(e)
    raise
print(data)

        CarNumber  Refund    Fines    Make    Model
0    Y163O8161RUS       2  3200.00    Ford    Focus
1     E432XX77RUS       1  6500.00  Toyota    Camry
2     7184TT36RUS       1  2100.00    Ford    Focus
3    X582HE161RUS       2  2000.00    Ford    Focus
4    92918M178RUS       1  5700.00    Ford    Focus
..            ...     ...      ...     ...      ...
720  Y163O8161RUS       2  1600.00    Ford    Focus
721  M0309X197RUS       1 22300.00    Ford    Focus
722  O673E8197RUS       2   600.00    Ford    Focus
723  8610T8154RUS       1  2000.00    Ford    Focus
724  H419XE197RUS       2  8594.59  Toyota  Corolla

[725 rows x 5 columns]


## enrich the dataframe using a sample from that dataframe

In [230]:
# One seeded generator drives all three draws. Its first draw picks the same
# rows that random_state=21 would; the following draws continue the stream, so
# the replacement Fines/Refund values come from other rows of `data` yet are
# identical on every run of the notebook.
rng = np.random.RandomState(21)
smpl = data.sample(n=200, random_state=rng)
# `.values` discards the sampled index so the values are assigned by position.
smpl['Fines'] = data['Fines'].sample(n=200, random_state=rng).values
smpl['Refund'] = data['Refund'].sample(n=200, random_state=rng).values
print(smpl)

        CarNumber  Refund    Fines        Make   Model
445  M0299X197RUS       1  3000.00        Ford   Focus
22   83298C154RUS       2   900.00        Ford   Focus
93   H957HY161RUS       1  3500.00        Ford   Focus
173   T941CC96RUS       2   800.00        Ford   Focus
697  H966HY161RUS       2  1000.00        Ford   Focus
..            ...     ...      ...         ...     ...
14   8182XX154RUS       2  7000.00        Ford   Focus
623   X796TH96RUS       2  7400.00        Ford   Focus
498  T011MY163RUS       2 17000.00        Ford   Focus
536   T341CC96RUS       2  2200.00  Volkswagen  Passat
520   T119CT96RUS       2  4000.00        Ford   Focus

[200 rows x 5 columns]


In [231]:
# Stack the sampled rows under the original ones and renumber the result
# 0..n-1 in the same step, discarding both source indexes.
concat_rows = pd.concat([data, smpl], ignore_index=True)
print(concat_rows)

        CarNumber  Refund    Fines        Make   Model
0    Y163O8161RUS       2  3200.00        Ford   Focus
1     E432XX77RUS       1  6500.00      Toyota   Camry
2     7184TT36RUS       1  2100.00        Ford   Focus
3    X582HE161RUS       2  2000.00        Ford   Focus
4    92918M178RUS       1  5700.00        Ford   Focus
..            ...     ...      ...         ...     ...
920  8182XX154RUS       2  7000.00        Ford   Focus
921   X796TH96RUS       2  7400.00        Ford   Focus
922  T011MY163RUS       2 17000.00        Ford   Focus
923   T341CC96RUS       2  2200.00  Volkswagen  Passat
924   T119CT96RUS       2  4000.00        Ford   Focus

[925 rows x 5 columns]


## enrich the dataframe concat_rows by a new column with the data generated

In [232]:
# Seed NumPy's global generator so the generated years repeat on every run.
np.random.seed(21)
# One year per row; randint's upper bound is exclusive, so 2019 is the latest.
s = pd.Series(
    np.random.randint(low=1980, high=2020, size=len(concat_rows)),
    name='Year',
    dtype='int',
)
# Attach the new column side by side (rows are matched on the index).
fines = pd.concat([concat_rows, s], axis=1)
print(fines)

        CarNumber  Refund    Fines        Make   Model  Year
0    Y163O8161RUS       2  3200.00        Ford   Focus  1989
1     E432XX77RUS       1  6500.00      Toyota   Camry  1995
2     7184TT36RUS       1  2100.00        Ford   Focus  1984
3    X582HE161RUS       2  2000.00        Ford   Focus  2015
4    92918M178RUS       1  5700.00        Ford   Focus  2014
..            ...     ...      ...         ...     ...   ...
920  8182XX154RUS       2  7000.00        Ford   Focus  1981
921   X796TH96RUS       2  7400.00        Ford   Focus  1992
922  T011MY163RUS       2 17000.00        Ford   Focus  2007
923   T341CC96RUS       2  2200.00  Volkswagen  Passat  2005
924   T119CT96RUS       2  4000.00        Ford   Focus  1997

[925 rows x 6 columns]


## enrich the dataframe with the data from another dataframe

In [233]:
# Download the surname list used to give every car an owner.
url = "https://projects.intra.42.fr/uploads/document/document/8786/surname.json"
headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/32.0.1700.107 Safari/537.36'}
try:
    # The timeout keeps the cell from blocking indefinitely on a stalled server.
    r = requests.get(url, headers=headers, timeout=30)
    r.raise_for_status()
    sn = pd.DataFrame.from_records(r.json())
except requests.RequestException as e:
    # A single handler is enough: requests.JSONDecodeError derives from
    # RequestException, so a separate clause listed after this one could never
    # run. Re-raise so the cell fails visibly instead of continuing without `sn`.
    print(f"Exception: {e}")
    raise

# The first record is used as the header row: promote it to the column names
# and keep only the remaining records as data.
nh = sn.iloc[0]
sn = sn[1:]
sn.columns = nh

# Pair every distinct car number currently in `fines` with a surname drawn
# (with replacement, seeded) from the NAME column.
cars = pd.Series(fines.CarNumber.unique()).rename('CarNumber')
surnames = sn.NAME.sample(n=len(cars), random_state=21, replace=True, ignore_index=True).rename('SURNAME')
owners = pd.concat([cars, surnames], axis=1)

# Five hand-written observations appended to `fines` after `owners` was built.
dict1 = {"CarNumber": ["Y163O8160RUS", "9184UT36RUS", "PY316E877RUS", "Y318P876RUS", "O05T8196RUS"],
        "Refund": [2, 1, 3, 1, 2],
        "Fines": [2300.00, 4500.00, 3600.00, 500.00, 1100.00],
        "Make": ["Renault", "Opel", "Hyundai", "Dodge", "Nissan"],
        "Model": ["Logan", "Astra", "Solaris", "Viper", "GT-X"],
        "Year": [2006, 2003, 2010, 2007, 2015]}
new_obs1 = pd.DataFrame(dict1)
# NOTE: this rebinds `fines`; re-running this cell on its own appends the five
# rows a second time. Re-run the notebook from the top instead.
fines = pd.concat([fines, new_obs1], axis=0, ignore_index=True)

# Remove the last 20 owners and add three hand-written ones -- presumably so
# the merges that follow have unmatched car numbers on both sides.
owners = owners.iloc[:-20]
dict2 = {"CarNumber": ["U761HY137RUS", "2367J8147RUS", "A0306X159RUS"],
         "SURNAME": ["BAKER", "LONG", "KING"]}
new_obs2 = pd.DataFrame(dict2)
owners = pd.concat([owners, new_obs2], axis=0, ignore_index=True)

In [234]:
# Inner join: a row is kept only when its CarNumber occurs in both frames.
print("dataframe should have only the car numbers that exist in both dataframes:")
merge1 = fines.merge(owners, how='inner', on='CarNumber')
merge1

dataframe should have only the car numbers that exist in both dataframes:


Unnamed: 0,CarNumber,Refund,Fines,Make,Model,Year,SURNAME
0,Y163O8161RUS,2,3200.00,Ford,Focus,1989,RICHARDSON
1,Y163O8161RUS,2,1600.00,Ford,Focus,1980,RICHARDSON
2,E432XX77RUS,1,6500.00,Toyota,Camry,1995,ROSS
3,E432XX77RUS,2,13000.00,Toyota,Camry,2018,ROSS
4,7184TT36RUS,1,2100.00,Ford,Focus,1984,MORGAN
...,...,...,...,...,...,...,...
894,E41977152RUS,2,2400.00,Ford,Focus,1989,BAKER
895,9464EX178RUS,2,2100.00,Ford,Focus,1988,MARTIN
896,O50197197RUS,2,7800.00,Ford,Focus,1992,WRIGHT
897,7608EE777RUS,1,4000.00,Skoda,Octavia,2000,HILL


In [235]:
# Outer join: keeps every CarNumber from either frame; columns from the side
# that has no match are left missing.
print("dataframe should have all the car numbers that exist in both dataframes:")
merge2 = fines.merge(owners, how='outer', on='CarNumber')
merge2

dataframe should have all the car numbers that exist in both dataframes:


Unnamed: 0,CarNumber,Refund,Fines,Make,Model,Year,SURNAME
0,Y163O8161RUS,2.00,3200.00,Ford,Focus,1989.00,RICHARDSON
1,Y163O8161RUS,2.00,1600.00,Ford,Focus,1980.00,RICHARDSON
2,E432XX77RUS,1.00,6500.00,Toyota,Camry,1995.00,ROSS
3,E432XX77RUS,2.00,13000.00,Toyota,Camry,2018.00,ROSS
4,7184TT36RUS,1.00,2100.00,Ford,Focus,1984.00,MORGAN
...,...,...,...,...,...,...,...
928,Y318P876RUS,1.00,500.00,Dodge,Viper,2007.00,
929,O05T8196RUS,2.00,1100.00,Nissan,GT-X,2015.00,
930,U761HY137RUS,,,,,,BAKER
931,2367J8147RUS,,,,,,LONG


In [236]:
# Left join: every row of `fines` is kept; SURNAME is missing where `owners`
# has no matching CarNumber.
print("dataframe should have only the car numbers from the fines dataframe:")
merge3 = fines.merge(owners, how='left', on='CarNumber')
merge3

dataframe should have only the car numbers from the fines dataframe:


Unnamed: 0,CarNumber,Refund,Fines,Make,Model,Year,SURNAME
0,Y163O8161RUS,2,3200.00,Ford,Focus,1989,RICHARDSON
1,E432XX77RUS,1,6500.00,Toyota,Camry,1995,ROSS
2,7184TT36RUS,1,2100.00,Ford,Focus,1984,MORGAN
3,X582HE161RUS,2,2000.00,Ford,Focus,2015,BAILEY
4,92918M178RUS,1,5700.00,Ford,Focus,2014,LOPEZ
...,...,...,...,...,...,...,...
925,Y163O8160RUS,2,2300.00,Renault,Logan,2006,
926,9184UT36RUS,1,4500.00,Opel,Astra,2003,
927,PY316E877RUS,3,3600.00,Hyundai,Solaris,2010,
928,Y318P876RUS,1,500.00,Dodge,Viper,2007,


In [237]:
# Right join: every row of `owners` is kept; the fines columns are missing
# where `fines` has no matching CarNumber.
print("dataframe should have only the car numbers from the owners dataframe:")
merge4 = fines.merge(owners, how='right', on='CarNumber')
merge4

dataframe should have only the car numbers from the owners dataframe:


Unnamed: 0,CarNumber,Refund,Fines,Make,Model,Year,SURNAME
0,Y163O8161RUS,2.00,3200.00,Ford,Focus,1989.00,RICHARDSON
1,Y163O8161RUS,2.00,1600.00,Ford,Focus,1980.00,RICHARDSON
2,E432XX77RUS,1.00,6500.00,Toyota,Camry,1995.00,ROSS
3,E432XX77RUS,2.00,13000.00,Toyota,Camry,2018.00,ROSS
4,7184TT36RUS,1.00,2100.00,Ford,Focus,1984.00,MORGAN
...,...,...,...,...,...,...,...
897,7608EE777RUS,1.00,4000.00,Skoda,Octavia,2000.00,HILL
898,7608EE777RUS,1.00,300.00,Skoda,Octavia,1991.00,HILL
899,U761HY137RUS,,,,,,BAKER
900,2367J8147RUS,,,,,,LONG


## create a pivot table from the fines dataframe; it should look like the example (the values are the sums of the fines), but with all the years (your values may differ)

In [238]:
# Sum of fines per (Make, Model) row and per Year column.
# Combinations with no fines are left as real missing values (NaN): filling
# them with the string "nan" would turn every year column into object dtype and
# bypass the float display format. The aggregation is named by the string
# "sum", which recent pandas asks for in place of the np.sum callable.
table = pd.pivot_table(
    fines,
    index=["Make", "Model"],
    columns=["Year"],
    values=["Fines"],
    aggfunc="sum",
)
table

Unnamed: 0_level_0,Unnamed: 1_level_0,Fines,Fines,Fines,Fines,Fines,Fines,Fines,Fines,Fines,Fines,Fines,Fines,Fines,Fines,Fines,Fines,Fines,Fines,Fines,Fines,Fines
Unnamed: 0_level_1,Year,1980,1981,1982,1983,1984,1985,1986,1987,1988,1989,...,2010,2011,2012,2013,2014,2015,2016,2017,2018,2019
Make,Model,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2,Unnamed: 22_level_2
Dodge,Viper,,,,,,,,,,,...,,,,,,,,,,
Ford,Focus,222094.59,415089.17,152883.76,80294.59,117389.17,123983.76,90894.59,254894.59,112894.59,75194.59,...,138689.17,142289.17,111900.0,304889.17,122294.59,217300.0,86394.59,254100.0,273094.59,106900.0
Ford,Mondeo,,,,,,,,,,8600.0,...,,,34400.0,,,,46200.0,,,
Hyundai,Solaris,,,,,,,,,,,...,3600.0,,,,,,,,,
Nissan,GT-X,,,,,,,,,,,...,,,,,,1100.0,,,,
Opel,Astra,,,,,,,,,,,...,,,,,,,,,,
Renault,Logan,,,,,,,,,,,...,,,,,,,,,,
Skoda,Octavia,10494.59,,16900.0,11594.59,,10294.59,600.0,5200.0,18100.0,91400.0,...,3100.0,500.0,500.0,12594.59,300.0,46394.59,300.0,4500.0,156200.0,9500.0
Toyota,Camry,12000.0,8594.59,,7200.0,,,,,,22400.0,...,,,8594.59,,39600.0,,,,31400.0,18100.0
Toyota,Corolla,,,2000.0,,,,,15600.0,,4000.0,...,24000.0,8594.59,8594.59,,,,10600.0,9600.0,8594.59,


## save both the fines and owners dataframes to CSV files without an index

In [239]:
# Write each frame to its CSV file, leaving the row index out of the file.
for frame, path in ((fines, '../data/fines.csv'), (owners, '../data/owners.csv')):
    frame.to_csv(path, index=False)