# Data Analysis

In [11]:
import json
import pandas as pd
from pathlib import Path

In [12]:
data_path = Path('..') / 'data'
# Recursively load every CSV under the data directory, keyed by bare file name.
# NOTE: same-named files in different sub-directories would overwrite each other.
dfs = {csv_file.name: pd.read_csv(csv_file) for csv_file in data_path.glob('**/*.csv')}
# read_text() opens and closes the file itself, so no file handle is leaked
# (the previous json.load(open(...)) never closed it).
sample_output = json.loads((data_path / 'misc' / 'sample_output.json').read_text())

## Stats for People data

In [13]:
# Pull the people table out of the loaded CSVs and preview its first rows.
df_people = dfs['people.csv']
df_people.head()

Unnamed: 0,given_name,family_name,date_of_birth,place_of_birth
0,John,Williams,1842-09-30,Dumfries
1,Grace,Jeffery,1899-06-14,Kelso
2,Sean,Molnar,1982-11-01,Dromore
3,Lily,Doyle,1883-04-02,Hamilton
4,Edith,Styles,1879-07-24,Ballymoney


In [14]:
# Summarise the people table: column names, non-null counts and dtypes.
df_people.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 4 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   given_name      10000 non-null  object
 1   family_name     10000 non-null  object
 2   date_of_birth   10000 non-null  object
 3   place_of_birth  10000 non-null  object
dtypes: object(4)
memory usage: 312.6+ KB


The dataframe has four columns, all of which appear to hold string values with no missing values. Let's check the max length of those strings.

In [15]:
# Longest string found in each column of the people table.
(
    df_people
    .apply(lambda column: column.str.len())  # apply works column-wise by default
    .max()
)

given_name        11
family_name       23
date_of_birth     10
place_of_birth    16
dtype: int64

## Stats for Places data

In [16]:
# Pull the places table out of the loaded CSVs and preview its first rows.
df_places = dfs['places.csv']
df_places.head()

Unnamed: 0,city,county,country
0,Aberdeen,Aberdeenshire,Scotland
1,Airdrie,Lanarkshire,Scotland
2,Alloa,Clackmannanshire,Scotland
3,Annan,Dumfriesshire,Scotland
4,Anstruther,Fife,Scotland


In [17]:
# Summarise the places table: column names, non-null counts and dtypes.
df_places.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 114 entries, 0 to 113
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   city     114 non-null    object
 1   county   114 non-null    object
 2   country  114 non-null    object
dtypes: object(3)
memory usage: 2.8+ KB


The dataframe has three columns, all of which appear to hold string values with no missing values. Let's check the max length of those strings.

In [18]:
# Longest string found in each column of the places table.
(
    df_places
    .apply(lambda column: column.str.len())  # apply works column-wise by default
    .max()
)

city       16
county     18
country    16
dtype: int64

## Let's take a look at the sample data

In [19]:
# Preview the first rows of the example CSV.
dfs['example.csv'].head()

Unnamed: 0,name
0,Fred
1,Daphne
2,Velma
3,Shaggy
4,Scooby


In [20]:
# Display the reference output loaded from misc/sample_output.json.
sample_output

{'Scotland': 8048, 'Northern Ireland': 1952}

### Replicate the result

In [21]:
# Attach each person's country via their birthplace (default inner join on
# place_of_birth == city), then tally people per country and emit JSON.
(
    df_people
    .merge(df_places, left_on='place_of_birth', right_on='city')
    .groupby('country')
    .size()
    .to_json()
)

'{"Northern Ireland":1952,"Scotland":8048}'