# Introduction to MetaSynth with the Titanic dataset

For this demonstration, we will use the raw Titanic dataset that ships with the documentation data in the pandas repository.

### Import relevant packages

In [1]:
import json
from pathlib import Path

import pandas as pd
import wget

from metasynth import MetaDataset

In [2]:
import numpy as np
import random
from faker import Faker

# Seed every random number generator used here (NumPy, the standard library
# and Faker) so that re-running the notebook gives reproducible results.
np.random.seed(28374812)
random.seed(1928374)
Faker.seed(12374098)

### Set the path to the dataset and download it

In [3]:
# Local path of the Titanic CSV file; it is downloaded only when it is not
# already present, so re-running this cell does not fetch it again.
titanic_fp = Path("titanic.csv")
titanic_url = "https://raw.githubusercontent.com/pandas-dev/pandas/main/doc/data/titanic.csv"
if not titanic_fp.is_file():
    # Pass the destination explicitly so the file always lands at titanic_fp,
    # instead of relying on the name wget derives from the URL or headers.
    wget.download(titanic_url, out=str(titanic_fp))

### Define pandas types for use during the creation of the DataFrame

pandas does not infer categorical columns on its own, so we need to declare them explicitly when reading the CSV file.

In [4]:
# Column name -> pandas dtype, passed to pd.read_csv so that categorical and
# text columns get the intended dtype rather than an inferred one.
dtypes = dict(
    Survived="category",
    Pclass="category",
    Name="string",
    Sex="category",
    SibSp="category",
    Parch="category",
    Ticket="string",
    Cabin="string",
    Embarked="category",
)

### Read CSV file

In [5]:
# Load the CSV into a DataFrame, applying the explicit column dtypes.
df = pd.read_csv(filepath_or_buffer=titanic_fp, dtype=dtypes)

### Create MetaSynth dataset object

This object contains a list of all the variables/columns. For each variable, MetaSynth fits the distribution that best matches the original data.

In [6]:
# Per-column override: request the "regex" distribution for the Cabin column,
# with mode="fast" forwarded as a fit keyword argument.
cabin_spec = {"distribution": "regex", "fit_kwargs": {"mode": "fast"}}

# Build the MetaDataset from the DataFrame using the Cabin override.
dataset = MetaDataset.from_dataframe(df, spec={"Cabin": cabin_spec})

Variable PassengerId seems unique, but not set to be unique.



### Original DataFrame

In [7]:
# Show the original DataFrame as the cell output; pandas abbreviates the
# rendering of a long frame to its first and last rows.
df

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C


### Synthesized DataFrame

In [8]:
# Number of synthetic rows to generate from the fitted variables.
n_synthetic_rows = 1000
dataset.synthesize(n_synthetic_rows)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,88,1,3,"Q =2Q2']JOl""1GH8 gk9kKnA!(X6]p;$",male,19.197794,0,0,4.2SR~,12.428636,,S
1,726,0,3,"e""6;X[#2|]e;TK3LB@Hej@_C:ocyChbXdX'_{l 5S 75Ed",female,44.093614,1,0,40.0EoH#'mt,74.775229,,S
2,30,0,3,"xp))Qq#&Wxtu*4eR6?2N7>cxj} [,$xW/RL(Oxi)j?7...",male,34.072014,1,0,2503,3.980382,E`5l5tcOuI,S
3,143,1,3,}Djl~Fn1`S1Q\LpO'y:~jbUkA9A,female,32.208394,0,0,"IBP1871@R'""(h",5.858010,,S
4,751,1,1,"K3y=p\SJrMIigt,^P_DkI!B}6F|sBDr}*(=f}Ft",female,38.687568,0,0,7UE9E1.n#,7.572225,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
995,825,0,1,"{`Jm>y!m}JTdepLhzdg^~ q|5!?`y8	,<B0NHr]%#]@*6...",male,31.477607,1,0,J9744eQ,9.350190,,S
996,495,0,2,v hMKBB=uHm }TF~|,male,,0,0,JTQTE .85,112.305122,,C
997,177,0,3,"2""x8e/zl6%wFy:[qjUTk4b	.`/}*%sH}Ackfm`+%;.jIs-OS|",male,27.210183,1,0,2 77SM=-=,21.536089,,S
998,632,0,2,M5ir5 &SI3yn0 1gH'8H7x<Z;oO3F2u#Gf[q.<`sMlq@...,male,55.890063,0,0,190(E}U?,16.071356,,S


### Write the dataset to a meta-data file

This file can be loaded later to generate synthetic data. Instead of sharing the synthetic data itself, one can share this file, which conveys the most statistical information.

In [9]:
# Serialize the fitted dataset description to a JSON metadata file,
# with validation switched on.
dataset.to_json(
    "test.json",
    validate=True,
)

### Print the contents of the dataset that gets loaded from a MetaSynth file.

In [10]:
# Load the metadata file back into a MetaDataset and print its text summary.
loaded_dataset = MetaDataset.from_json("test.json")
print(loaded_dataset)

# Rows: 891
# Columns: 12

{'name': 'PassengerId', 'description': None, 'type': 'discrete', 'dtype': 'int64', 'prop_missing': 0.0, 'distribution': "{'name': 'DiscreteUniformDistribution', 'parameters': {'low': 1, 'high': 892}}"}

{'name': 'Survived', 'description': None, 'type': 'categorical', 'dtype': 'category', 'prop_missing': 0.0, 'distribution': "{'name': 'MultinoulliDistribution', 'parameters': {'labels': ['0', '1'], 'probs': [0.6161616161616161, 0.3838383838383838]}}"}

{'name': 'Pclass', 'description': None, 'type': 'categorical', 'dtype': 'category', 'prop_missing': 0.0, 'distribution': "{'name': 'MultinoulliDistribution', 'parameters': {'labels': ['1', '2', '3'], 'probs': [0.24242424242424243, 0.20650953984287318, 0.5510662177328844]}}"}

{'name': 'Name', 'description': None, 'type': 'string', 'dtype': 'string', 'prop_missing': 0.0, 'distribution': '.[]{12,82}'}

{'name': 'Sex', 'description': None, 'type': 'categorical', 'dtype': 'category', 'prop_missing': 0.0, 'distributio

### Raw contents of the variables

In [11]:
# NBVAL_IGNORE_OUTPUT
# The marker above asks nbval not to compare this cell's output: the dictionary
# includes provenance fields (package version, creation time) that differ
# between runs.

# Show the raw dictionary representation of the dataset and its variables.
dataset.to_dict()

{'n_rows': 891,
 'n_columns': 12,
 'provenance': {'created by': {'name': 'MetaSynth',
   'version': '0.1.2+0.g0f9262f.dirty',
   'privacy': None},
  'creation time': '2022-10-20T17:51:20.995282'},
 'vars': [{'name': 'PassengerId',
   'type': 'discrete',
   'dtype': 'int64',
   'prop_missing': 0.0,
   'distribution': {'name': 'DiscreteUniformDistribution',
    'parameters': {'low': 1, 'high': 892}}},
  {'name': 'Survived',
   'type': 'categorical',
   'dtype': 'category',
   'prop_missing': 0.0,
   'distribution': {'name': 'MultinoulliDistribution',
    'parameters': {'labels': array(['0', '1'], dtype='<U1'),
     'probs': array([0.61616162, 0.38383838])}}},
  {'name': 'Pclass',
   'type': 'categorical',
   'dtype': 'category',
   'prop_missing': 0.0,
   'distribution': {'name': 'MultinoulliDistribution',
    'parameters': {'labels': array(['1', '2', '3'], dtype='<U1'),
     'probs': array([0.24242424, 0.20650954, 0.55106622])}}},
  {'name': 'Name',
   'type': 'string',
   'dtype': 'str