# Introduction to MetaSynth with the Titanic dataset

For this demonstration, we will use the raw Titanic dataset that ships with the pandas documentation.

### Import relevant packages

In [1]:
import json
from pathlib import Path

import pandas as pd
import wget

from metasynth import MetaDataset

### Set the path to the dataset and download it

In [2]:
# Source of the raw Titanic CSV (the copy kept with the pandas documentation).
titanic_url = "https://raw.githubusercontent.com/pandas-dev/pandas/main/doc/data/titanic.csv"

# Local path the rest of the notebook reads from.
titanic_fp = Path("titanic.csv")

# Download only when no local copy exists, so re-running the cell is cheap.
if not titanic_fp.is_file():
    # Pass the destination explicitly: without `out`, the saved filename is
    # derived from the URL, so it would match `titanic_fp` only by coincidence.
    wget.download(titanic_url, out=str(titanic_fp))

### Define pandas types for use during the creation of the DataFrame

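pandas does not infer categorical columns on its own, so we must declare them explicitly when reading the CSV file. The free-text columns are declared as `string` in the same mapping.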

In [3]:
# Explicit pandas dtypes, keyed by column name: low-cardinality columns are
# read as "category", free-text columns as "string".
dtypes = dict(
    Survived="category",
    Pclass="category",
    Name="string",
    Sex="category",
    SibSp="category",
    Parch="category",
    Ticket="string",
    Cabin="string",
    Embarked="category",
)

### Read CSV file

In [4]:
# Load the CSV into a DataFrame, applying the explicit column dtypes.
df = pd.read_csv(
    filepath_or_buffer=titanic_fp,
    dtype=dtypes,
)

### Create MetaSynth dataset object

This object contains a list of all the variables (columns). For each variable, MetaSynth fits the distribution that best matches the original data. Here we override the default for `Cabin`, requesting a regex distribution fitted in "fast" mode.

In [5]:
# Override for the "Cabin" column: request a regex distribution and pass
# mode="fast" through to its fitting step.
cabin_spec = {"distribution": "regex", "fit_kwargs": {"mode": "fast"}}

# Build the MetaDataset from the DataFrame, applying the "Cabin" override.
dataset = MetaDataset.from_dataframe(df, spec={"Cabin": cabin_spec})

Variable PassengerId seems unique, but not set to be unique.



[<metasynth.distribution.discrete.DiscreteUniformDistribution object at 0x7fb3a5b47f40>, <metasynth.distribution.discrete.PoissonDistribution object at 0x7fb3a5b2a050>]


### Original DataFrame

In [6]:
# Bare last expression: the notebook renders the DataFrame itself
# (the captured output shows the middle rows elided with "...").
df

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C


### Synthesized DataFrame

In [7]:
# Request 1000 synthetic rows from the fitted dataset and display the result.
# NOTE(review): no random seed is set in the visible cells, so the generated
# rows presumably differ between runs — confirm.
dataset.synthesize(1000)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,171,1,3,`F@<4?ctRDs8 EpC?y.cxIVz	zEle 9Q2I ^EV?u in,male,49.551210,0,0,E M8.|,3.970840,,S
1,202,1,3,oa6o >%Dbph,male,35.185803,0,0,5931d\\iGVO,44.017150,"T0$YIOa;D,1Xo",S
2,179,0,3,y\9i6$R.[ `+1-d41tO`v4%lZIA 7D3{DlMncWR#G| CA...,male,68.956672,1,0,PJEIB21 #a\#(~yq,4.537154,B;[}genU~{,C
3,406,0,2,"[N3za%4-)l]/QIR$e jX43}{7t~Sq[,mb>#):3'nH6,^...",male,25.291475,0,0,9.5|7Ux*,16.592622,,Q
4,205,0,3,WF{t^d4am#(U?EXp:#-m\lj-2Cm3u~9~u8|!,female,11.265102,0,0,79635iw5C0,78.494063,,Q
...,...,...,...,...,...,...,...,...,...,...,...,...
995,181,0,3,"@.~QVYxi@7	L4c2lN0z3M;O(0@U ? aS;%_ZvE$3LNE, l...",male,25.776789,0,0,"9YHXOS/02 2~,;1f",60.359712,,S
996,773,0,3,"]' 6#/Ro),/yjs0Z-;EDcn?r%l;|aZ91k.EPEG g]\E^Go+",male,33.348625,0,0,7247,11.964355,,C
997,483,0,1,tOUW#Yb3$\^xMl	yD1y6ThP	h.flw\6,female,21.745360,0,0,ME7 50%?!0aq,78.767598,,C
998,697,0,3,I2pkZC>AcKqC%7K(4aS?'i=aD.y,male,,1,0,2607o7m]XGn`,5.894421,,S


### Write the dataset to a meta-data file

This file can be loaded later to generate synthetic data. Rather than sharing the synthetic data itself, one can share this file, which carries the most statistical information.

In [8]:
# Write the fitted metadata to "test.json". validate=True asks for the output
# to be validated (presumably against MetaSynth's JSON schema — confirm).
dataset.to_json("test.json", validate=True)

### Print the contents of a dataset loaded from a MetaSynth file

In [9]:
# Rebuild a MetaDataset from "test.json" and print its text summary
# (row/column counts followed by one entry per variable).
print(MetaDataset.from_json("test.json"))

# Rows: 891
# Columns: 12

{'name': 'PassengerId', 'description': None, 'type': 'MetaVar', 'dtype': 'int64', 'prop_missing': 0.0, 'distribution': "{'name': 'DiscreteUniformDistribution', 'parameters': {'low': 1, 'high': 892}}"}

{'name': 'Survived', 'description': None, 'type': 'MetaVar', 'dtype': 'category', 'prop_missing': 0.0, 'distribution': "{'name': 'MultinoulliDistribution', 'parameters': {'labels': ['0', '1'], 'probs': [0.6161616161616161, 0.3838383838383838]}}"}

{'name': 'Pclass', 'description': None, 'type': 'MetaVar', 'dtype': 'category', 'prop_missing': 0.0, 'distribution': "{'name': 'MultinoulliDistribution', 'parameters': {'labels': ['1', '2', '3'], 'probs': [0.24242424242424243, 0.20650953984287318, 0.5510662177328844]}}"}

{'name': 'Name', 'description': None, 'type': 'MetaVar', 'dtype': 'string', 'prop_missing': 0.0, 'distribution': '.[]{12,82}'}

{'name': 'Sex', 'description': None, 'type': 'MetaVar', 'dtype': 'category', 'prop_missing': 0.0, 'distribution': "{'name'

### Raw contents of the variables

In [10]:
# Show the dataset as a plain dictionary. In the captured output, the
# categorical labels/probs appear as NumPy arrays rather than lists.
dataset.to_dict()

{'n_rows': 891,
 'n_columns': 12,
 'provenance': {'created by': {'name': 'MetaSynth',
   'version': '0+untagged.23.g37a09d8.dirty',
   'privacy': None},
  'creation time': '2022-08-08T11:38:02.834954'},
 'vars': [{'name': 'PassengerId',
   'type': 'discrete',
   'dtype': 'int64',
   'prop_missing': 0.0,
   'distribution': {'name': 'DiscreteUniformDistribution',
    'parameters': {'low': 1, 'high': 892}}},
  {'name': 'Survived',
   'type': 'categorical',
   'dtype': 'category',
   'prop_missing': 0.0,
   'distribution': {'name': 'MultinoulliDistribution',
    'parameters': {'labels': array(['0', '1'], dtype='<U1'),
     'probs': array([0.61616162, 0.38383838])}}},
  {'name': 'Pclass',
   'type': 'categorical',
   'dtype': 'category',
   'prop_missing': 0.0,
   'distribution': {'name': 'MultinoulliDistribution',
    'parameters': {'labels': array(['1', '2', '3'], dtype='<U1'),
     'probs': array([0.24242424, 0.20650954, 0.55106622])}}},
  {'name': 'Name',
   'type': 'string',
   'dtype'