# Introduction to metasyn with the Titanic dataset

For this demonstration, we will use the raw Titanic dataset from the pandas GitHub repository.

### Import relevant packages

In [1]:
import json
from pathlib import Path

import pandas as pd
import wget

from metasyn import MetaFrame

### Set the path to the dataset and download it

In [2]:
# Local path of the Titanic CSV; the download is skipped when the file already exists.
titanic_fp = Path("titanic.csv")
if not titanic_fp.is_file():
    # Pass `out` explicitly so the file is written to exactly the path checked above,
    # rather than to whatever filename wget derives from the URL.
    wget.download(
        "https://raw.githubusercontent.com/pandas-dev/pandas/main/doc/data/titanic.csv",
        out=str(titanic_fp),
    )

### Define pandas types for use during the creation of the DataFrame

pandas does not infer categorical columns on its own, so we need to declare them explicitly when reading the CSV file.

In [3]:
# Explicit column dtypes handed to pandas when the CSV is read: low-cardinality
# columns become categories, free-text columns become strings. Columns that are
# not listed here are left to pandas' own type inference.
dtypes = dict(
    Survived="category",
    Pclass="category",
    Name="string",
    Sex="category",
    SibSp="category",
    Parch="category",
    Ticket="string",
    Cabin="string",
    Embarked="category",
)

### Read CSV file

In [4]:
# Load the CSV into a DataFrame, applying the explicit dtypes declared above;
# columns not listed in `dtypes` keep the types pandas infers.
df = pd.read_csv(titanic_fp, dtype=dtypes)

### Create metasyn dataset object

This object contains a list of all the variables/columns. For each variable, metasyn fits the distribution that best matches the original data.

In [5]:
# Fit a MetaFrame to the DataFrame; no per-column options are passed here.
# NOTE(review): the warning printed below reports that PassengerId looks unique but
# was not declared unique — confirm whether it should be marked as such.
dataset = MetaFrame.fit_dataframe(df)

Variable PassengerId seems unique, but not set to be unique.

100%|███████████████████████████████████████████████████████████████| 12/12 [00:26<00:00,  2.22s/it]


### Original DataFrame

In [6]:
# NOTE(review): the section heading says "Original DataFrame", but this cell returns a
# single number rather than displaying `df` — confirm this is the intended call.
MetaFrame.estimated_time(df, None, None)

2.968698309247743

### Synthesized DataFrame

In [7]:
# Generate a synthetic DataFrame from the fitted MetaFrame; as the last expression
# in the cell, the result is displayed as the cell output.
dataset.synthesize()

PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
i64,cat,cat,str,cat,f64,cat,cat,str,f64,str,cat
714,"""1""","""1""","""Auaacz, Meszk.…","""female""",,"""1""","""0""","""968978""",86.238248,,"""S"""
274,"""0""","""3""","""Mfatlu, Mqry. …","""female""",28.370083,"""0""","""0""","""0605""",20.352118,,"""S"""
608,"""0""","""3""","""Zhq, Mflo. Xov…","""male""",13.099365,"""0""","""0""","""U 262182""",20.764172,,"""C"""
44,"""0""","""1""","""Xwq, Mtavb. Rk…","""male""",61.662972,"""0""","""0""","""8810""",47.745505,"""F76""","""S"""
117,"""1""","""3""","""Kwh, Ml. Gviuy…","""female""",24.791549,"""1""","""0""","""26599""",7.906261,"""E8""","""S"""
317,"""1""","""3""","""Tglz, Mtpzjh. …","""male""",17.659675,"""0""","""0""","""82936""",90.329764,,"""C"""
384,"""0""","""1""","""Uhviqiplug, Mv…","""male""",61.935334,"""1""","""0""","""684348""",8.581887,,"""S"""
192,"""0""","""1""","""Ogle, Mkklb. L…","""female""",,"""0""","""2""","""3478""",53.809264,"""E1""","""S"""
605,"""1""","""3""","""Msdnlqcl, Mime…","""female""",28.02694,"""0""","""0""","""497994""",12.902887,,"""S"""
758,"""1""","""2""","""Baopz, Masdf. …","""male""",36.006388,"""0""","""0""","""0904""",1.528154,,"""S"""


### Write the dataset to a meta-data file

This file can be loaded later to produce synthetic data. Instead of sharing the synthetic data itself, one could share this file to provide the greatest amount of statistical information.

In [8]:
# Write the fitted MetaFrame to "test.json" in the working directory.
# validate=True presumably checks the output against the metasyn schema — TODO confirm.
dataset.to_json("test.json", validate=True)

### Print the contents of the dataset that gets loaded from a metasyn file.

In [9]:
# Load a MetaFrame back from "test.json" and print its text summary.
print(MetaFrame.from_json("test.json"))

# Rows: 891
# Columns: 12

Column 1: "PassengerId"
- Variable Type: discrete
- Data Type: Int64
- Proportion of Missing Values: 0.0000
- Distribution:
	- Type: core.discrete_uniform
	- Provenance: builtin
	- Parameters:
		- low: 1
		- high: 892
	

Column 2: "Survived"
- Variable Type: categorical
- Data Type: Categorical
- Proportion of Missing Values: 0.0000
- Distribution:
	- Type: core.multinoulli
	- Provenance: builtin
	- Parameters:
		- labels: ['0' '1']
		- probs: [0.61616162 0.38383838]
	

Column 3: "Pclass"
- Variable Type: categorical
- Data Type: Categorical
- Proportion of Missing Values: 0.0000
- Distribution:
	- Type: core.multinoulli
	- Provenance: builtin
	- Parameters:
		- labels: ['1' '2' '3']
		- probs: [0.24242424 0.20650954 0.55106622]
	

Column 4: "Name"
- Variable Type: string
- Data Type: Utf8
- Proportion of Missing Values: 0.0000
- Distribution:
	- Type: core.regex
	- Provenance: builtin
	- Parameters:
		- regex: [A-Z][a-z]{2,9}[,][ ][M][a-z]{1,5}[\.][ ][A-Z][a

### Raw contents of the variables

In [10]:
# Return the MetaFrame's contents as a dictionary; as the last expression in the
# cell, it is displayed as the cell output.
dataset.to_dict()

{'n_rows': 891,
 'n_columns': 12,
 'provenance': {'created by': {'name': 'metasyn', 'version': '0.6.0'},
  'creation time': '2023-10-09T14:50:06.552768'},
 'vars': [{'name': 'PassengerId',
   'type': 'discrete',
   'dtype': 'Int64',
   'prop_missing': 0.0,
   'distribution': {'implements': 'core.discrete_uniform',
    'version': '1.0',
    'provenance': 'builtin',
    'class_name': 'DiscreteUniformDistribution',
    'parameters': {'low': 1, 'high': 892}}},
  {'name': 'Survived',
   'type': 'categorical',
   'dtype': 'Categorical',
   'prop_missing': 0.0,
   'distribution': {'implements': 'core.multinoulli',
    'version': '1.0',
    'provenance': 'builtin',
    'class_name': 'MultinoulliDistribution',
    'parameters': {'labels': array(['0', '1'], dtype=object),
     'probs': array([0.61616162, 0.38383838])}}},
  {'name': 'Pclass',
   'type': 'categorical',
   'dtype': 'Categorical',
   'prop_missing': 0.0,
   'distribution': {'implements': 'core.multinoulli',
    'version': '1.0',
   