# Synthetic Data Generation
This notebook gives an example of how we can generate synthetic data to get a dataset similar to an original dataset with sensitive information.

In [None]:
import pandas as pd
from scipy.stats import norm
import matplotlib.pyplot as plt

## Load and Inspect Original Data

In [None]:
# Read the passenger records from the CSV file (path is relative to the
# notebook's working directory).
titanic = pd.read_csv(filepath_or_buffer='titanic.csv')

In [None]:
# Preview the first five rows of the loaded table.
titanic.head(n=5)

Inspect the original data:

In [None]:
# Histogram of the recorded passenger ages, normalised to unit area.
# Missing ages are dropped explicitly so the plot does not depend on
# matplotlib silently skipping NaN values.
observed_ages = titanic['Age'].dropna()

fig, ax = plt.subplots(figsize=(5, 3))
ax.hist(observed_ages, bins=30, density=True, alpha=0.75, label='Raw Data')

ax.grid(True)
ax.legend()
ax.set_xlabel('Age')
ax.set_ylabel('Ratio')
ax.set_title('Age of Titanic Passengers')
plt.show()

## Fit normal distribution to the data
For this toy example, we assume the age of the passengers is normally distributed, and we use the original data to infer the parameters (mean and standard deviation) of that normal distribution:

In [None]:
# Fit a normal distribution to the recorded ages; rows without an age are
# excluded before fitting. The two fitted parameters are kept as mu and sigma.
mu, sigma = norm.fit(titanic.loc[titanic['Age'].notna(), 'Age'])

Now we can plot the original data along with the fitted distribution and the synthetic sample distribution:

In [None]:
SEED = 42           # fixed seed so the synthetic sample is identical on every run
N_SYNTHETIC = 1000  # number of synthetic ages to draw

# Missing ages are dropped explicitly rather than relying on matplotlib
# skipping NaN values when it builds the histogram.
observed_ages = titanic['Age'].dropna()

fig, ax = plt.subplots(figsize=(5, 3))

# Histogram of the original data (normalised to unit area); keep only the
# bin edges, which are reused as the x-grid for the fitted curve.
age_bin_edges = ax.hist(
    observed_ages, bins=30, density=True, alpha=0.75, label='Raw Data'
)[1]

# Overlay the fitted normal density, evaluated at the histogram bin edges.
ax.plot(
    age_bin_edges,
    norm.pdf(age_bin_edges, mu, sigma),
    'r--',
    linewidth=2,
    label='Fitted Distribution',
)

# Draw synthetic ages from the fitted normal distribution (mean mu,
# standard deviation sigma) and plot their histogram.
synth_ages = norm.rvs(loc=mu, scale=sigma, size=N_SYNTHETIC, random_state=SEED)
ax.hist(
    synth_ages, bins=30, density=True, color='green', alpha=0.75,
    label='Synthetic Data',
)

# Make the plot nicer.
ax.grid(True)
ax.legend()
ax.set_xlabel('Age')
ax.set_ylabel('Ratio')
ax.set_title('Age of Titanic Passengers')
plt.show()