In [8]:
"""Importing all relevant packages."""

import pandas as pd
from bokeh.core.properties import value
from bokeh.io import show, output_file
from bokeh.models import ColumnDataSource
from bokeh.plotting import figure
from bokeh.transform import dodge
from bokeh.models import Panel, Tabs
from bokeh.io import output_file, show
from bokeh.plotting import figure
import time

In [10]:
"""Reading the relevants datasets into a Pandas dataframes."""

start_time = time.time()

birth_data = pd.read_csv("Datasets/Birth.csv")
marriage_data = pd.read_csv("Datasets/Marriage.csv")
death_data = pd.read_csv("Datasets/Death.csv")

print("--- %s seconds ---" % (time.time() - start_time))

--- 25.59538507461548 seconds ---


In [3]:
"""Preprocessing the death data and adjusting the dataframe. Since, there are a few years in the beginning 
which have very little count, I consider them as outliers and that is why I have removed them. """

df_year = birth_data[["Year"]]
print("Size of the whole dataset:",len(df_year))
df_year = df_year.dropna()
print("Size of the dataset after dropping NaNs:",len(df_year))

year_freq = df_year.Year.value_counts().to_frame()  # new dataframe for births per year

year_freq = year_freq.rename(columns={"Year": "Count"})
year_freq["Year"] = year_freq.index
year_freq.sort_values(by=['Year'], inplace=True)  # sorting the years in ascending order
year_freq = year_freq.reset_index()
year_freq = year_freq.drop(["index"], axis=1)
year_freq["Year"] = year_freq["Year"].astype(int)

year_freq = year_freq[year_freq["Count"]>50]  # removing years with less than 50 deaths
year_freq = year_freq.reset_index()
year_freq = year_freq.drop(["index"], axis=1)

Size of the whole dataset: 2456738
Size of the dataset after dropping NaNs: 820705


In [4]:
"""Preprocessing the marriage data, keeping only the variables that are needed for the visualization purpose
and adjusting the dataframe."""

df_marriage = marriage_data[["Person id", "Connection", "Event Year"]]  # keeping only relevant variables

print("Amount of people that the data shows are married to themselves:",
      len(df_marriage[df_marriage["Person id"] == df_marriage["Connection"]]))

df_marriage = df_marriage[df_marriage["Person id"] != df_marriage["Connection"]]  # removing self-married people

df_marriage["Year"] = df_marriage["Event Year"]

marriage_freq = df_marriage.Year.value_counts().to_frame()  # new dataframe for marriages per year

marriage_freq = marriage_freq.rename(columns={"Year": "Count"})
marriage_freq["Year"] = marriage_freq.index
marriage_freq.sort_values(by=['Year'], inplace=True)  # sorting the years in ascending order
marriage_freq = marriage_freq.reset_index()
marriage_freq = marriage_freq.drop(["index"], axis=1)
marriage_freq["Year"] = marriage_freq["Year"].astype(int)

marriage_freq = marriage_freq[marriage_freq["Count"]>50]  # removing years with less than 50 marriages
marriage_freq = marriage_freq.reset_index()
marriage_freq = marriage_freq.drop(["index"], axis=1)

Amount of people that the data shows are married to themselves: 480461


In [5]:
"""Preprocessing the death data and adjusting the dataframe. Since, there are a few years in the beginning 
which have very little count, I consider them as outliers and that is why I have removed them. """

df_death = death_data[["Year"]]
print("Size of the whole dataset:",len(df_death))
df_death = death_data.dropna()  # removing all missing values
print("Size of the dataset after dropping NaNs:",len(df_death))

death_freq = df_death.Year.value_counts().to_frame()  # new dataframe for deaths per year

death_freq = death_freq.rename(columns={"Year": "Count"})
death_freq["Year"] = death_freq.index
death_freq.sort_values(by=['Year'], inplace=True)  # sorting the years in ascending order
death_freq = death_freq.reset_index()
death_freq = death_freq.drop(["index"], axis=1)
death_freq["Year"] = death_freq["Year"].astype(int)

death_freq = death_freq[death_freq["Count"]>50]  # removing years with less than 50 marriages
death_freq = death_freq.reset_index()
death_freq = death_freq.drop(["index"], axis=1)

Size of the whole dataset: 440339
Size of the dataset after dropping NaNs: 173592


In [6]:
"""Making an interactive timeline visualization using the Bokeh library."""

output_file("bars.html")

years = year_freq["Year"].tolist()
year_counts = year_freq["Count"].tolist()
marriages = marriage_freq["Year"].tolist()
marriage_counts = marriage_freq["Count"].tolist()
deaths = death_freq["Year"].tolist()
death_counts = death_freq["Count"].tolist()

  # Creating a timeline bar chart for each dataframe preprocessed above.
p = figure(title="Timeline of births every year", toolbar_location=None, tools="", plot_height=700, plot_width=1200,
          y_axis_label = "Number of people", x_axis_label = "Year")
p.vbar(x=years, top=year_counts, width=0.7)
p.xgrid.grid_line_color = None
p.y_range.start = 0


p1 = figure(title="Timeline of marriages every year",
           toolbar_location=None, tools="", plot_height=700, plot_width=1200,
           y_axis_label = "Number of people", x_axis_label = "Year")
p1.vbar(x=marriages, top=marriage_counts, width=0.7)
p1.xgrid.grid_line_color = None
p1.y_range.start = 0


p2 = figure(title="Timeline of deaths every year",
           toolbar_location=None, tools="", plot_height=700, plot_width=1200,
           y_axis_label = "Number of people", x_axis_label = "Year")
p2.vbar(x=deaths, top=death_counts, width=0.7)
p2.xgrid.grid_line_color = None
p2.y_range.start = 0

  # Creating a tab for each visualization
tab = Panel(child=p, title="Births per year")
tab1= Panel(child=p1, title="Marriages per year")
tab2= Panel(child=p2, title="Deaths per year")

tabs = Tabs(tabs=[tab,tab1,tab2])

show(tabs)