In [12]:
import pandas as pd
from bokeh.plotting import figure, show
from bokeh.layouts import column, row
from bokeh.models import ColumnDataSource, HoverTool
#from bokeh.models.widgets import Panel, Tabs
from bokeh.transform import factor_cmap
from bokeh.palettes import Spectral6
import numpy as np



df = pd.read_csv('Titanic/Titanic-Dataset.csv')

#df.isna().sum()

# Handle missing values
df['Age'].fillna(df['Age'].median(), inplace=True)
df['Cabin'].fillna('Unknown', inplace=True)
df['Embarked'].fillna('N/A', inplace=True)

df.isna().sum()

PassengerId    0
Survived       0
Pclass         0
Name           0
Sex            0
Age            0
SibSp          0
Parch          0
Ticket         0
Fare           0
Cabin          0
Embarked       0
dtype: int64

In [13]:
# - Create a new column AgeGroup to categorize passengers into age groups (e.g.,
# Child, Young Adult, Adult, Senior).
# - Create a SurvivalRate column to calculate the percentage of passengers who
# survived within each group.
df['AgeGroup'] = np.where(df['Age'] < 11, 'Child',
                    np.where(df['Age'] < 20, 'Young Adult', 
                             np.where(df['Age'] < 60, 'Adult', 'Senior')
                            )
                         )

# Group by Pclass, Sex, AgeGroup
grouped_df = df.groupby(['Pclass', 'Sex', 'AgeGroup'])['Survived'].mean().reset_index()
#display(grouped_df)

# Make percentage value
grouped_df['SurvivalRate'] = (grouped_df['Survived'] * 100).round(2)

# Merge SurvivalRate dataframe (grouped_df) with original dataframe (df)
df_survived = pd.merge(df, grouped_df[['Pclass', 'Sex', 'AgeGroup', 'SurvivalRate']], 
              on=['Pclass', 'Sex', 'AgeGroup'], how='left')

display(df_survived)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,AgeGroup,SurvivalRate
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,Unknown,S,Adult,12.50
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,Adult,97.40
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,Unknown,S,Adult,47.96
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S,Adult,97.40
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,Unknown,S,Adult,12.50
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,Unknown,S,Adult,7.06
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S,Young Adult,100.00
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,28.0,1,2,W./C. 6607,23.4500,Unknown,S,Adult,47.96
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C,Adult,38.61


In [14]:
# import pandas as pd
# from bokeh.plotting import figure, show
# from bokeh.io import output_notebook
# from bokeh.models import ColumnDataSource, HoverTool
# from bokeh.transform import factor_cmap
# from bokeh.palettes import Spectral6

# # Output the plot to the notebook (if you're using a Jupyter Notebook)
# output_notebook()

In [15]:
age_group_survival = df_survived.copy()


# Calculate survival rates for each AgeGroup
age_group_survival = df.groupby('AgeGroup')['Survived'].mean() * 100  # Convert to percentage
age_group_survival = age_group_survival.reset_index()

# Convert the categorical AgeGroup to a string list for Bokeh
age_group_survival['AgeGroup'] = age_group_survival['AgeGroup'].astype(str)

#display(age_group_survival)

source = ColumnDataSource(age_group_survival)

# Ensure the x_range is unique and ordered
unique_age_groups = age_group_survival['AgeGroup'].unique().tolist()

# Convert the data to a ColumnDataSource for Bokeh
source = ColumnDataSource(age_group_survival)


# Create the figure with string categories for x_range
p = figure(x_range=unique_age_groups, height=500, width=700, title="Survival Rates by Age Group",
           toolbar_location=None, tools="")

# Add bars to the plot
p.vbar(x='AgeGroup', top='Survived', width=0.9, source=source, color='skyblue')

# Add hover tool
hover = HoverTool()
hover.tooltips = [
    ("Age Group", "@AgeGroup"),
    ("Survival Rate", "@Survived{0.2f}%")
]
p.add_tools(hover)

# Customize the plot
p.y_range.start = 0
p.yaxis.axis_label = "Survival Rate (%)"
p.xaxis.axis_label = "Age Group"

# Show the plot
show(p)

In [16]:
# import pandas as pd
# from bokeh.plotting import figure, show, output_file
# from bokeh.transform import factor_cmap
# from bokeh.models import ColumnDataSource, FactorRange
# from bokeh.io import output_notebook

# output_notebook()

# df = pd.read_csv('/Users/aprokopiv/moodlecloud/moodlecloud_visualization/Titanic/Titanic-Dataset.csv')

In [17]:
# Calculate survival rates
survival_rates = df_survived.groupby(['Pclass', 'Sex'])['Survived'].mean().unstack()

# Convert the DataFrame to a format that Bokeh can use (ColumnDataSource)
survival_rates = survival_rates.reset_index()
survival_rates = pd.melt(survival_rates, id_vars=['Pclass'], value_vars=['male', 'female'], var_name='Sex', value_name='Survival_Rate')

In [18]:
# Prepare data for Bokeh
survival_rates['Pclass_Sex'] = list(zip(survival_rates['Pclass'].astype(str), survival_rates['Sex']))
source = ColumnDataSource(survival_rates)

# Define the factors for x-axis
factors = [(str(cls), gender) for cls in [1, 2, 3] for gender in ['male', 'female']]

In [19]:
# Create a figure
p = figure(x_range=FactorRange(*factors), height=400, width=700, title="Survival Rates by Class and Gender",
           toolbar_location=None, tools="")

# Add bars
p.vbar(x='Pclass_Sex', top='Survival_Rate', width=0.8, source=source,
       fill_color=factor_cmap('Pclass_Sex', palette=['lightblue', 'lightgreen'], factors=['male', 'female'], start=1, end=2))

# Add hover tool
hover = HoverTool()
hover.tooltips = [
    ("Class", "@Pclass"),
    ("Gender", "@Sex"),
    ("Survival Rate", "@Survival_Rate{0.00%}")
]
p.add_tools(hover)

# Customize the plot
p.y_range.start = 0
p.x_range.range_padding = 0.1
p.xgrid.grid_line_color = None
p.yaxis.axis_label = 'Survival Rate'
p.xaxis.axis_label = 'Class and Gender'
p.xaxis.major_label_orientation = 1

# Show the plot
show(p)



# # Create filtering widgets
# class_select = Select(title="Select Class:", value="All", options=["All", "1", "2", "3"])
# gender_select = CheckboxGroup(labels=["Male", "Female"], active=[0, 1])

# # Define the update function
# def update():
#     selected_class = class_select.value
#     selected_genders = [gender_select.labels[i] for i in gender_select.active]

#     # Filter the data based on the selected class and gender
#     filtered_data = survival_rates[
#         ((survival_rates['Pclass'] == selected_class) | (selected_class == "All")) &
#         (survival_rates['Sex'].isin(selected_genders))
#     ]
    
#     # Update the data in the ColumnDataSource
#     source.data = ColumnDataSource.from_df(filtered_data)

#     # Update x_range based on filtered data
#     p.x_range.factors = filtered_data['Pclass_Sex'].unique().tolist()

# # Add callbacks
# class_select.on_change('value', lambda attr, old, new: update())
# gender_select.on_change('active', lambda attr, old, new: update())

# # Arrange the plot and widgets in a layout
# layout = column(row(class_select, gender_select), p)

# # Add the layout to the current document (use curdoc() in a Bokeh server or show(layout) in a script)
# curdoc().add_root(layout)

NameError: name 'FactorRange' is not defined

In [20]:
df_fare_survival = df_survived.copy()

df_fare_survival['Pclass'] = df_fare_survival['Pclass'].astype(str)



# Create a ColumnDataSource
source = ColumnDataSource(df_fare_survival)

# Create a color mapping for the different classes
color_map = factor_cmap('Pclass', palette=['blue', 'green', 'red'], factors=['1', '2', '3'])

# Create the figure
p = figure(width=800, height=400, title="Scatter Plot of Fare vs Survival Status by Class",
           x_axis_label="Fare", y_axis_label="Survived", tools="pan,box_zoom,reset,save")

# Add the scatter plot
p.circle(x='Fare', y='Survived', size=10, color=color_map, legend_field='Pclass', source=source, fill_alpha=0.4)

# Add hover tool
hover = HoverTool()
hover.tooltips = [
    ("Passenger Class", "@Pclass"),
    ("Fare", "@Fare{0.2f}"),
    ("Survived", "@Survived"),
    ("Sex", "@Sex"),
    ("Age", "@Age"),
    ("Name", "@Name")
]

p.add_tools(hover)

# Customize the legend
p.legend.title = 'Class'
p.legend.location = 'top_right'

# Show the plot
show(p)