In [8]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px

In [2]:
df = pd.read_csv("train.csv")

In [3]:
df.head(10)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S
5,6,0,3,"Moran, Mr. James",male,,0,0,330877,8.4583,,Q
6,7,0,1,"McCarthy, Mr. Timothy J",male,54.0,0,0,17463,51.8625,E46,S
7,8,0,3,"Palsson, Master. Gosta Leonard",male,2.0,3,1,349909,21.075,,S
8,9,1,3,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",female,27.0,0,2,347742,11.1333,,S
9,10,1,2,"Nasser, Mrs. Nicholas (Adele Achem)",female,14.0,1,0,237736,30.0708,,C


In [4]:
# Counting the occurrences of unique values in the 'Survived' column of the DataFrame.
# The output is then converted to a DataFrame with columns renamed for clarity:
# 'Survived' indicates the unique values from the original column (0 for not survived, 1 for survived),
# and 'Count' shows the number of occurrences of each unique value.

counts = df['Survived'].value_counts()
counts_df = counts.reset_index()
counts_df.columns = ["Survived", "Count"]

# Replacing 0 with "Did not survive" and 1 with "Survive".

counts_df['Survived'] = counts_df['Survived'].replace({0:"Did not survive", 1:"Survived"})
counts_df

Unnamed: 0,Survived,Count
0,Did not survive,549
1,Survived,342


In [21]:
# Create the bar chart using Plotly Express
# Use the 'Survived' column to assign different colors to each bar
fig = px.bar(counts_df, x='Survived', y='Count', title='Survival Count in the Titanic', color='Survived')

# Customize axes labels
fig.update_layout(
    xaxis_title="Survival Status",
    yaxis_title="Count",
    xaxis={'categoryorder':'total descending'}
)

# Update layout for a nicer look
fig.update_layout(showlegend=True)  

# Show the plot
fig.show()

In [18]:
# Assuming counts_df is your DataFrame with 'Survived' and 'Count' columns
fig = px.pie(counts_df, values='Count', names='Survived', title='Survival Distribution on the Titanic')

# Customize colors
colors = ['orange', 'green']  # red for "Did not survive", green for "Survive"
fig.update_traces(marker=dict(colors=colors))

# Pull out the smaller slice by specifying its index
# For example, if the smaller slice is the first one (index 0), you can do:
fig.update_traces(pull=[0.1, 0])  # Pulls out the first slice slightly; adjust as needed

# Show the plot
fig.show()

In [26]:
df_survival_sex = df[['Survived','Sex']]
encoding_map = {
    'male' : 1,
    'female' : 2
}

df_survival_sex['Sex'] = df_survival_sex['Sex'].replace(encoding_map)
df_survival_sex



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



Unnamed: 0,Survived,Sex
0,0,1
1,1,2
2,1,2
3,1,2
4,0,1
...,...,...
886,0,1
887,1,2
888,0,2
889,1,1


In [27]:
import numpy as np
from scipy.stats import chi2_contingency

def correlation_ratio(categories, measurements):
    fcat, _ = pd.factorize(categories)
    cat_num = np.max(fcat) + 1
    y_avg_array = np.zeros(cat_num)
    n_array = np.zeros(cat_num)
    for i in range(0, cat_num):
        cat_measures = measurements[np.argwhere(fcat == i).flatten()]
        n_array[i] = len(cat_measures)
        y_avg_array[i] = np.mean(cat_measures)
    y_total_avg = np.sum(np.multiply(y_avg_array, n_array)) / np.sum(n_array)
    numerator = np.sum(np.multiply(n_array, np.power(np.subtract(y_avg_array, y_total_avg), 2)))
    denominator = np.sum(np.power(np.subtract(measurements, y_total_avg), 2))
    if numerator == 0:
        eta = 0.0
    else:
        eta = np.sqrt(numerator / denominator)
    return eta

eta = correlation_ratio(df_survival_sex['Sex'], df_survival_sex['Survived'])
print("Correlation Ratio:", eta)

Correlation Ratio: 0.5433513806577552
