# Chi-Square Analysis: Titanic Dataset
## Research Question: Did passenger class significantly affect survival rates?

**Dataset**: Titanic passenger data  
**Hypothesis**: There is an association between passenger class (1st, 2nd, 3rd) and survival  
**Test**: Chi-square test of independence  

---

## 1. Setup and Data Loading

In [None]:
# Import required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import chi2_contingency, chi2
import warnings
# Silence all library warnings for the rest of the notebook session
warnings.filterwarnings(action='ignore')

# Plot defaults: white grid background and a wide default canvas
sns.set_style(style='whitegrid')
plt.rcParams.update({'figure.figsize': (14, 8)})

print("✓ Libraries loaded successfully")

In [None]:
# Read the passenger records from the CSV file in the working directory
df = pd.read_csv('titanic.csv')

# Report (rows, columns) and then preview the leading records
n_rows, n_cols = df.shape
print(f"Dataset shape: {(n_rows, n_cols)}")
print("\nFirst few rows:")
df.head()

## 2. Exploratory Data Analysis

In [None]:
# Only the two columns used by the chi-square test need to be complete
analysis_columns = ['Survived', 'Pclass']
analysis_data = df[analysis_columns]

print("Missing values:")
print(analysis_data.isnull().sum())
print(f"\nTotal passengers: {len(df)}")

# Rows where both class and survival are present
complete_rows = len(analysis_data.dropna())
print(f"Passengers with complete class and survival data: {complete_rows}")

In [None]:
# Headline numbers: class sizes and the overall survival rate
section_rule = "=" * 70
passenger_total = len(df)

print(section_rule)
print("PASSENGER CLASS DISTRIBUTION")
print(section_rule)

# Passenger count per class, ordered by class label
class_dist = df['Pclass'].value_counts().sort_index()
for pclass, count in class_dist.items():
    # Share of all rows in the dataset that belong to this class
    pct = (count / passenger_total) * 100
    print(f"Class {pclass}: {count:4d} passengers ({pct:5.1f}%)")

print("\n" + section_rule)
print("OVERALL SURVIVAL RATE")
print(section_rule)

survived_column = df['Survived']
survival_rate = survived_column.mean()
survivor_total = survived_column.sum()
print(f"Overall survival rate: {survival_rate:.1%}")
print(f"Survivors: {survivor_total:.0f}")
print(f"Non-survivors: {(passenger_total - survivor_total):.0f}")

In [None]:
# Per-class survival: rate and raw survivor count for each passenger class
section_rule = "=" * 70
print("\n" + section_rule)
print("SURVIVAL RATES BY PASSENGER CLASS")
print(section_rule)

for pclass in sorted(df['Pclass'].unique()):
    # Restrict to the passengers travelling in this class
    class_data = df.loc[df['Pclass'] == pclass]
    class_outcomes = class_data['Survived']

    # NOTE: `survival_rate` is reused here, so after this cell it holds the
    # rate of the last class processed rather than the overall rate.
    survival_rate = class_outcomes.mean()
    survivors = class_outcomes.sum()
    total = len(class_data)

    print(f"\nClass {pclass}:")
    print(f"  Survival rate: {survival_rate:.1%}")
    print(f"  Survivors: {survivors:.0f} out of {total}")

## 3. Visual Exploration

In [None]:
# Build a 2x2 dashboard describing survival by passenger class:
# counts, survival rates, stacked percentages, and a summary table.
fig, axes = plt.subplots(nrows=2, ncols=2, figsize=(16, 12))
fig.suptitle('Titanic Survival Analysis by Passenger Class', fontsize=16, fontweight='bold', y=1.00)
(ax_counts, ax_rates), (ax_stacked, ax_summary) = axes

# Styling shared by several panels
panel_title_style = {'fontsize': 12, 'fontweight': 'bold'}
bar_outline = {'edgecolor': 'black', 'linewidth': 1.2}
class_tick_labels = ['1st Class', '2nd Class', '3rd Class']

# --- Panel 1: passenger counts per class, split by outcome ---
survival_labels = {0: 'Died', 1: 'Survived'}
df_plot = df.assign(Survival_Status=df['Survived'].map(survival_labels))

sns.countplot(data=df_plot, x='Pclass', hue='Survival_Status',
              palette={'Died': 'indianred', 'Survived': 'seagreen'},
              ax=ax_counts)
ax_counts.set_title('Survival Count by Passenger Class', **panel_title_style)
ax_counts.set_xlabel('Passenger Class', fontsize=11)
ax_counts.set_ylabel('Number of Passengers', fontsize=11)
ax_counts.legend(title='Status')
ax_counts.grid(axis='y', alpha=0.3)

# --- Panel 2: survival rate per class as a bar chart ---
survival_by_class = df.groupby('Pclass')['Survived'].agg(['mean', 'count'])
class_rates = survival_by_class['mean']
class_rates.plot(kind='bar', color=['#2E86AB', '#A23B72', '#F18F01'],
                 ax=ax_rates, **bar_outline)
ax_rates.set_title('Survival Rate by Passenger Class', **panel_title_style)
ax_rates.set_xlabel('Passenger Class', fontsize=11)
ax_rates.set_ylabel('Survival Rate', fontsize=11)
ax_rates.set_xticklabels(class_tick_labels, rotation=45, ha='right')
ax_rates.set_ylim(0, 1.0)
# Show the 0-1 rate axis as whole percentages
ax_rates.yaxis.set_major_formatter(plt.FuncFormatter(lambda value, _pos: f'{value:.0%}'))
ax_rates.grid(axis='y', alpha=0.3)

# Annotate each bar with its rate, placed slightly above the bar top
for bar_position, rate in enumerate(class_rates):
    ax_rates.text(bar_position, rate + 0.02, f'{rate:.1%}',
                  ha='center', fontweight='bold', fontsize=10)

# --- Panel 3: outcome mix within each class, as stacked percentages ---
cross_tab = pd.crosstab(df['Pclass'], df['Survived'], normalize='index') * 100
cross_tab.plot(kind='bar', stacked=True,
               color=['indianred', 'seagreen'],
               ax=ax_stacked, **bar_outline)
ax_stacked.set_title('Survival Distribution by Class (Percentage)', **panel_title_style)
ax_stacked.set_xlabel('Passenger Class', fontsize=11)
ax_stacked.set_ylabel('Percentage', fontsize=11)
ax_stacked.set_xticklabels(class_tick_labels, rotation=45, ha='right')
ax_stacked.legend(['Died', 'Survived'], title='Status')
ax_stacked.set_ylim(0, 100)
ax_stacked.grid(axis='y', alpha=0.3)

# --- Panel 4: summary table, one row per class ---
ax_summary.axis('off')
summary_data = []
for pclass in sorted(df['Pclass'].unique()):
    class_data = df[df['Pclass'] == pclass]
    total = len(class_data)
    survived = class_data['Survived'].sum()
    died = total - survived
    survival_rate = (survived / total) * 100
    table_row = [f'Class {pclass}', total, int(survived), int(died), f'{survival_rate:.1f}%']
    summary_data.append(table_row)

table_data = summary_data
column_labels = ['Class', 'Total', 'Survived', 'Died', 'Survival %']
table = ax_summary.table(cellText=table_data,
                         colLabels=column_labels,
                         cellLoc='center',
                         loc='center',
                         colWidths=[0.15] * 5)
table.auto_set_font_size(False)
table.set_fontsize(10)
table.scale(1, 2.5)

# Header row (table row 0): white bold text on a blue background
for col in range(len(column_labels)):
    header_cell = table[(0, col)]
    header_cell.set_facecolor('#4472C4')
    header_cell.set_text_props(weight='bold', color='white')

# Data rows (table rows 1-3): one background tint per row
colors = ['#D6E9F8', '#F8E9D6', '#F8D6D6']
for row, row_color in enumerate(colors, start=1):
    for col in range(len(column_labels)):
        table[(row, col)].set_facecolor(row_color)

ax_summary.set_title('Summary Statistics', fontsize=12, fontweight='bold', pad=20)

plt.tight_layout()
plt.savefig('titanic_exploration.png', dpi=300, bbox_inches='tight')
plt.show()

print("\n✓ Exploratory visualizations created")

## 4. Creating the Contingency Table

In [None]:
# Cross-tabulate passenger class (rows) against survival outcome (columns)
contingency_table = pd.crosstab(
    index=df['Pclass'],
    columns=df['Survived'],
    rownames=['Passenger Class'],
    colnames=['Survived'],
)

# Replace the raw codes with readable labels. The labels are assigned
# positionally, so they rely on crosstab's sorted order (classes 1, 2, 3 and
# outcomes 0, 1). Assigning plain lists also drops the axis names set above.
outcome_labels = ['Died', 'Survived']
class_labels = ['1st Class', '2nd Class', '3rd Class']
contingency_table.columns = outcome_labels
contingency_table.index = class_labels

section_rule = "=" * 70
print(section_rule)
print("CONTINGENCY TABLE: Passenger Class vs Survival")
print(section_rule)
print(contingency_table)
print("\n" + section_rule)

# Display copy with marginal totals; the test itself uses `contingency_table`
row_totals = contingency_table.sum(axis=1)
contingency_with_totals = contingency_table.assign(Total=row_totals)
contingency_with_totals.loc['Total'] = contingency_with_totals.sum()

print("\nContingency Table with Totals:")
print(contingency_with_totals)

## 5. Chi-Square Test of Independence

### Hypotheses:
- **H₀ (Null Hypothesis)**: Passenger class and survival are independent (no association)
- **H₁ (Alternative Hypothesis)**: Passenger class and survival are dependent (associated)

### Assumptions:
1. Observations are independent and randomly sampled (each passenger is counted in exactly one cell)
2. Variables are categorical
3. Expected frequency in each cell ≥ 5

In [None]:
# Run the chi-square test of independence on the observed counts
chi2_stat, p_value, dof, expected_freq = chi2_contingency(observed=contingency_table)

# Effect size: Cramér's V = sqrt(chi2 / (n * min(rows - 1, cols - 1)))
n = contingency_table.values.sum()        # total number of observations
min_dim = min(contingency_table.shape) - 1  # smaller table dimension minus one
cramers_v = np.sqrt(chi2_stat / (n * min_dim))

# Interpret effect size
def interpret_cramers_v(v, df):
    """Label the magnitude of a Cramér's V value using Cohen's benchmarks.

    Cohen's small / medium / large cut-offs for V shrink as the table grows,
    so they are selected by ``df``. Published (rounded) cut-offs are used for
    ``df`` 1 through 5; for larger or non-integer ``df`` they are derived from
    Cohen's w benchmarks (0.10, 0.30, 0.50) as ``w / sqrt(df)``.

    Args:
        v: Cramér's V for the table.
        df: min(rows - 1, columns - 1) of the contingency table, which is
            what the caller in this notebook passes. This is not the
            chi-square degrees of freedom. Inside this function the name
            refers to that number, not to the notebook's DataFrame.

    Returns:
        One of "Negligible", "Small", "Medium" or "Large".

    Raises:
        ValueError: If ``df`` is smaller than 1 or ``v`` is NaN; neither has
            a meaningful label.
    """
    if df < 1:
        raise ValueError(f"df must be >= 1 to interpret Cramér's V, got {df!r}")
    if v != v:  # NaN is the only value that is not equal to itself
        raise ValueError("Cramér's V is NaN and cannot be interpreted")

    # (small, medium, large) lower bounds, keyed by min(rows - 1, cols - 1)
    cohen_cutoffs = {
        1: (0.10, 0.30, 0.50),
        2: (0.07, 0.21, 0.35),
        3: (0.06, 0.17, 0.29),
        4: (0.05, 0.15, 0.25),
        5: (0.04, 0.13, 0.22),
    }
    if df in cohen_cutoffs:
        small, medium, large = cohen_cutoffs[df]
    else:
        # Outside the table, scale Cohen's w benchmarks directly
        scale = df ** 0.5
        small, medium, large = (w / scale for w in (0.10, 0.30, 0.50))

    if v < small:
        return "Negligible"
    if v < medium:
        return "Small"
    if v < large:
        return "Medium"
    return "Large"

# Turn the effect size into a verbal label (keyed by min(rows-1, cols-1))
effect_interpretation = interpret_cramers_v(cramers_v, min_dim)

section_rule = "=" * 70
print(section_rule)
print("CHI-SQUARE TEST RESULTS")
print(section_rule)
print(f"\nChi-square statistic (χ²): {chi2_stat:.4f}")
print(f"P-value: {p_value:.6f}")
print(f"Degrees of freedom: {dof}")
print(f"\nEffect Size (Cramér's V): {cramers_v:.4f}")
print(f"Effect size interpretation: {effect_interpretation}")
print("\nSignificance level (α): 0.05")
print(section_rule)

# Decision rule: reject H0 only when the p-value is strictly below alpha
alpha = 0.05
if p_value < alpha:
    decision_lines = (
        "\n✓ REJECT THE NULL HYPOTHESIS",
        "  → There IS a statistically significant association between passenger class",
        f"     and survival (p = {p_value:.6f} < {alpha})",
    )
else:
    decision_lines = (
        "\n✗ FAIL TO REJECT THE NULL HYPOTHESIS",
        "  → There is NO statistically significant association between passenger class",
        f"     and survival (p = {p_value:.6f} ≥ {alpha})",
    )

for decision_line in decision_lines:
    print(decision_line)

print("\n" + section_rule)

## 6. Expected Frequencies and Assumptions Check

In [None]:
# Wrap the expected counts returned by the test in a labelled table
expected_df = pd.DataFrame(
    data=expected_freq,
    index=['1st Class', '2nd Class', '3rd Class'],
    columns=['Died', 'Survived'],
)

print("Expected Frequencies under Independence:")
print(expected_df.round(2))

# Assumption check: every expected count should be at least 5
min_expected = expected_freq.min()
print(f"\nMinimum expected frequency: {min_expected:.2f}")

assumption_message = (
    "✓ Assumption satisfied: All expected frequencies ≥ 5"
    if min_expected >= 5
    else "✗ WARNING: Some expected frequencies < 5. Consider Fisher's exact test."
)
print(assumption_message)

## 7. Standardized Residuals Analysis

Standardized (Pearson) residuals, computed as (observed − expected) / √expected, show which cells contribute most to the chi-square statistic; their squares sum to χ².  
As a rule of thumb, cells with |z| > 2 deviate notably from what independence would predict.

In [None]:
# Residual per cell: (observed - expected) / sqrt(expected).
# This is the Pearson form of the residual.
observed = contingency_table.values
deviation = observed - expected_freq
residuals = deviation / np.sqrt(expected_freq)

class_labels = ['1st Class', '2nd Class', '3rd Class']
outcome_labels = ['Died', 'Survived']
residuals_df = pd.DataFrame(residuals, index=class_labels, columns=outcome_labels)

section_rule = "=" * 70
print(section_rule)
print("STANDARDIZED RESIDUALS")
print(section_rule)
print(residuals_df.round(3))
print("\nInterpretation:")
print("  Positive values: More than expected under independence")
print("  Negative values: Fewer than expected under independence")
print("  |z| > 2: Statistically significant contribution")
print(section_rule)

# List every cell whose residual exceeds the |z| > 2 rule of thumb,
# walking the table row by row
print("\nCells with significant contributions (|z| > 2):")
for (row, col), z_score in np.ndenumerate(residuals):
    if not abs(z_score) > 2:
        continue
    class_name = class_labels[row]
    outcome = outcome_labels[col]
    direction = "MORE" if z_score > 0 else "FEWER"
    print(f"  • {class_name} - {outcome}: z = {z_score:.3f}")
    print(f"    → {direction} than expected under independence")

## 8. Comprehensive Visualization of Results

In [None]:
# Build a 2x2 results figure: observed counts, expected counts, residuals,
# and a text panel summarising the test outcome.
fig, axes = plt.subplots(nrows=2, ncols=2, figsize=(16, 12))
fig.suptitle('Chi-Square Test Results: Passenger Class vs Survival',
             fontsize=16, fontweight='bold', y=0.995)
(ax_observed, ax_expected), (ax_residuals, ax_summary) = axes

# Styling shared by the three heatmaps
cell_borders = {'linewidths': 2, 'linecolor': 'black'}
heatmap_title_style = {'fontsize': 12, 'fontweight': 'bold', 'pad': 10}

# --- Panel 1: observed counts ---
sns.heatmap(contingency_table, annot=True, fmt='d', cmap='YlOrRd',
            cbar_kws={'label': 'Count'}, ax=ax_observed, **cell_borders)
ax_observed.set_title('Observed Frequencies', **heatmap_title_style)

# --- Panel 2: counts expected if class and survival were independent ---
sns.heatmap(expected_df, annot=True, fmt='.1f', cmap='YlGnBu',
            cbar_kws={'label': 'Expected Count'}, ax=ax_expected, **cell_borders)
ax_expected.set_title('Expected Frequencies (Under Independence)', **heatmap_title_style)

# --- Panel 3: residuals on a diverging scale centred at zero ---
# The colour range is fixed to [-5, 5]; larger magnitudes share the end colours.
sns.heatmap(residuals_df, annot=True, fmt='.2f', cmap='RdBu_r', center=0,
            cbar_kws={'label': 'Std. Residual'}, ax=ax_residuals,
            vmin=-5, vmax=5, **cell_borders)
ax_residuals.set_title('Standardized Residuals (|z| > 2 is significant)',
                       **heatmap_title_style)

# All three heatmaps share the same axis captions
for heatmap_ax in (ax_observed, ax_expected, ax_residuals):
    heatmap_ax.set_xlabel('Survival Status', fontsize=11)
    heatmap_ax.set_ylabel('Passenger Class', fontsize=11)

# --- Panel 4: plain-text statistical summary ---
ax_summary.axis('off')

# Fixed part of the summary: test statistics and effect size
summary_text = f"""
STATISTICAL TEST RESULTS
{'='*50}

Test: Chi-Square Test of Independence

Chi-square statistic (χ²): {chi2_stat:.4f}
P-value: {p_value:.6f}
Degrees of freedom: {dof}

Effect Size (Cramér's V): {cramers_v:.4f}
Interpretation: {effect_interpretation} effect

{'='*50}
CONCLUSION (α = 0.05):

"""

# Canned conclusion text for each outcome of the test
reject_text = """✓ REJECT NULL HYPOTHESIS

There IS a statistically significant
association between passenger class and
survival on the Titanic.

Key Findings:
• 1st class passengers had significantly
  higher survival rates
• 3rd class passengers had significantly
  lower survival rates
• Social class was a major factor in
  determining survival probability
"""
fail_to_reject_text = """✗ FAIL TO REJECT NULL HYPOTHESIS

No statistically significant association
found between passenger class and survival.
"""

conclusion = reject_text if p_value < 0.05 else fail_to_reject_text
summary_text = summary_text + conclusion

ax_summary.text(0.1, 0.5, summary_text,
                fontsize=10, family='monospace',
                verticalalignment='center',
                bbox={'boxstyle': 'round', 'facecolor': 'wheat', 'alpha': 0.3, 'pad': 1})

plt.tight_layout()
plt.savefig('titanic_chi_square_results.png', dpi=300, bbox_inches='tight')
plt.show()

print("\n✓ Chi-square analysis visualization created")

## 9. Post-Hoc Analysis: Pairwise Comparisons

Since we found a significant overall association, let's perform pairwise comparisons to understand which specific class differences drive the association.

In [None]:
# Pairwise chi-square tests with Bonferroni correction.
# Each pair of classes gets its own 2-row contingency table; the significance
# level is divided by the number of pairs to control the family-wise error rate.
from itertools import combinations

print("="*70)
print("POST-HOC PAIRWISE COMPARISONS (Bonferroni Corrected)")
print("="*70)

classes = [1, 2, 3]
pairs = list(combinations(classes, 2))
n_comparisons = len(pairs)
bonferroni_alpha = 0.05 / n_comparisons

print(f"\nNumber of comparisons: {n_comparisons}")
print(f"Bonferroni-corrected α: {bonferroni_alpha:.4f}\n")

pairwise_results = []

for class1, class2 in pairs:
    # Keep only the passengers belonging to the two classes being compared
    pair_data = df[df['Pclass'].isin([class1, class2])]
    pair_table = pd.crosstab(pair_data['Pclass'], pair_data['Survived'])

    # Use pair-specific names so this loop does not rebind `chi2` (the
    # scipy.stats distribution imported at the top of the notebook) or
    # overwrite `dof` from the overall test, which other cells still print.
    # NOTE: with SciPy's default `correction=True`, tables with one degree of
    # freedom receive Yates' continuity correction, so these statistics (and
    # the V derived from them) are slightly smaller than uncorrected values.
    pair_chi2, pair_p, _pair_dof, _pair_expected = chi2_contingency(pair_table)

    # Cramér's V for a two-class table: min(rows - 1, cols - 1) is 1,
    # so the formula reduces to sqrt(chi2 / n)
    n_pair = pair_table.sum().sum()
    v_pair = np.sqrt(pair_chi2 / n_pair)

    is_significant = pair_p < bonferroni_alpha
    significant = "✓" if is_significant else "✗"

    print(f"Class {class1} vs Class {class2}:")
    print(f"  χ² = {pair_chi2:.4f}, p = {pair_p:.6f}, V = {v_pair:.4f} {significant}")

    if is_significant:
        print(f"  → Significant difference in survival rates")
    else:
        print(f"  → No significant difference")
    print()

    pairwise_results.append({
        'Comparison': f'Class {class1} vs {class2}',
        'Chi-square': pair_chi2,
        'P-value': pair_p,
        'Cramers V': v_pair,
        'Significant': is_significant
    })

# Summary table
pairwise_df = pd.DataFrame(pairwise_results)
print("\nSummary Table:")
print(pairwise_df.to_string(index=False))
print("="*70)

## 10. Final Summary and Conclusions

In [None]:
# Print a plain-text recap of the question, test, results and conclusions
banner = "#" * 70

print("\n" + banner)
print("# FINAL SUMMARY: TITANIC CHI-SQUARE ANALYSIS")
print(banner)

print("\n1. RESEARCH QUESTION:")
print("   Did passenger class significantly affect survival rates on the Titanic?")

print("\n2. STATISTICAL TEST:")
print("   Chi-Square Test of Independence")

print("\n3. KEY FINDINGS:")
print(f"   • Chi-square statistic: χ² = {chi2_stat:.4f}")
print(f"   • P-value: {p_value:.6f}")
print(f"   • Effect size (Cramér's V): {cramers_v:.4f} ({effect_interpretation})")

print("\n4. SURVIVAL RATES BY CLASS:")
for pclass in sorted(df['Pclass'].unique()):
    # Mean of the 0/1 survival flag within this class
    rate = df.loc[df['Pclass'] == pclass, 'Survived'].mean()
    print(f"   • Class {pclass}: {rate:.1%}")

print("\n5. CONCLUSION:")
# The wording depends only on whether the overall test was significant at 0.05
if p_value < 0.05:
    conclusion_lines = (
        "   ✓ There IS a statistically significant association between",
        "     passenger class and survival.",
        "   ✓ First-class passengers had significantly better survival chances.",
        "   ✓ Third-class passengers had significantly worse survival chances.",
        "   ✓ Social stratification played a significant role in survival outcomes.",
    )
else:
    conclusion_lines = (
        "   ✗ No statistically significant association found.",
    )
for conclusion_line in conclusion_lines:
    print(conclusion_line)

print("\n6. PRACTICAL IMPLICATIONS:")
implication_lines = (
    "   • Historical evidence of social inequality in disaster response",
    "   • Demonstrates importance of equitable safety protocols",
    "   • Highlights need for fair access to life-saving resources",
)
for implication_line in implication_lines:
    print(implication_line)

print("\n" + banner)
print("# ANALYSIS COMPLETE")
print(banner)

---

## Key Takeaways

1. **Chi-square test confirms** that passenger class and survival were significantly associated
2. **Effect size (Cramér's V)** indicates the strength of this association
3. **Standardized residuals** reveal which specific class-survival combinations drove the association
4. **First-class passengers** had survival advantages compared to third-class passengers
5. **Historical context** shows how social stratification affected disaster outcomes

---