# Titanic - ProfileReport EDA

As you gain experience, the question becomes how to get the best results for the least effort.

Introducing `from pandas_profiling import ProfileReport`
- https://pandas-profiling.github.io/pandas-profiling/docs/master/index.html

In [None]:
 %%html
<!--
  Give every iframe in the notebook a fixed, very tall height (550em).
  Presumably this lets the profile reports embedded via to_notebook_iframe()
  be read without an inner scrollbar - TODO confirm.
  Open question: is there a better way than hard coding the height?
-->
<style> 
iframe { height: 550em; }
</style>

In [None]:
import pandas as pd
from pandas_profiling import ProfileReport

# Display limits for DataFrame output: up to 999 columns, but only 6 rows.
pd.set_option('display.max_columns', 999)
pd.set_option('display.max_rows', 6)

In [None]:
# Shell escape: list every regular file ending in .csv anywhere under ../input/
!find ../input/ -type f -name '*.csv'

# Data Preprocessing

In [None]:
# Source: https://www.kaggle.com/code/jamesmcguigan/spaceship-titanic-xgboost/
def enhance(df):
    """Cast columns to explicit dtypes and add name/cabin features.

    The frame is modified in place and is also returned, so both
    ``enhance(df)`` and ``df = enhance(df)`` leave ``df`` enhanced.

    Args:
        df: DataFrame containing the columns HomePlanet, Cabin, Destination,
            Name, CryoSleep, VIP, Age, RoomService, FoodCourt, ShoppingMall,
            Spa and VRDeck.

    Returns:
        The same DataFrame object, with the columns above re-typed and the
        categorical columns FirstName, LastName, Cabin/Deck, Cabin/Num and
        Cabin/Side added.
    """
    for col in ['HomePlanet', 'Cabin', 'Destination', 'Name']:
        df[col] = df[col].astype('category')

    for col in ['CryoSleep', 'VIP']:
        # NOTE: astype(bool) turns missing values (NaN) into True, because
        # NaN is truthy. The alternative below would treat missing as False.
        # df[col] = df[col].fillna(False).astype(bool)
        df[col] = df[col].astype(bool)

    for col in ['Age', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']:
        # Missing values are replaced by 0 so the column can be stored as int.
        df[col] = df[col].fillna(0).astype(int)

        ### Avoid normalizing for AutoEDA
        ### Normalizing ints improves XGboost score 0.50783 -> 0.69932
        ### FillNA(mean) -> FillNA(0) reduces score 0.69932 -> 0.65583
        # df[col] = df[col].fillna(train_df[col].mean())    # Fill NA with mean
        # df[col] = df[col] / train_df[col].max()           # Normalize to range [0-1]

    # `n` must be passed by keyword: pandas 2.x rejects it as a positional
    # argument (positional use was deprecated in earlier releases).
    # Each column is split once and the pieces are reused.

    # Splitting FirstName + Surname reduces score 0.69932 -> 0.50713
    name_parts = df['Name'].str.split(' ', n=1)
    df['FirstName'] = name_parts.str[0].astype('category')
    df['LastName'] = name_parts.str[-1].astype('category')

    # Split Cabin -> Deck/Num/Side
    cabin_parts = df['Cabin'].str.split('/', n=2)
    df['Cabin/Deck'] = cabin_parts.str[0].astype('category')
    df['Cabin/Num'] = cabin_parts.str[1].astype('category')
    df['Cabin/Side'] = cabin_parts.str[2].astype('category')
    return df

# Train Dataset

In [None]:
# Load the training CSV and run it through enhance() in one step,
# then display the resulting frame as the cell output.
train_df = enhance(pd.read_csv('../input/spaceship-titanic/train.csv'))
train_df

In [None]:
%%time
# Build the explorative, non-minimal profile of the training frame.
auto_eda = ProfileReport(
    train_df,
    title="Spaceship Titanic - Machine Learning Disaster - Train",
    explorative=True,
    minimal=False,
    progress_bar=False,
)

# Render the report inline as an iframe.
# BUGFIX: https://github.com/pandas-profiling/pandas-profiling/issues/493
auto_eda.to_notebook_iframe()

# Test Dataset

In [None]:
# Load the test CSV and run it through enhance() in one step,
# then display the resulting frame as the cell output.
test_df = enhance(pd.read_csv('../input/spaceship-titanic/test.csv'))
test_df

In [None]:
%%time
# Build the explorative, non-minimal profile of the test frame.
auto_eda = ProfileReport(
    test_df,
    title="Spaceship Titanic - Machine Learning Disaster - Test",
    explorative=True,
    minimal=False,
    progress_bar=False,
)

# Render the report inline as an iframe.
# BUGFIX: https://github.com/pandas-profiling/pandas-profiling/issues/493
auto_eda.to_notebook_iframe()