In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Recursively print the full path of every file found below /kaggle/input.
for folder, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Shell escape: print the kernel's current working directory. The relative
# '../input/...' paths passed to pd.read_csv in this notebook are resolved against it.
!pwd

In [None]:
# Read the train and test CSV files into two separate DataFrames.
train_csv, test_csv = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../input/spaceship-titanic/train.csv',
        '../input/spaceship-titanic/test.csv',
    )
)

# Merge Data

In [None]:
# Stack train and test row-wise so both receive identical preprocessing.
# ignore_index=True gives every row of the combined frame a unique label;
# without it the test rows keep their own labels (0..n-1), which duplicate
# the train labels and make label-based lookups/alignment ambiguous.
df_csv = pd.concat([train_csv, test_csv], ignore_index=True)
df_csv.tail()

# Split Columns (TODO)
- PassengerId: gggg_pp
- Cabin: deck/num/side
- Name: first and last name

# Check null columns

In [None]:
# For each column, report whether it contains at least one missing value.
df_csv.isna().any()

In [None]:
# Print the per-column dtype and non-null count of the combined frame.
df_csv.info()

# Refillna null columns
* simple refillna strategy

In [None]:
# Summary statistics, transposed so that each described column becomes a row.
df_csv.describe().transpose()

In [None]:
# Names of the columns that describe() summarises; the fill and normalize
# steps in this notebook use this same expression to pick their columns.
df_csv.describe().columns

In [None]:
# Print the value distribution of every column (the first column excluded)
# that describe() does not summarise.
# Columns are visited in DataFrame order: iterating a set difference, as
# before, yields an arbitrary order that can change between kernel runs.
numeric_cols = set(df_csv.describe().columns)
for col in df_csv.columns[1:]:
    if col not in numeric_cols:
        print(df_csv[col].value_counts())

In [None]:
def refillna(method):
    """Fill missing values using the strategy named by ``method``.

    Parameters
    ----------
    method : str
        Name of the fill strategy. Currently only ``'simple'`` is registered.

    Returns
    -------
    Whatever the selected strategy function returns.

    Raises
    ------
    KeyError
        If ``method`` is not a registered strategy name.
    """
    # Map names to the functions themselves (not to their results) so that
    # only the requested strategy runs, and an unknown name fails before any
    # work is done.
    strategies = {
        'simple': simple_refillna,
    }
    return strategies[method]()

def simple_refillna():
    """Return a filled copy of the global ``df_csv`` using fixed, simple rules.

    * Every column reported by ``df_csv.describe()`` is filled with that
      column's median.
    * ``HomePlanet`` -> ``'Earth'``, ``CryoSleep`` -> ``False``,
      ``Destination`` -> ``'TRAPPIST-1e'``, ``VIP`` -> ``False``.
    * ``Cabin`` and ``Name`` are dropped instead of being filled.

    ``df_csv`` itself is left unmodified.
    """
    fill_values = {col: df_csv[col].median() for col in df_csv.describe().columns}
    fill_values.update({
        'HomePlanet': 'Earth',
        'CryoSleep': False,
        'Destination': 'TRAPPIST-1e',
        'VIP': False,
    })
    # A single DataFrame-level fillna replaces the former
    # ``df[col].fillna(..., inplace=True)`` calls: that pattern is chained
    # in-place assignment, which pandas deprecates and which stops updating
    # the parent frame once Copy-on-Write is enabled.
    return df_csv.fillna(fill_values).drop(columns=['Cabin', 'Name'])

In [None]:
# Build the filled frame with the 'simple' strategy; later cells read it as `df`.
df = refillna('simple')

In [None]:
# Inspect per-column dtypes and non-null counts after filling.
df.info()

# Encode and normalize the columns

In [None]:
def normalize_and_encode():
    """Build the model-ready frame from the global ``df``.

    Works on a copy of ``df``: runs ``normalize`` first, then ``encode``,
    and finally removes the ``PassengerId`` column from the result.
    """
    prepared = encode(normalize(df.copy()))
    return prepared.drop(columns=['PassengerId'])

def encode(df):
    """One-hot encode ``HomePlanet`` and ``Destination`` on a copy of ``df``.

    For each of the two columns, one indicator column per distinct value is
    appended (named after the value itself, without a prefix) and the source
    column is then removed. The input frame is not modified.
    """
    encoded = df.copy()

    for source in ('HomePlanet', 'Destination'):
        indicators = pd.get_dummies(encoded[source])
        encoded[indicators.columns] = indicators
        encoded = encoded.drop(columns=[source])

    return encoded

def normalize(df):
    """Standardize the columns reported by ``df.describe()`` on a copy of ``df``.

    Each such column is shifted by its mean and divided by its sample
    standard deviation. All other columns are copied through unchanged, and
    the input frame is not modified.

    A column whose standard deviation is zero (constant column) or undefined
    (e.g. a single row) is only centered: dividing by that value would turn
    the whole column into NaN, so 1 is used as the divisor instead.
    """
    df_normalize = df.copy()
    for col in df.describe().columns:
        centered = df_normalize[col] - df_normalize[col].mean()
        spread = df_normalize[col].std()
        if pd.isna(spread) or spread == 0:
            # Zero-variance guard: keep the centered values rather than NaN.
            spread = 1.0
        df_normalize[col] = centered / spread
    return df_normalize

In [None]:
# Build the normalized, one-hot encoded frame; train_test_split() reads it as `df_final`.
df_final = normalize_and_encode()
df_final.head()

# Split Train Test data

In [None]:
def train_test_split():
    """Split the global ``df_final`` by whether ``Transported`` is known.

    Rows with a non-null ``Transported`` form the training set; rows where
    it is null form the test set.

    Returns
    -------
    (X_train, y_train, X_test)
        ``X_train`` / ``X_test`` are the respective rows without the
        ``Transported`` column; ``y_train`` is the training ``Transported``
        column cast to ``bool``.
    """
    has_label = df_final['Transported'].notnull()

    labelled = df_final[has_label]
    unlabelled = df_final[~has_label]

    features_train = labelled.drop(columns=['Transported'])
    target_train = labelled['Transported'].astype('bool')
    features_test = unlabelled.drop(columns=['Transported'])

    return features_train, target_train, features_test

In [None]:
def get_hyp_params(random_state=0):
    """Return the hyper-parameter dictionary for the classifier.

    Parameters
    ----------
    random_state : int, default 0
        Seed stored under the ``'random_state'`` key.

    Returns
    -------
    dict
        ``{'random_state': random_state}``.
    """
    # Use the argument itself; it was previously ignored in favour of a
    # hard-coded 0.
    return {
        'random_state': random_state,
    }

In [None]:
from sklearn.linear_model import LogisticRegression

# Fit a logistic-regression classifier on the labelled rows and predict the
# unlabelled ones.
X_train, y_train, X_test = train_test_split()
hyp_params = get_hyp_params(0)
clf = LogisticRegression(random_state=hyp_params['random_state'])
clf.fit(X_train.values, y_train.values)
# Predict on the raw array as well: the model was fitted on `.values`
# (no feature names), so passing the DataFrame here makes scikit-learn warn
# about a feature-name mismatch between fit and predict.
y_test = clf.predict(X_test.values)

In [None]:
# Attach the predictions to the test rows positionally. Wrapping y_test in a
# DataFrame first would make pandas align on index labels and silently write
# NaN for any test_csv label outside 0..len(y_test)-1.
# NOTE(review): this relies on y_test being in test_csv row order, which holds
# as long as the preprocessing steps keep the concatenated row order.
test_csv['Transported'] = y_test
test_csv.head()

In [None]:
# Keep only the passenger identifier and the predicted label.
result = test_csv.loc[:, ['PassengerId', 'Transported']]
result.head()

In [None]:
# Write the two-column result to result.csv in the working directory,
# leaving out the DataFrame index column.
result.to_csv('result.csv', index=False)