In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import os
# Walk the Kaggle input directory tree and print the full path of every file.
for folder, _subdirs, file_names in os.walk('/kaggle/input'):
    for file_name in file_names:
        print(os.path.join(folder, file_name))

import matplotlib.pyplot as plt
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.compose import ColumnTransformer
from sklearn.feature_selection import mutual_info_classif, mutual_info_regression
from sklearn.impute import SimpleImputer
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import Pipeline
from xgboost import XGBClassifier

print("Libraries imported!")

In [None]:
# Set Matplotlib defaults.
# The bundled seaborn style sheets were renamed with a "seaborn-v0_8-" prefix
# in Matplotlib 3.6 and the legacy names were removed afterwards, so use
# whichever spelling this installation actually provides instead of
# hard-coding the legacy name.
for _style_name in ("seaborn-v0_8-whitegrid", "seaborn-whitegrid"):
    if _style_name in plt.style.available:
        plt.style.use(_style_name)
        break
# If neither name is available the default style is kept: the style sheet is
# purely cosmetic and should not stop the notebook from running.
plt.rc("figure", autolayout=True)
plt.rc(
    "axes",
    labelweight="bold",
    labelsize="large",
    titleweight="bold",
    titlesize=14,
    titlepad=10,
)

# Utility functions from Tutorial
def make_mi_scores(X, y):
    """Return mutual-information scores of every column of ``X`` against ``y``.

    Works on a copy of ``X``. Columns of dtype ``object`` or ``category`` are
    replaced by their integer factorization codes, and every column that ends
    up with an integer dtype is flagged as discrete for the estimator.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature table.
    y : array-like
        Target passed unchanged to ``mutual_info_regression``.

    Returns
    -------
    pandas.Series
        Scores named ``"MI Scores"``, indexed by column name and sorted from
        highest to lowest.
    """
    encoded = X.copy()
    categorical_columns = encoded.select_dtypes(["object", "category"]).columns
    for column in categorical_columns:
        # factorize() returns (codes, uniques); only the integer codes are kept.
        encoded[column] = encoded[column].factorize()[0]

    # After encoding, an integer dtype is what marks a feature as discrete.
    is_discrete = [pd.api.types.is_integer_dtype(dtype) for dtype in encoded.dtypes]
    raw_scores = mutual_info_regression(
        encoded, y, discrete_features=is_discrete, random_state=0
    )
    ranked = pd.Series(raw_scores, name="MI Scores", index=encoded.columns)
    return ranked.sort_values(ascending=False)


def plot_mi_scores(scores):
    """Draw a horizontal bar chart of mutual-information scores.

    ``scores`` is a pandas Series indexed by feature name. The bars are drawn
    on the current Matplotlib axes in ascending score order, labelled with the
    index values; nothing is returned.
    """
    ordered = scores.sort_values(ascending=True)
    positions = np.arange(len(ordered))
    plt.barh(positions, ordered)
    plt.yticks(positions, list(ordered.index))
    plt.title("Mutual Information Scores")

In [None]:
class CustomTransformer(TransformerMixin, BaseEstimator):
    """Stateless transformer that appends an engineered ``"Custom"`` column.

    Inheriting from ``BaseEstimator`` (in addition to ``TransformerMixin``)
    provides ``get_params`` / ``set_params``, so the transformer is a complete
    scikit-learn estimator that can be cloned and nested inside pipelines.
    """

    def fit(self, X, y=None):
        """Learn nothing; present only to satisfy the estimator interface.

        Returns ``self`` so calls can be chained.
        """
        return self

    def transform(self, X, y=None):
        """Return a copy of ``X`` with the extra ``"Custom"`` column.

        ``X`` must be a DataFrame with ``"CryoSleep"``, ``"Age"`` and ``"VIP"``
        columns that support multiplication. The input frame is not modified.
        """
        X_ = X.copy()
        # Custom = CryoSleep * Age + VIP * CryoSleep * Age
        X_["Custom"] = X_["CryoSleep"] * X_["Age"] + X_["VIP"] * X_["CryoSleep"] * X_["Age"]
        return X_

In [None]:
# Read the Spaceship Titanic training CSV (path relative to the working
# directory) into a DataFrame.
df = pd.read_csv("../input/spaceship-titanic/train.csv")
# Preview the first rows of the raw data.
df.head()

In [None]:
# Discard the columns that are not used in this analysis.
unused_columns = [
    'Destination', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa',
    'VRDeck', 'Name', 'HomePlanet', 'Cabin',
]
df = df.drop(columns=unused_columns)

# Encode the boolean flags as 0 / 1; any other value is left as it is.
bool_to_int = {False: 0, True: 1}
for flag_column in ('CryoSleep', 'VIP', 'Transported'):
    df[flag_column] = df[flag_column].replace(bool_to_int)

# Split the target off the feature frame.
y = df.pop('Transported')

df.head()

In [None]:
# Number of missing values in each remaining column.
print(df.isna().sum())

In [None]:
# Preview the first rows of the feature frame.
df.head()

In [None]:
# Fill missing values with the column mean, one column at a time (the same
# imputer object is re-fitted for every column).
# NOTE(review): besides Age this also mean-imputes the PassengerId column and
# the 0/1 CryoSleep / VIP flags, whose filled values can be fractional —
# confirm that this is intended.
numbers_imputer = SimpleImputer(strategy="mean")
for column in ("PassengerId", "CryoSleep", "Age", "VIP"):
    df[column] = numbers_imputer.fit_transform(df[[column]])

In [None]:
# Re-count the missing values per column after imputation, then preview the frame.
print(df.isna().sum())
df.head()

In [None]:
# `y` is the 0/1-encoded `Transported` label, i.e. a discrete class target, so
# mutual information is estimated with `mutual_info_classif`; the regression
# variant is meant for continuous targets.
mi_scores = mutual_info_classif(df, y, random_state=0)

# Label the scores with their feature names and rank them, highest first.
mi_scores = pd.Series(mi_scores, name="MI Scores", index=df.columns)
mi_scores = mi_scores.sort_values(ascending=False)

plot_mi_scores(mi_scores)


In [None]:
# Preview the first rows of the feature frame.
df.head()

In [None]:
# Preview the first values of the target series.
y.head()

In [None]:
# Impute the two flag columns and pass every other column through unchanged.
# ColumnTransformer drops unlisted columns by default, which would discard the
# engineered "Custom" feature (and Age) before they ever reach the model.
preprocessor = ColumnTransformer(
    transformers=[
        ('transform', SimpleImputer(), ['CryoSleep', 'VIP']),
    ],
    remainder='passthrough',
)
model = XGBClassifier(n_estimators=50, random_state=0)

# Feature-engineering stage: appends the "Custom" column.
custom_pipeline = Pipeline([("customfeature", CustomTransformer())])

# Full workflow: feature engineering -> preprocessing -> classifier.
# cross_val_score fits a fresh clone of this pipeline on every fold, so no
# separate fit/transform call is needed beforehand.
pipeline = Pipeline(steps=[
    ('custom', custom_pipeline),
    ('preprocessor', preprocessor),
    ('model', model)])

# scikit-learn scorers follow "greater is better", hence the sign flip.
# With 0/1 class labels the mean absolute error equals the fraction of
# misclassified rows.
scores = -1 * cross_val_score(pipeline, df, y, cv=5,
                              scoring='neg_mean_absolute_error')

print("Average MAE score:", scores.mean())