In [None]:
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, RandomizedSearchCV
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
import xgboost as xgb
import lightgbm as lgb

In [None]:
# Notebook display settings: lift pandas' truncation limits so that displayed
# frames show every row and every column (None means "no limit").
for display_option in ('display.max_rows', 'display.max_columns'):
    pd.set_option(display_option, None)

In [None]:
# Read the training data (raw_df) and the test data to predict on (test_df).
raw_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../input/spaceship-titanic/train.csv',
        '../input/spaceship-titanic/test.csv',
    )
)

In [None]:
# Preview the first five rows of the training data.
raw_df.head(n=5)

In [None]:
# Print a summary of the training data: column dtypes and non-null counts.
raw_df.info()

In [None]:
# Partition the training frame by dtype: float64 columns are treated as
# numerical features and object columns as categorical features.
numerical_cols_df = raw_df.select_dtypes(include='float64')
categorical_cols_df = raw_df.select_dtypes(include='object')

# Plain lists of the column names in each group.
categorical_cols = categorical_cols_df.columns.tolist()
numerical_cols = numerical_cols_df.columns.tolist()

In [None]:
# Plotly template used by the categorical histogram: Times New Roman, size 13,
# on an 800px-wide figure.
temp = {
    'layout': go.Layout(
        font={'family': 'Times New Roman', 'size': 13},
        width=800,
    )
}

In [None]:
# Remove the 'PassengerId', 'Name' and 'Cabin' columns from the categorical
# frame so that only the remaining object columns are plotted and encoded.
#
# The frame is rebuilt from raw_df instead of being dropped in place, which
# makes this cell idempotent: re-running it no longer raises a KeyError for
# columns that a previous run already removed.
categorical_cols_df = (
    raw_df
    .select_dtypes(include=['object'])
    .drop(columns=['PassengerId', 'Name', 'Cabin'])
)

In [None]:
# Histogram of the values in the categorical columns, with a box-plot
# marginal, styled with the shared `temp` template.
fig = px.histogram(categorical_cols_df, marginal='box').update_layout(
    template=temp,
    xaxis_title='Features Values',
)
fig.show()

In [None]:
# Scatter every numerical column against the row index, one trace per column.
# NOTE(review): the [1:] slice skips the first float64 column - presumably
# 'Age', which is plotted on its own in the following cell; confirm.
fig = go.Figure()
for col in numerical_cols[1:]:
    fig.add_trace(go.Scatter(y=numerical_cols_df[col], name=col, mode='markers'))

# Axis titles apply to the whole figure, so they are set once after the loop
# instead of being re-applied on every iteration.
fig.update_xaxes(title='Index')
fig.update_yaxes(title='Value')
fig.update_layout(template='plotly_dark', width=1200)
fig.show()

In [None]:
# Scatter 'Age' against the row index, coloured by the 'Transported' column.
fig = px.scatter(raw_df, y='Age', color='Transported').update_layout(
    template='plotly_dark',
    width=1200,
)
fig.show()

In [None]:
# Column names handed to the ColumnTransformer: numerical (n_cols) and
# categorical (c_cols) feature names, taken from the current frames.
n_cols = numerical_cols_df.columns.tolist()
c_cols = categorical_cols_df.columns.tolist()

In [None]:
# Numerical features: fill missing values with each column's most frequent
# value, then standardise to zero mean and unit variance.
numerical_columns_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('scaler', StandardScaler())
])

# Categorical features: fill missing values with each column's most frequent
# value, then one-hot encode.
# handle_unknown='ignore' makes the encoder emit an all-zero row for a
# category that was not present when it was fitted, instead of raising a
# ValueError when previously unseen data is transformed.
categorical_columns_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehotencoder', OneHotEncoder(handle_unknown='ignore'))
])

In [None]:
# Route each group of columns through its own pipeline; columns that are not
# listed in n_cols or c_cols are dropped (the ColumnTransformer default,
# spelled out here for the reader).
preprocessor = ColumnTransformer(
    [
        ('num', numerical_columns_transformer, n_cols),
        ('cat', categorical_columns_transformer, c_cols),
    ],
    remainder='drop',
)

In [None]:
# Feature frame: the numerical columns followed by the categorical columns,
# aligned on the shared row index.
X_train = numerical_cols_df.join(categorical_cols_df, how='left')

# Target: the last column of the raw training frame.
Y_train = raw_df[raw_df.columns[-1]]

In [None]:
# Fit the preprocessing pipelines on the training features and transform them
# in a single step; the resulting matrix is displayed as the cell output.
x_train = preprocessor.fit_transform(X=X_train)
x_train

In [None]:
# Base estimator for the hyperparameter search; the fixed random_state makes
# the forest's own randomness repeatable between runs.
rf_model = RandomForestClassifier(random_state=42)

# Hyperparameter Tuning

In [None]:
# Search space sampled by the randomized hyperparameter search.
# 'max_features' uses 'sqrt' rather than 'auto': for classifiers 'auto' meant
# sqrt(n_features), it was deprecated in scikit-learn 1.1 and removed in 1.3,
# where passing it makes every fit of that candidate fail.
parameters = {
    'n_estimators': [90, 100, 115, 130],
    'criterion': ['gini', 'entropy'],
    'max_depth': range(2, 20, 1),
    'min_samples_leaf': range(1, 10, 1),
    'min_samples_split': range(2, 10, 1),
    'max_features': ['sqrt', 'log2']
}

In [None]:
# Randomized search over `parameters` with 5-fold cross-validation.
# random_state pins which candidate settings are sampled, so the best score,
# the best parameters and the final predictions are reproducible on a
# Restart & Run All (42 matches the seed already used for rf_model).
clf = RandomizedSearchCV(rf_model, parameters, cv=5, random_state=42)

In [None]:
# Run the hyperparameter search on the preprocessed training data.
clf.fit(X=x_train, y=Y_train)

In [None]:
# Report the best mean cross-validated score found by the search.
best_score_report = 'Best Score : {}'.format(clf.best_score_)
print(best_score_report)

In [None]:
# Show the parameter combination that achieved the best search score.
clf.best_params_

The categorical columns of the test data show almost the same patterns as those of the training data.

In [None]:
# Build the submission frame: one 'Transported' prediction per test passenger.

# Passenger ids are looked up by name so the submission does not depend on
# the column order of the CSV.
id_col = test_df['PassengerId']

# Hand the preprocessor exactly the feature columns it was fitted on.
# test_df itself is left untouched (no in-place drop, no reassignment), so
# this cell can be re-run safely.
test_features = test_df[list(X_train.columns)]

# transform(), not fit_transform(): the test data must be imputed, scaled and
# encoded with the statistics and categories learned from the training data.
# Re-fitting on the test set would feed the model features on a different
# scale (and possibly a different one-hot layout) than it was trained on.
# NOTE: transform raises on a category never seen during fitting unless the
# one-hot encoder is configured with handle_unknown='ignore'.
x_test = preprocessor.transform(test_features)
y_predict = clf.predict(x_test)

df = {'PassengerId': id_col, 'Transported': y_predict}
final_df = pd.DataFrame(df)

In [None]:
# Write the submission file without the DataFrame index column.
final_df.to_csv(path_or_buf='submission.csv', index=False)

# Thank You !