# Solution for SpaceShip-Titanic Comp
@ Author: Abdelrahman Hanafy <br>
@ Date: Mon, June 27

## Check Also
- This is my code for the original version of the problem: https://www.kaggle.com/code/abdelrahmanhanafy/titanic

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the read-only Kaggle input tree and print the full path of every file.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        full_path = os.path.join(root, name)
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Data Exploration

In [None]:
# Load the training set, the test set and the sample submission in one pass;
# the three names are bound in the same order as the paths below.
train_df, test_df, out_sample = (
    pd.read_csv(csv_path)
    for csv_path in (
        "/kaggle/input/spaceship-titanic/train.csv",
        "/kaggle/input/spaceship-titanic/test.csv",
        "/kaggle/input/spaceship-titanic/sample_submission.csv",
    )
)


In [None]:
train_df.head()

In [None]:
test_df.head()

In [None]:
train_df.info()
print("-"*40)
test_df.info()

In [None]:
print(train_df.isna().sum())
print("-"*40)
print(test_df.isna().sum())


In [None]:
train_df.describe(include=np.number)

In [None]:
train_df.describe(include='O')

# Wrangling

In [None]:
from sklearn.impute import SimpleImputer

imputer_cols = ["Age", "FoodCourt", "ShoppingMall", "Spa", "VRDeck", "RoomService"]

# Learn the per-column medians from the training set only and reuse them for
# the test set, so both frames are filled with the same values.
imp = SimpleImputer(strategy='median')
imp = imp.fit(train_df[imputer_cols])

train_df[imputer_cols] = imp.transform(train_df[imputer_cols])
test_df[imputer_cols] = imp.transform(test_df[imputer_cols])

# Missing home planets get their own placeholder category 'Z'.
# The result is assigned back rather than calling fillna(inplace=True) on the
# selected column: that chained in-place form is deprecated in pandas and
# leaves the parent frame unchanged once Copy-on-Write is active.
train_df["HomePlanet"] = train_df["HomePlanet"].fillna('Z')
test_df["HomePlanet"] = test_df["HomePlanet"].fillna('Z')

In [None]:
print(train_df.isna().sum())
print("-"*40)
print(test_df.isna().sum())


In [None]:
# Bucket ages into five labelled ranges; the edges and labels are defined once
# so the train and test frames are always binned identically.
# include_lowest=True keeps age 0 inside the first bucket.
age_bins = [0, 5, 18, 40, 65, 90]
age_labels = ["Chlid", "Teen", "Adult", "Middle", "old"]
for frame in (train_df, test_df):
    frame['AgeRange'] = pd.cut(
        frame.Age,
        age_bins,
        labels=age_labels,
        include_lowest=True,
    )

train_df.info()

In [None]:
# "Extra" is the passenger's total spend across the five amenity columns.
# The columns are added left to right, in the order
# RoomService + FoodCourt + ShoppingMall + Spa + VRDeck.
for frame in (train_df, test_df):
    total_spend = frame["RoomService"]
    for amenity in ("FoodCourt", "ShoppingMall", "Spa", "VRDeck"):
        total_spend = total_spend + frame[amenity]
    frame["Extra"] = total_spend

In [None]:
Extra = pd.cut(train_df.Extra, 3)
Extra.value_counts()

In [None]:
# Split each "deck/num/side" cabin string into three separate columns.
# The pieces are assigned straight onto the frame (no temporary frame and
# concat), so re-running this cell overwrites the columns instead of
# appending a second, duplicate set. Rows with a missing Cabin get NaN in
# all three columns.
cabin_parts = ["deck", "num", "side"]
train_df[cabin_parts] = train_df.Cabin.str.split("/", expand=True)
test_df[cabin_parts] = test_df.Cabin.str.split("/", expand=True)

In [None]:
train_df.head()

In [None]:
train_df.info()

# EDA

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

sns.set_style("whitegrid")


In [None]:
sns.countplot(data=train_df,x="Transported")

In [None]:
sns.boxenplot(data=train_df,x="Age")

In [None]:
# Compute the correlation matrix over the numeric and boolean columns only.
# train_df still contains string columns at this point (e.g. HomePlanet,
# Cabin), and DataFrame.corr() raises on those in pandas >= 2.0, so they are
# filtered out explicitly; select_dtypes works on older pandas as well.
corr = train_df.select_dtypes(include=["number", "bool"]).corr()

# Set up the matplotlib figure
f, ax = plt.subplots(figsize=(11, 9))

# Draw the heatmap on that figure, centred on 0 and with square cells
sns.heatmap(corr, vmax=.3, center=0,
            square=True, linewidths=.5, ax=ax)

In [None]:
f, axs = plt.subplots(1,2,figsize=(11, 9))
train_df.HomePlanet.value_counts().plot.pie(autopct='%.0f%%',ax = axs[0])
train_df.Destination.value_counts().plot.pie(autopct='%.0f%%',ax = axs[1])

In [None]:
f, axs = plt.subplots(2,1,figsize=(11, 9))
sns.countplot(data=train_df,x="HomePlanet",hue="Transported",ax=axs[0])
sns.countplot(data=train_df,x="Destination",hue="Transported",ax=axs[1])

In [None]:
sns.scatterplot(data=train_df,x="Extra",y="Age",hue="Transported")

In [None]:
f, axs = plt.subplots(2,1,figsize=(11, 9))
sns.countplot(data=train_df,x="deck",hue="Transported",ax=axs[0])
sns.countplot(data=train_df,x="side",hue="Transported",ax=axs[1])

In [None]:
sns.catplot(x="VIP", y="Extra", data=train_df)

In [None]:
f, axs = plt.subplots(2,1,figsize=(11, 9))
sns.countplot(data=train_df,x="AgeRange",hue="Transported",ax=axs[0])
sns.countplot(data=train_df,x="CryoSleep",hue="Transported",ax=axs[1])

In [None]:
train = train_df.drop(["PassengerId","Name" ,"Cabin","num"] , axis = 1)

In [None]:
print(train.isna().sum())

In [None]:
f, axs = plt.subplots(2,2,figsize=(11, 9))
sns.countplot(data=train_df,x="HomePlanet",hue="CryoSleep",ax=axs[0][0])
sns.countplot(data=train_df,x="Destination",hue="CryoSleep",ax=axs[0][1])
sns.countplot(data=train_df,x="VIP",hue="CryoSleep",ax=axs[1][0])
sns.countplot(data=train_df,x="AgeRange",hue="CryoSleep",ax=axs[1][1])


In [None]:
f, axs = plt.subplots(2,2,figsize=(11, 9))
sns.countplot(data=train_df,x="HomePlanet",hue="Destination",ax=axs[0][0])
sns.countplot(data=train_df,x="CryoSleep",hue="Destination",ax=axs[0][1])
sns.countplot(data=train_df,x="VIP",hue="Destination",ax=axs[1][0])
sns.countplot(data=train_df,x="AgeRange",hue="Destination",ax=axs[1][1])


# ML models

In [None]:
#importing Models
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler

### Models that handle NaN values 

#### encoding

In [None]:
train_df.info()

In [None]:
label_cols = ["HomePlanet", "CryoSleep","deck","side","Destination" ,"AgeRange"]
def label_encoder(train, test, columns):
    """Integer-encode categorical columns consistently across train and test.

    Each listed column is cast to ``str`` in both frames, and one shared
    mapping ``category -> code`` is built from the sorted union of the values
    seen in either frame. Using a single mapping guarantees that a category
    receives the same code in ``train`` and in ``test``, even when one frame
    lacks a category that the other one has.

    Parameters
    ----------
    train, test : pandas.DataFrame
        Frames to encode. Both are modified in place.
    columns : iterable of str
        Names of the columns to encode; each must exist in both frames.

    Returns
    -------
    tuple
        ``(train, test)`` -- the same two objects that were passed in.
    """
    for col in columns:
        train_values = train[col].astype(str)
        test_values = test[col].astype(str)
        # Sorted union -> codes 0..k-1 in alphabetical order of the categories.
        classes = sorted(set(train_values) | set(test_values))
        codes = {label: code for code, label in enumerate(classes)}
        train[col] = train_values.map(codes)
        test[col] = test_values.map(codes)
    return train, test

train ,test = label_encoder(train_df,test_df ,label_cols)

In [None]:
# Remove identifier / free-text columns (and VIP) from both frames.
# The drop stays in place on purpose: `train` and `test` are the very objects
# returned by label_encoder, so other names bound to them see the change too.
unused_cols = ["PassengerId", "Cabin", "VIP", "Name", "num"]
for frame in (train, test):
    frame.drop(columns=unused_cols, inplace=True)

In [None]:
train.info()

In [None]:
train_df.info()

In [None]:
Traget = "Transported"

# Separate the label column from the predictors.
y = train[Traget]
X = train.drop(columns=Traget)

# Hold out a third of the rows for evaluation (fixed seed for repeatability).
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=12
)

# Fit the scaler on the training split only, then apply it to both splits.
scaler = StandardScaler()
scaler.fit(X_train)
X_std_train, X_std_test = (scaler.transform(part) for part in (X_train, X_test))



#### Confusion Matrix function

In [None]:
from sklearn.metrics import confusion_matrix,classification_report,plot_confusion_matrix,ConfusionMatrixDisplay

def matrix_it(model, X_train, X_test, y_train, y_test):
    """Print a fitted model's scores and plot its test-set confusion matrix.

    Prints ``model.score`` for the training split and then for the test
    split (one value per line), predicts on ``X_test`` and plots the
    confusion matrix of those predictions against ``y_test``.

    Parameters
    ----------
    model : fitted estimator exposing ``score`` and ``predict``.
    X_train, y_train : training features and labels (used for scoring only).
    X_test, y_test : held-out features and labels.

    Returns
    -------
    None
    """
    for features, labels in ((X_train, y_train), (X_test, y_test)):
        print(model.score(features, labels))

    predictions = model.predict(X_test)
    # normalize="all" divides every cell by the total number of samples.
    matrix = confusion_matrix(y_test, predictions, normalize="all")
    ConfusionMatrixDisplay(matrix).plot(cmap=plt.cm.PuBuGn)
    

## XGBClassifier	

In [None]:
import xgboost as xgb

# Hyper-parameters for the gradient-boosted tree classifier, kept in one dict
# so they are easy to read and tweak.
xgb_params = {
    "n_estimators": 100,
    "max_depth": 5,
    "gamma": 1.5,
    "subsample": 1.0,
    "colsample_bytree": 1.0,
}
xgb_model = xgb.XGBClassifier(**xgb_params)

In [None]:
xgb_model.fit(X_std_train,y_train)
matrix_it(xgb_model,X_std_train , X_std_test , y_train , y_test)

In [None]:
xgb_model.fit(X_train,y_train)
matrix_it(xgb_model,X_train , X_test , y_train , y_test)

In [None]:
# Row-normalised confusion matrix of the XGBoost model on the hold-out split.
# xgb_model was most recently fitted on the unscaled X_train, so it is
# evaluated on the unscaled X_test (not X_std_test) to match.
# ConfusionMatrixDisplay.from_estimator replaces plot_confusion_matrix, which
# has been removed from scikit-learn.
ConfusionMatrixDisplay.from_estimator(xgb_model,
                                      X_test,
                                      y_test,
                                      cmap = "summer",
                                      normalize = "true");

#### Notes about Features
- These features appeared to influence the target, but their NaN values still need to be handled:
    - CryoSleep
    - HomePlanet
    - deck & side of the cabin
    - Destination 

In [None]:
y = train_df["Transported"]
features = ["CryoSleep","HomePlanet",'deck','side','Destination']
# By this point these columns hold integer codes (label_encoder modified
# train_df in place), and get_dummies leaves numeric columns untouched unless
# they are named explicitly through `columns`.
X = pd.get_dummies(train_df[features], columns=features)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

# Re-fit the scaler on the new split. Without this, X_std_train / X_std_test
# would still come from the earlier split (different features and row count)
# and no longer line up with y_train / y_test in the cells that follow.
scaler = StandardScaler()
scaler.fit(X_train)
X_std_train = scaler.transform(X_train)
X_std_test = scaler.transform(X_test)

## Support Vector

In [None]:
svc = SVC()
svc.fit(X_train,y_train)
matrix_it(svc,X_train, X_test, y_train, y_test)

In [None]:
svc.fit(X_std_train,y_train)
matrix_it(svc,X_std_train , X_std_test , y_train , y_test)

## Random Forest

In [None]:
rf = RandomForestClassifier(n_estimators=250, max_depth=7, random_state=1)
rf.fit(X_train,y_train)

matrix_it(rf,X_train, X_test, y_train, y_test)

In [None]:
rf.fit(X_std_train,y_train)
matrix_it(rf,X_std_train , X_std_test , y_train , y_test)

## Logistic Regression

In [None]:
lr = LogisticRegression(random_state=42, C=10)
lr.fit(X_train,y_train)

matrix_it(lr,X_train, X_test, y_train, y_test)

In [None]:
lr.fit(X_std_train,y_train)
matrix_it(lr,X_std_train , X_std_test , y_train , y_test)

# OUTPUT

In [None]:
out_sample.info()

In [None]:
X_out = test
X_out.info()

In [None]:
# Refit on the full training frame using exactly the columns of X_out, in the
# same order. The global X at this point holds only the reduced categorical
# feature set, which does not match the columns X_out is predicted on.
X_full = train[X_out.columns]
y_full = train["Transported"]
xgb_model.fit(X_full, y_full)

# The submission needs booleans; astype(bool) handles both 0/1 and
# True/False predictions.
pred_ = pd.Series(xgb_model.predict(X_out)).astype(bool)
len(pred_)

In [None]:
out_sample["Transported"] = pred_

In [None]:
out_sample.describe()

In [None]:
out_sample.info()

In [None]:
out_sample.head()

In [None]:
out_sample.to_csv("submission_xgb_final.csv",index=False)