# File and Data Field Descriptions
* train.csv - Personal records for about two-thirds (~8700) of the passengers, to be used as training data.
    * PassengerId - A unique Id for each passenger. Each Id takes the form gggg_pp where gggg indicates a group the passenger is travelling with and pp is their number within the group. People in a group are often family members, but not always.
    * HomePlanet - The planet the passenger departed from, typically their planet of permanent residence.
    * CryoSleep - Indicates whether the passenger elected to be put into suspended animation for the duration of the voyage. Passengers in cryosleep are confined to their cabins.
    * Cabin - The cabin number where the passenger is staying. Takes the form deck/num/side, where side can be either P for Port or S for Starboard.
    * Destination - The planet the passenger will be debarking to.
    * Age - The age of the passenger.
    * VIP - Whether the passenger has paid for special VIP service during the voyage.
    * RoomService, FoodCourt, ShoppingMall, Spa, VRDeck - Amount the passenger has billed at each of the Spaceship Titanic's many luxury amenities.
    * Name - The first and last names of the passenger.
    * Transported - Whether the passenger was transported to another dimension. This is the target, the column you are trying to predict.

* test.csv - Personal records for the remaining one-third (~4300) of the passengers, to be used as test data. Your task is to predict the value of Transported for the passengers in this set.
* sample_submission.csv - A submission file in the correct format.
    * PassengerId - Id for each passenger in the test set.
    * Transported - The target. For each passenger, predict either True or False.

# Import libraries

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import plotly.express as px
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier

In [None]:
# Read the training and test tables from the competition input directory.
df_train = pd.read_csv('../input/spaceship-titanic/train.csv')
df_test = pd.read_csv('../input/spaceship-titanic/test.csv')

In [None]:
# Preview the first rows of the training table.
df_train.head()

In [None]:
# Number of missing values per column in the training table.
df_train.isna().sum()

In [None]:
# Number of missing values per column in the test table.
df_test.isna().sum()

In [None]:
# Horizontal bar chart of the number of training rows per target value,
# with each bar annotated by its count.
transported_count = df_train['Transported'].value_counts()
fig, ax = plt.subplots(figsize=(5, 5))
bars = ax.barh(
    y=transported_count.index.astype(str),
    width=transported_count.to_list(),
)
ax.bar_label(bars)

In [None]:
# Pairwise correlations of the numeric and boolean columns only. The frame
# still contains string columns at this point, and DataFrame.corr() raises on
# non-numeric data in pandas >= 2.0 instead of silently skipping it.
df_train.select_dtypes(include=['number', 'bool']).corr()

In [None]:
# Heatmap of the correlation matrix, with the coefficient printed in each cell.
# Only numeric/boolean columns are used: DataFrame.corr() raises on the string
# columns in pandas >= 2.0.
fig = px.imshow(
    df_train.select_dtypes(include=['number', 'bool']).corr(),
    text_auto=True,
)
fig.show()

In [None]:
# Remove the Name, PassengerId and Cabin columns from both frames, in place.
df_train.drop(columns=['Name', 'PassengerId', 'Cabin'], inplace=True)
df_test.drop(columns=['Name', 'PassengerId', 'Cabin'], inplace=True)

In [None]:
# Column dtypes and non-null counts after dropping the unused columns.
df_train.info()

## HomePlanet - The planet the passenger departed from, typically their planet of permanent residence.

In [None]:
# Distribution of HomePlanet in the training set.
# (The previous `df = px.data.tips()` line loaded an unrelated sample dataset
# into an unused global and has been removed.)
fig = px.histogram(df_train, x="HomePlanet")
fig.show()

In [None]:
# Distribution of HomePlanet in the test set.
fig = px.histogram(data_frame=df_test, x="HomePlanet")
fig.show()

## CryoSleep - Indicates whether the passenger elected to be put into suspended animation for the duration of the voyage. Passengers in cryosleep are confined to their cabins.

In [None]:
# Distribution of CryoSleep in the training set.
fig = px.histogram(data_frame=df_train, x="CryoSleep")
fig.show()

In [None]:
# Distribution of CryoSleep in the test set.
fig = px.histogram(data_frame=df_test, x="CryoSleep")
fig.show()

## Destination - The planet the passenger will be debarking to.

In [None]:
# Distribution of Destination in the training set.
fig = px.histogram(data_frame=df_train, x="Destination")
fig.show()

In [None]:
# Distribution of Destination in the test set.
fig = px.histogram(data_frame=df_test, x="Destination")
fig.show()

## VIP - Whether the passenger has paid for special VIP service during the voyage.

In [None]:
# Distribution of VIP in the training set.
fig = px.histogram(data_frame=df_train, x="VIP")
fig.show()

In [None]:
# Distribution of VIP in the test set.
fig = px.histogram(data_frame=df_test, x="VIP")
fig.show()

## Transported - Whether the passenger was transported to another dimension. This is the target, the column you are trying to predict.


In [None]:
# Distribution of the target column in the training set.
fig = px.histogram(data_frame=df_train, x="Transported")
fig.show()

# Preprocessing

In [None]:
def ProcessNum(df):
    """Return the float64 columns of ``df`` with missing values mean-imputed.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to process. It is not modified.

    Returns
    -------
    pandas.DataFrame
        A new frame containing only the ``float64`` columns of ``df``, under
        their original names and on the original index, with every NaN
        replaced by the mean of its own column.

    Notes
    -----
    The mean is computed from ``df`` itself, so calling this separately on
    two frames imputes each with its own statistics.
    A column that is entirely NaN has no mean and is returned unchanged.
    """
    num_data = df.select_dtypes(['float64'])
    # Filling with a Series keyed by column name imputes each column with its
    # own mean. Names and index are carried over by pandas, so they cannot
    # drift out of step with the data and the result aligns with the other
    # Process* outputs when concatenated column-wise.
    return num_data.fillna(num_data.mean())

def ProcessObj(df):
    """Impute and encode the object-dtype columns of ``df``.

    Each object column has its missing values filled with the column's most
    frequent value and is then replaced by integer codes ``1..k`` assigned in
    sorted order of its distinct values. ``HomePlanet`` and ``Destination``
    are finally expanded into one-hot indicator columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to process; must contain object columns named ``HomePlanet``
        and ``Destination``. It is not modified.

    Returns
    -------
    pandas.DataFrame
        The remaining object columns as integer codes, followed by
        ``HomePlanet_<code>`` and ``Destination_<code>`` indicator columns
        of dtype ``uint8``.

    Raises
    ------
    KeyError
        If ``HomePlanet`` or ``Destination`` is not among the object columns.

    Notes
    -----
    Codes are derived from the sorted distinct values rather than from order
    of appearance, so two frames holding the same set of categories are
    encoded identically. Frames with different category sets still get
    different codes, because the encoding is computed per call.
    """
    # Work on a copy so assignments never touch (or warn about) ``df``.
    obj_data = df.select_dtypes(['object']).copy()
    for col in obj_data.columns:
        modes = obj_data[col].mode()
        # An all-NaN column has no mode; leave it for factorize to code as 0.
        if not modes.empty:
            obj_data[col] = obj_data[col].fillna(modes.iloc[0])
        # Encode column by column: a frame-wide replace would let one
        # column's mapping rewrite matching values in another column.
        # factorize returns 0-based codes (-1 for NaN); shift to 1-based.
        codes, _ = pd.factorize(obj_data[col], sort=True)
        obj_data[col] = codes + 1
    # A fixed dtype keeps the indicators numeric on every pandas version
    # (newer releases default to bool).
    return pd.get_dummies(
        obj_data,
        columns=['HomePlanet', 'Destination'],
        prefix=['HomePlanet', 'Destination'],
        dtype='uint8',
    )

def ProcessBool(df):
    """Return the bool columns of ``df`` converted to 0/1 integers.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to process. It is not modified.

    Returns
    -------
    pandas.DataFrame
        A new frame containing only the ``bool`` columns of ``df``, with
        ``False`` encoded as 0 and ``True`` as 1 (dtype ``int64``).
    """
    # A direct cast always maps True to 1. A label encoder numbers the classes
    # it actually sees, so a column containing only True would come out as 0.
    return df.select_dtypes(['bool']).astype('int64')
    

# Process each dtype family separately, then join the pieces column-wise.
# Only the training frame is passed through ProcessBool.
train_num = ProcessNum(df_train)
train_cat = ProcessObj(df_train)
train_bool = ProcessBool(df_train)
test_num = ProcessNum(df_test)
test_cat = ProcessObj(df_test)

train_data_process = pd.concat([train_num, train_cat, train_bool], axis=1)
test_data_process = pd.concat([test_num, test_cat], axis=1)

In [None]:
# Show two randomly drawn rows of the processed training frame.
train_data_process.sample(2)

In [None]:
# Column dtypes and non-null counts of the processed training frame.
train_data_process.info()

In [None]:
# Remaining missing values per column in the processed training frame.
train_data_process.isna().sum()

In [None]:
# Show two randomly drawn rows of the processed test frame.
test_data_process.sample(2)

In [None]:
# Column dtypes and non-null counts of the processed test frame.
test_data_process.info()

In [None]:
# Remaining missing values per column in the processed test frame.
test_data_process.isna().sum()

# Splitting data

In [None]:
# Separate the target column from the predictors; both become NumPy arrays.
targets = train_data_process['Transported'].to_numpy(copy=True)
features = train_data_process.drop(columns='Transported').to_numpy(copy=True)

In [None]:
# Hold out 20% of the rows for validation; the fixed seed makes the split repeatable.
x_train, x_val, y_train, y_val = train_test_split(features, targets, test_size=0.2, random_state=0)

# Select Model

In [None]:
# 10-fold cross-validated grid search over forest size and tree depth.
# The forest is seeded so repeated runs give the same scores and winner.
rf = RandomForestClassifier(random_state=0)
RF_grid = {'n_estimators': [50, 100, 150, 200], 'max_depth': [4, 6, 8, 10, 12]}
grid = GridSearchCV(rf, RF_grid, cv=10, scoring='accuracy', return_train_score=False, verbose=1)
grid_search = grid.fit(x_train, y_train)
# Best mean cross-validated accuracy, as a percentage.
accuracy = grid_search.best_score_ * 100
print(accuracy)
# Report which combination won, so it does not have to be guessed afterwards.
print(grid_search.best_params_)

In [None]:
# Fit the final forest with the hyper-parameters selected by the grid search
# (instead of hard-coded values that can go stale) and score it on the
# held-out validation rows. Seeded for repeatable results.
rf = RandomForestClassifier(random_state=0, **grid_search.best_params_)
rf.fit(x_train, y_train)
y_pred = rf.predict(x_val)
print("Accuracy: "+str(accuracy_score(y_val, y_pred)))

In [None]:
submission = pd.read_csv('../input/spaceship-titanic/sample_submission.csv')
# Give the test matrix exactly the training feature columns, in the same
# order. The two frames are encoded independently, so a category missing from
# one of them would otherwise shift or drop columns. Indicators absent from
# the test frame are filled with 0.
features_test = np.array(
    test_data_process.reindex(
        columns=train_data_process.columns.drop('Transported'),
        fill_value=0,
    )
)
y_test_pred = rf.predict(features_test)

In [None]:
# Store the predictions as booleans in the Transported column and write the
# submission file without the index column.
submission['Transported'] = y_test_pred.astype(bool)
submission.to_csv('./submission.csv', index=False)