In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found in it.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Load Data

In [None]:
# Read the training split (it contains the 'Transported' target used below) and the test split.
train, test = (
    pd.read_csv('/kaggle/input/spaceship-titanic/train.csv'),
    pd.read_csv('/kaggle/input/spaceship-titanic/test.csv'),
)

In [None]:
# Show the training frame as the cell output.
train

In [None]:
# Show the test frame as the cell output.
test

# Data Processing

In [None]:
import pandas as pd
import numpy as np

In [None]:
# Join the train and test features so every preprocessing step is applied identically to both.
# The target column is split off first; it is removed from the features before joining.
train_y = train['Transported']
train_X = train.drop('Transported', axis=1)
# ignore_index=True gives the combined frame a unique 0..n-1 index. Without it the
# test rows keep their own labels starting at 0, duplicating labels already used by
# the training rows, which makes label-based selection and alignment ambiguous.
joint = pd.concat([train_X, test], axis=0, ignore_index=True)

In [None]:
# Print the column summary of the combined frame.
joint.info()

## Quantitative data

In [None]:
# Columns treated as quantitative features.
quantative = [
    'Age',
    'RoomService',
    'FoodCourt',
    'ShoppingMall',
    'Spa',
    'VRDeck',
]
# Number of missing values in each of these columns.
joint[quantative].isna().sum()

In [None]:
# Fill missing values with the column mean, then truncate to whole numbers.
# The result is assigned back to the column: calling fillna(inplace=True) on
# joint[col] acts on an intermediate Series, which recent pandas 2.x flags with a
# FutureWarning and which no longer updates joint under Copy-on-Write (the
# int64 cast would then fail on the NaN values that were left behind).
for col in quantative:
    joint[col] = joint[col].fillna(joint[col].mean()).astype('int64')

### Grouping age

In [None]:
# Summary statistics of Age, used to choose the age-group boundaries below.
joint['Age'].describe()

In [None]:
def age_group(x):
    """Map an age to its decade bucket.

    Ages in [0, 10) give 1, [10, 20) give 2, and so on up to [60, 70) giving 7;
    every age of 70 or more gives 8. Negative ages and NaN fall into no bucket
    and yield None.
    """
    # `not x >= 0` is also true for NaN, which compares false against everything.
    if not x >= 0:
        return None
    # Upper bounds 10, 20, ..., 70 paired with group numbers 1..7.
    for group, upper in enumerate(range(10, 80, 10), start=1):
        if x < upper:
            return group
    return 8

# Bucket every age with age_group, store the result as a new column,
# then show how many rows fall into each bucket.
age_groups = joint['Age'].apply(age_group)
joint['AgeGroup'] = age_groups
joint['AgeGroup'].value_counts()

## Category data

In [None]:
# Columns encoded directly as categories.
# Cabin and Name are processed separately because each one packs several pieces of information into one value.
category = [
    'HomePlanet',
    'CryoSleep',
    'Destination',
    'VIP',
]

In [None]:
# Fill missing values with the most frequent value, then replace each distinct
# value by an integer code.
# The filled column is built explicitly and assigned back: fillna(inplace=True) on
# joint[col] acts on an intermediate Series and does not update joint under
# Copy-on-Write, in which case factorize would silently encode missing values as -1.
for col in category:
    filled = joint[col].fillna(joint[col].mode()[0])
    joint[col] = pd.factorize(filled)[0]

### Decompose Cabin data

In [None]:
# Cabin as raw data identifies a unique place - almost like a passenger ID.
# But each element of the Cabin value has its own meaning.

In [None]:
def def_cabin(i):
    """Split a cabin string on '/' into a Series labelled Deck, Num and Side.

    i: a cabin string with three '/'-separated parts, or a missing value.
    Returns a pandas Series indexed ['Deck', 'Num', 'Side']; all three entries
    are NaN when the cabin is missing.

    Missing values are detected with pd.isna rather than `is np.nan`: the identity
    test only matches the np.nan singleton, so None or any other NaN object would
    reach .split and raise AttributeError.
    A string that does not split into exactly three parts does not fit the
    three-label index and makes the Series constructor raise.
    """
    if pd.isna(i):
        result = [np.nan] * 3
    else:
        result = i.split('/')
    return pd.Series(result, index=['Deck', 'Num', 'Side'])

# Split every Cabin value into its three components and store them as new columns.
joint[['Deck','Num','Side']] = joint['Cabin'].apply(def_cabin)

# Fill missing components with the most frequent value, then replace each distinct
# value by an integer code.
# The filled column is built explicitly and assigned back: fillna(inplace=True) on
# joint[col] acts on an intermediate Series and does not update joint under
# Copy-on-Write, in which case factorize would silently encode missing values as -1.
for col in ['Deck','Num','Side']:
    filled = joint[col].fillna(joint[col].mode()[0])
    joint[col] = pd.factorize(filled)[0]


In [None]:
# Show the three encoded cabin columns.
joint[['Deck','Num','Side']]

### Decompose Name

In [None]:
# Name as raw data identifies a unique person - almost like a passenger ID.
# But we can assume that the last name indicates family membership.
# The first name may also carry meaning.

In [None]:
def def_name(i):
    """Split a name on single spaces into a Series labelled FirstName and LastName.

    i: a name string made of two space-separated parts, or a missing value.
    Returns a pandas Series indexed ['FirstName', 'LastName']; both entries are
    NaN when the name is missing.

    Missing values are detected with pd.isna rather than `is np.nan`: the identity
    test only matches the np.nan singleton, so None or any other NaN object would
    reach .split and raise AttributeError.
    A name that does not split into exactly two parts does not fit the two-label
    index and makes the Series constructor raise.
    """
    if pd.isna(i):
        result = [np.nan] * 2
    else:
        result = i.split(' ')
    return pd.Series(result, index=['FirstName', 'LastName'])

# Split every Name value into first and last name and store them as new columns.
joint[['FirstName','LastName']] = joint['Name'].apply(def_name)

# Fill missing parts with the most frequent value, then replace each distinct
# value by an integer code.
# The filled column is built explicitly and assigned back: fillna(inplace=True) on
# joint[col] acts on an intermediate Series and does not update joint under
# Copy-on-Write, in which case factorize would silently encode missing values as -1.
for col in ['FirstName','LastName']:
    filled = joint[col].fillna(joint[col].mode()[0])
    joint[col] = pd.factorize(filled)[0]

# Drop redundant data

In [None]:
# Show the first rows of the processed frame.
joint.head()

In [None]:
# Print the column summary again, now including the derived columns.
joint.info()

In [None]:
# Count the remaining missing values per column.
# Cabin and Name are never filled in the cells above, so they may still report
# missing values here; both are dropped in the next step.
joint.isna().sum()

In [None]:
# Drop the identifier and the raw Cabin/Name columns, which have been decomposed
# into separate encoded columns above.
joint_X = joint.drop(['PassengerId', 'Cabin', 'Name'], axis=1)

# Split back by position: the training rows were concatenated first, so the first
# len(train_y) rows are the training features and the remainder is the test set.
# The count is taken from the data instead of being hard-coded, so the split stays
# correct if the input files change.
n_train = len(train_y)
train_X = joint_X.iloc[:n_train]
test_X = joint_X.iloc[n_train:]

# Models application

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.metrics import accuracy_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

## Decision Tree cross validation

In [None]:
# Tune a single decision tree with a 5-fold cross-validated grid search over
# depth and the number of features considered per split.
# random_state pins the estimator's internal randomness so the reported best
# parameters and score are repeatable between runs.
tree = DecisionTreeClassifier(random_state=42)
tree_params = {'max_depth': range(1, 7), 'max_features': range(5, 10)}
tree_grid = GridSearchCV(tree, tree_params, cv=5, n_jobs=-1, verbose=True)
tree_grid.fit(train_X, train_y)
tree_grid.best_params_, tree_grid.best_score_

## kNN cross validation

In [None]:
# Default kNN estimator; the grid search below runs on the pipeline, not on this object.
knn = KNeighborsClassifier()

# Standardise the features and classify with kNN in one pipeline, so the scaler
# is fitted together with the classifier inside every cross-validation fold.
knn_pipe = Pipeline(
    steps=[
        ('scaler', StandardScaler()),
        ('knn', KNeighborsClassifier(n_jobs=-1)),
    ]
)

# Search the neighbourhood size from 1 to 39 with 5-fold cross-validation.
knn_params = {'knn__n_neighbors': range(1, 40)}
knn_grid = GridSearchCV(
    estimator=knn_pipe,
    param_grid=knn_params,
    cv=5,
    n_jobs=-1,
    verbose=True,
)
knn_grid.fit(train_X, train_y)
knn_grid.best_params_, knn_grid.best_score_

## Random Forest cross validation

In [None]:
# Tune a random forest with a 5-fold cross-validated grid search over depth and
# the number of features considered per split.
# random_state pins the forest's bootstrap and feature sampling so the reported
# best parameters and score are repeatable between runs.
forest = RandomForestClassifier(random_state=42)
forest_params = {'max_depth': range(1, 10), 'max_features': range(1, 10)}
forest_grid = GridSearchCV(forest, forest_params, cv=5, n_jobs=-1, verbose=False)
forest_grid.fit(train_X, train_y)
forest_grid.best_params_, forest_grid.best_score_

# Submission

In [None]:
# Random Forest with {'max_depth': 9, 'max_features': 4} has the best accuracy so far.
# NOTE(review): these values are pinned by hand from an earlier search; re-check them
# against forest_grid.best_params_ after re-running the grid search.
# random_state makes the fitted model, and therefore the submission, repeatable.
forest = RandomForestClassifier(max_depth=9, max_features=4, random_state=42)
forest.fit(train_X, train_y)
test_pred = forest.predict(test_X)

In [None]:
# Load the sample submission file, set its 'Transported' column to the model's
# predictions and write the result to the working directory without the index column.
submission_path = 'submission.csv'
sub = pd.read_csv('/kaggle/input/spaceship-titanic/sample_submission.csv')
sub['Transported'] = test_pred
sub.to_csv(submission_path, index=False)