In [None]:
#load packages
import sys #access to system parameters https://docs.python.org/3/library/sys.html
print("Python version: {}". format(sys.version))

#import pandas as pd #collection of functions for data processing and analysis modeled after R dataframes with SQL like features
import pandas as pd
print("pandas version: {}". format(pd.__version__))

import matplotlib #collection of functions for scientific and publication-ready visualization
print("matplotlib version: {}". format(matplotlib.__version__))
import plotly.express as px

#import numpy as np #foundational package for scientific computing
import numpy as np
print("NumPy version: {}". format(np.__version__))

import scipy as sp #collection of functions for scientific computing and advance mathematics
print("SciPy version: {}". format(sp.__version__)) 

import IPython
from IPython import display #pretty printing of dataframes in Jupyter notebook
print("IPython version: {}". format(IPython.__version__)) 

#import sklearn #collection of machine learning algorithms
import sklearn
print("scikit-learn version: {}". format(sklearn.__version__))

#other important libraries
import seaborn as sns
import plotly.express as px
import matplotlib.pyplot as plt
import plotly.graph_objects as go
from plotly.subplots import make_subplots



from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import StratifiedKFold, train_test_split ,GridSearchCV


from lightgbm import LGBMClassifier



#misc libraries
import random
import time


#ignore warnings
import warnings
warnings.filterwarnings('ignore')
print('-'*25)



# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

from subprocess import check_output  # import kept in case other cells still use this name
import os

# List the input directory with the standard library instead of shelling out to `ls`,
# which does not exist on every platform. Dot-files are skipped and names are sorted
# to mirror the default `ls` listing.
print("\n".join(sorted(name for name in os.listdir("../input") if not name.startswith("."))))

# Any results you write to the current directory are saved as output.

# Load Data Modelling Libraries

In [None]:
#Common Model Algorithms
from sklearn import svm, tree, linear_model, neighbors, naive_bayes, ensemble, discriminant_analysis, gaussian_process
from xgboost import XGBClassifier

#Common Model Helpers
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn import feature_selection
from sklearn import model_selection
from sklearn import metrics

#Visualization
import matplotlib as mpl
import matplotlib.pyplot as plt
import matplotlib.pylab as pylab
import seaborn as sns
from pandas.plotting import scatter_matrix

#Configure Visualization Defaults
#%matplotlib inline = show plots in Jupyter Notebook browser
%matplotlib inline
# Apply matplotlib's 'ggplot' style sheet, then seaborn's 'white' style.
plt.style.use('ggplot')
sns.set_style('white')
# Default figure size as (width, height); plt.rcParams is the same settings object
# that matplotlib.pylab exposes.
plt.rcParams['figure.figsize'] = (12, 8)

# Meet and Greet Data
This is the meet and greet step. Get to know your data by first name and learn a little bit about it. What does it look like (datatype and values), what makes it tick (independent/feature variable(s)), and what are its goals in life (dependent/target variable(s))? Think of it like a first date: get acquainted before you dive into the detailed analysis.

To begin this step, we first import our data. Next we use the info() and sample() functions to get a quick and dirty overview of variable datatypes (i.e. qualitative vs quantitative). Click here for the [Source data Directory](https://www.kaggle.com/c/spaceship-titanic/data).

 1. Transported - Whether the passenger was transported to another dimension. This is the target, the column you are trying to predict. All other variables are potential predictor or independent variables. **It's important to note that more predictor variables do not make a better model; the right variables do.**
 
 2. PassengerId - A unique Id for each passenger. Each Id takes the form gggg_pp where gggg indicates a group the passenger is travelling with and pp is their number within the group. People in a group are often family members, but not always.
 
 3. HomePlanet - The planet the passenger departed from, typically their planet of permanent residence.
 
 4. CryoSleep - Indicates whether the passenger elected to be put into suspended animation for the duration of the voyage. Passengers in cryosleep are confined to their cabins.
 
 5. Cabin - The cabin number where the passenger is staying. Takes the form deck/num/side, where side can be either P for Port or S for Starboard.
 
 6. Destination - The planet the passenger will be debarking to.
 
 7. Age - The age of the passenger.
 
 8. VIP - Whether the passenger has paid for special VIP service during the voyage.
 
 9. RoomService, FoodCourt, ShoppingMall, Spa, VRDeck - Amount the passenger has billed at each of the Spaceship Titanic's many luxury amenities.
 
 10. Name - The first and last names of the passenger.

In [None]:
# Directory holding the competition CSV files; defined once so the three reads stay in sync.
INPUT_DIR = "../input/spaceship-titanic"

# Training data as read from disk; this frame is kept as the raw reference copy.
data_raw = pd.read_csv(INPUT_DIR + "/train.csv")
# Competition test set that the final predictions are made for.
data_val = pd.read_csv(INPUT_DIR + "/test.csv")
# Work on a deep copy so that cleaning never alters data_raw.
data1 = data_raw.copy(deep = True)
# The list stores references to both frames (not copies), so in-place edits made
# while looping over it clean the train and test data in one pass.
data_cleaner = [data1, data_val]
# Submission template whose 'Transported' column is overwritten with predictions later.
samp_subm = pd.read_csv(INPUT_DIR + '/sample_submission.csv')

In [None]:
# DataFrame.info() writes its summary straight to stdout and returns None, so it is
# called bare; wrapping it in print() only appended a stray "None" line.
data_raw.info()
# Show ten random rows as the cell's rich output.
data_raw.sample(10)

# Data Cleaning

In [None]:
# Count missing values per column for both frames up front, then report them.
train_null_counts = data1.isna().sum()
test_null_counts = data_val.isna().sum()
separator = "-"*10

print('Train columns with null values:\n', train_null_counts)
print(separator)

print('Test/Validation columns with null values:\n', test_null_counts)
print(separator)

# Summary statistics for every column (numeric and non-numeric) of the raw training data.
data_raw.describe(include = 'all')

In [None]:
# Identifier / free-text columns that are not used as model features.
drop_column = ['PassengerId','Cabin', 'Name']
# Numeric columns whose gaps are filled with the column median.
median_fill_columns = ['Age', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']
# Categorical / boolean columns whose gaps are filled with the most frequent value.
mode_fill_columns = ['HomePlanet', 'Destination', 'VIP', 'CryoSleep']
# Amenity bills that are summed into the engineered 'Expenses' feature.
expense_columns = ['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']

###COMPLETING: complete or delete missing values in train and test/validation dataset
# Each frame is mutated in place (data_cleaner holds references to data1 and data_val)
# and is imputed with its own statistics.
for dataset in data_cleaner:
    # Assign the filled column back instead of calling fillna(inplace=True) on the
    # selected column: the chained in-place form is deprecated and leaves the frame
    # unchanged when pandas Copy-on-Write is active.
    for col in median_fill_columns:
        dataset[col] = dataset[col].fillna(dataset[col].median())

    for col in mode_fill_columns:
        dataset[col] = dataset[col].fillna(dataset[col].mode()[0])

    #new column for all expenses (no NaN is left in these columns at this point)
    dataset['Expenses'] = dataset[expense_columns].sum(axis=1)

    #dropping name, cabin, id; errors='ignore' keeps the cell re-runnable after
    #the columns have already been removed
    dataset.drop(columns=drop_column, inplace=True, errors='ignore')

# Confirm that no missing values remain in either frame.
print(data1.isnull().sum())
print("-"*10)
print(data_val.isnull().sum())

In [None]:
# Number of distinct values in each column of the cleaned training frame.
data1.nunique()

In [None]:
# Called on the whole frame, value_counts() counts each distinct row
# (i.e. each unique combination of all column values), not a single column.
data1.value_counts()

In [None]:
# Frequency of each destination in the cleaned training frame.
data1['Destination'].value_counts()

In [None]:
# Column dtypes after cleaning; shows which columns are still non-numeric.
data1.dtypes

# Correlation table

In [None]:
# Correlation is only defined for numeric data. Restrict the frame to numeric and
# boolean columns first: recent pandas versions raise on string columns instead of
# silently skipping them when corr() is called on the whole frame.
numeric_features = data1.select_dtypes(include=['number', 'bool'])
fig = px.imshow(numeric_features.corr(), text_auto=True, aspect="auto", color_continuous_scale="viridis")
fig.show()

# ENCODING

In [None]:
# Categorical feature columns that are converted to integer codes below.
label_cols = ["HomePlanet", "CryoSleep", "Destination" ,"VIP"]
def label_encoder(train,test,columns):
    """Integer-encode categorical columns consistently across train and test.

    Each listed column is cast to ``str`` and replaced by integer codes. The
    codes are assigned from the sorted union of the values seen in *both*
    frames, so a given category always maps to the same integer in ``train``
    and in ``test``, even when one frame lacks a category or contains an extra
    one. (Encoding each frame independently can silently assign different
    codes to the same category.)

    Parameters
    ----------
    train, test : pandas.DataFrame
        Frames to encode. Both are modified in place.
    columns : iterable of str
        Names of the columns to encode; each must exist in both frames.

    Returns
    -------
    tuple
        ``(train, test)``: the same two objects that were passed in.
    """
    for col in columns:
        train[col] = train[col].astype(str)
        test[col] = test[col].astype(str)
        # One shared, sorted category list defines the code of every value.
        categories = sorted(set(train[col]) | set(test[col]))
        train[col] = pd.Categorical(train[col], categories=categories).codes.astype("int64")
        test[col] = pd.Categorical(test[col], categories=categories).codes.astype("int64")
    return train, test

# label_encoder modifies the frames it is given and returns those same objects,
# so `train` / `test` are the (now encoded) data1 / data_val held in data_cleaner.
train ,test = label_encoder(data_cleaner[0],data_cleaner[1],label_cols)

# Data Split

In [None]:
# Separate the target from the predictors, then hold out 33% of the rows
# (fixed seed for a repeatable split) to validate the fitted model.
y = train["Transported"]
X = train.drop(columns="Transported")
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.33, random_state=12
)

In [None]:
# Candidate hyper-parameters for the 5-fold cross-validated search.
# NOTE(review): the smallest learning rate was written as `0.`, which makes every
# boosting step contribute nothing; 0.01 is assumed to be the intended value.
param_grid = {'n_estimators': [100, 200, 400, 600],
              'learning_rate': [0.1, 0.05, 0.01]}
grid = GridSearchCV(XGBClassifier(), param_grid=param_grid, cv=5, scoring='accuracy')
# Search on the training split only: tuning on all of X, y would let the held-out
# validation rows influence the chosen parameters and inflate the validation score.
grid.fit(X_train, y_train)
best_params = grid.best_params_
print('Best score of cross validation: {:.2f}'.format(grid.best_score_))
print('Best parameters:', best_params)

In [None]:
# Build the final classifier directly from the tuned hyper-parameters
# and fit it on the training split.
model = XGBClassifier(**best_params)
model.fit(X_train, y_train)

In [None]:
# Score the fitted model on the held-out validation split.
y_val_pred = model.predict(X_val)
val_accuracy = accuracy_score(y_true=y_val, y_pred=y_val_pred)
print('Validation Score:', val_accuracy)

In [None]:
# Preview the encoded test frame before predicting on it.
test.head()

In [None]:
# Predict the target for every row of the test frame with the fitted model.
y_test_pred = model.predict(test)

# Analyzing Training

In [None]:
# Horizontal bar chart of the fitted model's feature importances, scaled to percent.
importance = model.feature_importances_
x = X_train.columns.values

fig, ax = plt.subplots(figsize=(10, 8))
ax.barh(x, 100*importance)
ax.set_title('Feature Importance', loc='left')
ax.set_xlabel('Percentage')
ax.grid()  # no arguments: toggles grid-line visibility, as plt.grid() does
plt.show()

In [None]:
# Cast the predictions to bool before writing them into the template. Depending on
# the library version, predict() may return 0/1 integers rather than the original
# True/False labels; the cast makes the column boolean either way.
# NOTE(review): presumably the competition expects True/False here, as in the
# training target; confirm against sample_submission.csv.
samp_subm['Transported'] = np.asarray(y_test_pred).astype(bool)
samp_subm['Transported'].value_counts()

In [None]:
# Write the filled-in template to the working directory, without the index column.
samp_subm.to_csv('submission.csv', index=False)