In [None]:
import pandas as pd
import numpy as np
import matplotlib.pylab as plt
import seaborn as sns
plt.style.use('ggplot')

In [None]:
# Read the three competition files in one pass: train, test, sample submission.
df_train, df_test, ss = map(pd.read_csv, (
    "../input/spaceship-titanic/train.csv",
    "../input/spaceship-titanic/test.csv",
    "../input/spaceship-titanic/sample_submission.csv",
))

# Both frames in one list, for steps that should touch train and test alike.
combine = [df_train, df_test]

In [None]:
# (rows, columns) of the training frame and of the test frame
df_train.shape,df_test.shape

In [None]:
# Column labels of the training frame
df_train.columns

In [None]:
# First rows of the training frame, as a quick sanity check of the load
df_train.head()

In [None]:
# Passengers whose Cabin already appeared on an earlier row, i.e. the 2nd, 3rd, ...
# occupant of a cabin. Rows with a missing Cabin are dropped first: `duplicated`
# treats NaN values as equal to each other, so passengers with an unknown cabin
# would otherwise all be counted as sharing one.
same_cabin_df = (
    df_train
    .dropna(subset=['Cabin'])
    .loc[lambda known: known.duplicated(subset=['Cabin'])]
)
#Plenty of people shared the same cabin
same_cabin_df.shape

In [None]:
# Print dtype and non-null count of every column, for train and then for test
df_train.info()
df_test.info()
#Missing values are present in all features

In [None]:
# Summary statistics; with no arguments describe() covers the numeric columns
df_train.describe()

In [None]:
# Summary of the object-dtype columns (count, unique, top, freq)
df_train.describe(include=['O'])

In [None]:
#Looking for people in duplicated Cabins
# Spot check: list every training row assigned to cabin 'G/734/S'
df_train.query("Cabin == 'G/734/S'")

In [None]:
#df_train['Expenses'] = df_train['RoomService']+df_train['FoodCourt']+df_train['ShoppingMall'] \
#+df_train['Spa'] + df_train['VRDeck']
#df_test['Expenses'] = df_test['RoomService']+df_test['FoodCourt']+df_test['ShoppingMall'] \
#+df_test['Spa'] + df_test['VRDeck']

In [None]:
#PassengerIds are unique across the dataset; the first half of each ID identifies the group (may be helpful)
#Earth is the most common HomePlanet
#Several people shared a cabin (almost 1/4th)
#Most people were not VIPs

**Correcting**: 
Name might be dropped since it does not relate to transportation.

**Creating**:
Age bands, and a single feature that combines all expenses.  

**Classifying**:
Perhaps younger passengers and VIPs were more likely to be transported. 


In [None]:
#Pivoting features: CryoSleep, VIP, Destination, HomePlanet
# Mean of Transported for each CryoSleep value, highest rate first
(
    df_train
    .groupby('CryoSleep', as_index=False)[['Transported']]
    .mean()
    .sort_values(by='Transported', ascending=False)
)
#CryoSleep people got transported more

In [None]:
# Mean of Transported for each VIP value, highest rate first
(
    df_train
    .groupby('VIP', as_index=False)[['Transported']]
    .mean()
    .sort_values(by='Transported', ascending=False)
)

In [None]:
# Mean of Transported for each Destination, highest rate first
(
    df_train
    .groupby('Destination', as_index=False)[['Transported']]
    .mean()
    .sort_values(by='Transported', ascending=False)
)

In [None]:
# Mean of Transported for each HomePlanet, highest rate first
(
    df_train
    .groupby('HomePlanet', as_index=False)[['Transported']]
    .mean()
    .sort_values(by='Transported', ascending=False)
)

In [None]:
#Correlation b/w numerical features with target feature
# One panel per Transported value, each showing a 20-bin histogram of Age
g = sns.FacetGrid(df_train,col = "Transported")
g.map(plt.hist, 'Age', bins = 20)

In [None]:
#g = sns.FacetGrid(df_train, col = 'Transported')
#g.map(plt.hist, 'Expenses', bins = 20)

In [None]:
#Correlating ordinal and numerical features
# One panel per VIP value; the Age histograms are overlaid (alpha 0.5) and
# coloured by Transported
grid = sns.FacetGrid(df_train, col = 'VIP', hue = 'Transported')
grid.map(plt.hist, "Age", alpha = 0.5, bins = 20)
grid.add_legend();
#Very few VIPs to differentiate on this basis

In [None]:
# One panel per CryoSleep value; the Age histograms are overlaid (alpha 0.5) and
# coloured by Transported
grid1 = sns.FacetGrid(df_train, hue = 'Transported', col = 'CryoSleep')
grid1.map(plt.hist, 'Age', alpha = 0.5, bins = 20)
grid1.add_legend();
#Most of the people who took CryoSleep were successfully transported

In [None]:
# Number of training rows per HomePlanet (value_counts skips missing values by default)
df_train['HomePlanet'].value_counts()

In [None]:
#Correlating categorical and numerical features
# One panel per (HomePlanet, Transported) pair; each bar is the mean Age for a VIP value.
# `height` is the current name of the facet-size keyword: seaborn renamed `size` to
# `height` in 0.9 and later dropped the old spelling, so `size=` fails on current releases.
grid2 = sns.FacetGrid(df_train, row = 'HomePlanet', col = 'Transported', height = 2.2, aspect = 1.6)
grid2.map(sns.barplot,'VIP','Age', alpha = .5, ci = None)
grid2.add_legend();
#People who have been transported have not been particularly expansive

In [None]:
# Percentage of missing values per column, train vs. test.
# isna().mean() is the fraction of missing entries in each column; scaling by 100
# makes the plotted numbers agree with the "% of missing values" title.
ncounts = pd.DataFrame([df_train.isna().mean(),df_test.isna().mean()]).T * 100
ncounts = ncounts.rename(columns = {0: "train_missing",1: "test_missing"})

# Only plot the columns that have at least one gap in the training data
ncounts.query("train_missing > 0").plot(kind = "barh",
                                       figsize = (8,5),
                                       title = "% of missing values")
plt.show()
#Expenses has lots of missing values, maybe drop expenses?

In [None]:
# Absolute number of missing entries in each training column
df_train.isna().sum()

In [None]:
#Probably should have done preprocessing before doing the EDA 
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

In [None]:
# Numeric columns whose gaps are filled by SimpleImputer (default strategy: column mean).
impute_cols = ["Age","FoodCourt","ShoppingMall","VRDeck","Spa","RoomService"]
Simp = SimpleImputer()
# Learn the fill values from train only, then reuse them unchanged on test.
df_train[impute_cols] = Simp.fit_transform(df_train[impute_cols])
df_test[impute_cols] = Simp.transform(df_test[impute_cols])
# Missing HomePlanet becomes the placeholder category "O". The filled column is
# assigned back explicitly: fillna(inplace=True) on a selected column is chained
# in-place mutation, which warns on pandas 2.x and leaves the frame untouched
# once Copy-on-Write is active.
df_train["HomePlanet"] = df_train["HomePlanet"].fillna("O")
df_test["HomePlanet"] = df_test["HomePlanet"].fillna("O")

In [None]:
# Columns passed to label_enc below to be replaced by integer codes
label_cols = ["HomePlanet","Destination","VIP","CryoSleep","Cabin"]
def label_enc(train,test,columns):
    """Integer-encode categorical columns using one mapping shared by both frames.

    Every value is converted with ``str`` (so a missing value becomes the
    category ``'nan'``), and the sorted union of the train and test values is
    numbered 0..n-1. Building the mapping from both frames guarantees that a
    category gets the same code in ``train`` and in ``test``; encoding each frame
    on its own numbers them independently, so the codes a model is fitted on
    would not line up with the codes it is asked to predict on.

    Args:
        train: Training DataFrame; the listed columns are overwritten in place.
        test: Test DataFrame; the listed columns are overwritten in place.
        columns: Iterable of column names present in both frames.

    Returns:
        The same ``(train, test)`` objects, with the listed columns encoded.
    """
    for col in columns:
        train_labels = train[col].map(str)
        test_labels = test[col].map(str)
        # Sorted vocabulary over BOTH frames -> one stable code per category,
        # including categories that occur only in the test data.
        vocabulary = sorted(set(train_labels) | set(test_labels))
        code_of = {label: code for code, label in enumerate(vocabulary)}
        train[col] = train_labels.map(code_of)
        test[col] = test_labels.map(code_of)

    return train,test
# Encode the columns listed in label_cols and rebind both names to the returned frames
df_train,df_test = label_enc(df_train,df_test,label_cols)

In [None]:
# Feature matrix: every training column except the target and the Name column
X_train = df_train.drop(columns = ["Transported", "Name"])
# Target vector
y_train = df_train["Transported"]
# Test features: only Name is dropped here
X_test = df_test.drop(columns = ["Name"])

In [None]:
from sklearn import preprocessing
# Encode PassengerId with ONE encoder fitted on the IDs of both frames, so that a
# given code always stands for the same ID. Re-fitting the encoder on the test IDs
# restarted the numbering at 0, making train and test codes unrelated.
# `le` still knows every test ID, so the test codes can be inverted later on.
le = preprocessing.LabelEncoder()
le.fit(pd.concat([X_train["PassengerId"], X_test["PassengerId"]]))
X_train["PassengerId"] = le.transform(X_train["PassengerId"])
X_test["PassengerId"] = le.transform(X_test["PassengerId"])

In [None]:
# Show the dtype of every feature column before the frame is handed to the model
X_train.dtypes

In [None]:
import xgboost as xgb
# Gradient-boosted tree classifier with the library's default hyperparameters
model = xgb.XGBClassifier()
model.fit(X_train,y_train)

# Cast to bool: depending on the installed xgboost version, predict() may return
# 0/1 class indices instead of boolean labels. The cast is a no-op when the
# predictions are already boolean.
# NOTE(review): assumes Transported must be True/False in the submission file;
# confirm against sample_submission.csv.
preds = model.predict(X_test).astype(bool)

In [None]:
# Display the predicted labels for the test rows
preds

In [None]:
# Turn the integer codes back into the original PassengerId values for the submission.
# This relies on `le` having been fitted on data that includes these test IDs.
X_test["PassengerId"] = le.inverse_transform(X_test["PassengerId"])

In [None]:
# Two-column submission frame: each PassengerId next to its predicted label
submission = X_test[["PassengerId"]].assign(Transported = preds)

In [None]:
# Write the submission file; index=False keeps the row index out of the CSV
submission.to_csv("submission.csv", index = False)