In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

from collections import Counter

from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier, ExtraTreesClassifier, VotingClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV, cross_val_score, StratifiedKFold, learning_curve, train_test_split, KFold
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score

In [None]:
# Load the Spaceship Titanic training split.
dataset = pd.read_csv("../input/spaceship-titanic/train.csv")

# Number of missing entries per column (displayed as the cell output).
dataset.isna().sum()

In [None]:

# Pairwise correlation of the numeric and boolean columns only.
# The frame still contains raw string columns (ids, names, categories); recent
# pandas versions no longer drop those silently in `corr()` and raise instead,
# so the numeric subset is selected explicitly.
dataset.select_dtypes(include=["number", "bool"]).corr()

In [None]:
from sklearn.preprocessing import OrdinalEncoder

# Boolean mask over the columns: True where the column is stored as `object`.
s = dataset.dtypes.eq('object')
object_cols = s.index[s].tolist()

# Identifier-like columns are not encoded; `remove` raises if one is absent.
for _identifier_col in ("PassengerId", "Name"):
    object_cols.remove(_identifier_col)

In [None]:
# Encode on a copy so the raw `dataset` frame keeps its original values.
dataset_copy = dataset.copy()

# Replace every object column with its ordinal codes.
ordinal_encoder = OrdinalEncoder()
encoded_values = ordinal_encoder.fit_transform(dataset_copy[object_cols])
dataset_copy[object_cols] = encoded_values

print(dataset_copy.head())

In [None]:
    # Recode the target in one vectorised step: False -> 0, anything else -> 1.
    # Writing ints into the boolean column one row at a time is slow and forces
    # pandas to change the column dtype mid-loop; assigning a whole integer
    # column avoids both problems. `.ne(False)` keeps the recode idempotent:
    # re-running the cell on an already-recoded 0/1 column leaves it unchanged.
    dataset_copy['Transported'] = dataset_copy['Transported'].ne(False).astype(int)
    

In [None]:
# Mean of the Transported flag for each home planet (raw, un-encoded labels).
g = sns.barplot(data=dataset, x="HomePlanet", y="Transported")

In [None]:
# Mean of the Transported flag for each CryoSleep value.
g = sns.barplot(data=dataset, x="CryoSleep", y="Transported")

In [None]:
# Distribution of the ordinal-encoded Cabin codes, one panel per Transported value.
# NOTE: `sns.distplot` is deprecated in recent seaborn releases.
g = sns.FacetGrid(dataset_copy, col='Transported').map(sns.distplot, "Cabin")

In [None]:
# Mean of the Transported flag for each destination.
g = sns.barplot(data=dataset, x="Destination", y="Transported")

In [None]:
# Age distribution, one panel per Transported value.
# NOTE: `sns.distplot` is deprecated in recent seaborn releases.
g = sns.FacetGrid(dataset, col='Transported').map(sns.distplot, "Age")

In [None]:
# Mean of the Transported flag for VIP and non-VIP passengers.
g = sns.barplot(data=dataset, x="VIP", y="Transported")

In [None]:
# VRDeck spending distribution, one panel per Transported value.
# NOTE: `sns.distplot` is deprecated in recent seaborn releases.
g = sns.FacetGrid(dataset, col='Transported').map(sns.distplot, "VRDeck")

In [None]:
# Impute missing values column by column.
#
# `DataFrame.fillna` must be given a {column: value} mapping for this:
#   * passing the Series returned by `.mode()` aligns its index (0, 1, ...)
#     with the column names, so nothing is filled at all;
#   * passing a bare scalar fills EVERY remaining NaN in EVERY column with that
#     one value (e.g. the RoomService mean ending up in HomePlanet or Age).
mode_filled = ['HomePlanet', 'Destination', 'CryoSleep', 'VIP']   # encoded categoricals
mean_filled = ['RoomService', 'Spa', 'VRDeck', 'Age']             # continuous columns

# `.mode()` ignores NaN and may return several values; take the first one.
fill_values = {col: dataset_copy[col].mode().iloc[0] for col in mode_filled}
fill_values.update({col: dataset_copy[col].mean() for col in mean_filled})

# Columns not listed above keep their missing values.
dataset_copy = dataset_copy.fillna(value=fill_values)

In [None]:
# Predictor columns fed to the models, and the target they should predict.
features = ['HomePlanet', 'CryoSleep', 'VIP', 'RoomService', 'Spa','VRDeck','Destination']
X = dataset_copy[features]
y = dataset_copy['Transported']

# Only the last expression of a cell is rendered, so the target preview is shown.
X.head()
y.head(20)

In [None]:
# Evaluation settings: hold-out share, random seed, CV fold count and metric.
validation_size = 0.20
seed = 7
num_folds = 10
scoring = 'accuracy'
X_train, X_validation, Y_train, Y_validation = train_test_split(
    X, y, test_size=validation_size, random_state=seed
)

# Params for Random Forest
num_trees = 100
max_features = 3

# Spot-check six algorithms (LR, LDA, KNN, CART, NB, RF).
# The tree-based models are seeded so repeated runs give the same scores.
models = [
    ('LR', LogisticRegression()),
    ('LDA', LinearDiscriminantAnalysis()),
    ('KNN', KNeighborsClassifier()),
    ('CART', DecisionTreeClassifier(random_state=seed)),
    ('NB', GaussianNB()),
    ('RF', RandomForestClassifier(n_estimators=num_trees,
                                  max_features=max_features,
                                  random_state=seed)),
]

# One splitter shared by all models so every model is scored on the same folds;
# the fold count and metric come from the settings declared above.
kfold = KFold(n_splits=num_folds)

# Evaluate each model in turn and report mean (std) of the CV scores.
results = []
names = []
for name, model in models:
    cv_results = cross_val_score(model, X_train, Y_train, cv=kfold, scoring=scoring)
    results.append(cv_results)
    names.append(name)
    msg = "%s: %f (%f)" % (name, cv_results.mean(), cv_results.std())
    print(msg)

In [None]:
# Final model, trained on the full training frame.
# - `random_state=seed` makes the fitted forest reproducible between runs.
# - The training features are taken from `dataset_copy[features]` rather than
#   the global `X`, because the prediction cell rebinds `X` to the test
#   features; re-running this cell afterwards would otherwise fit on test rows.
random_forest = RandomForestClassifier(n_estimators=250, max_features=5, random_state=seed)
random_forest.fit(dataset_copy[features], y)


In [None]:
test_data_path = '../input/spaceship-titanic/test.csv'

# NOTE: despite the historical `trainset*` names, these frames hold the TEST split.
trainset = pd.read_csv(test_data_path)
trainset_copy = trainset.copy()

# Encode the categorical model features with an encoder fitted on the TRAINING
# frame, so each category receives the same code the model was trained on.
# Re-fitting on the test data only gives matching codes when both splits
# happen to contain exactly the same categories. Categories never seen in
# training become NaN and are imputed below.
categorical_features = [col for col in features if dataset[col].dtype == 'object']
test_encoder = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=np.nan)
test_encoder.fit(dataset[categorical_features])
trainset_copy[categorical_features] = test_encoder.transform(trainset_copy[categorical_features])

# Impute column by column with a {column: value} mapping. Passing `.mode()`
# (a Series) to DataFrame.fillna fills nothing, and passing a scalar fills every
# NaN in every column with that single value.
mode_filled = ['HomePlanet', 'Destination', 'CryoSleep', 'VIP']
mean_filled = ['RoomService', 'Spa', 'VRDeck', 'Age']
fill_values = {col: trainset_copy[col].mode().iloc[0] for col in mode_filled}
fill_values.update({col: trainset_copy[col].mean() for col in mean_filled})
trainset_copy = trainset_copy.fillna(value=fill_values)

# Use a dedicated name so the training `X` is not overwritten.
X_test = trainset_copy[features]

# Map the predicted labels onto 0/1 (anything above 0.5 counts as transported).
prediction = (random_forest.predict(X_test).astype(float) > 0.5).astype(int)

# Submission file: passenger id plus the boolean prediction.
new_data = pd.DataFrame({
    'PassengerId': trainset_copy['PassengerId'],
    'Transported': prediction.astype(bool),
})
new_data.to_csv('file1.csv', index = False)

In [None]:
# Plain-text preview of the submission frame (PassengerId and Transported columns).
print(new_data)