In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_selection import SelectKBest,chi2
from sklearn.impute import KNNImputer
import seaborn as sns
import matplotlib.pyplot as plt

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file in it.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for fname in files:
        print(os.path.join(root, fname))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Load the labelled training set and the unlabelled competition test set.
train_data, test_data = (
    pd.read_csv(csv_path)
    for csv_path in (
        "/kaggle/input/spaceship-titanic/train.csv",
        "/kaggle/input/spaceship-titanic/test.csv",
    )
)

In [None]:
def find_missing_percent(data):
    """
    Summarise the missing values of every column of a DataFrame.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to inspect.

    Returns
    -------
    pandas.DataFrame
        One row per column of ``data`` (in column order) with the columns
        ``ColumnName``, ``TotalMissingVals`` (number of null entries) and
        ``PercentMissing`` (share of rows that are null, in percent, rounded
        to two decimals; 0.0 when ``data`` has no rows).
    """
    total_rows = data.shape[0]
    # Count nulls for all columns in one pass; iterate positionally so frames
    # with duplicate column labels are handled too.
    null_counts = data.isnull().sum().to_numpy()

    # Collect plain records and build the frame once: DataFrame.append was
    # removed in pandas 2.0, and growing a frame row by row is quadratic.
    records = []
    for col, sum_miss_val in zip(data.columns, null_counts):
        sum_miss_val = int(sum_miss_val)
        if total_rows:
            percent_miss_val = round((sum_miss_val / total_rows) * 100, 2)
        else:
            percent_miss_val = 0.0
        records.append({
            'ColumnName': col,
            'TotalMissingVals': sum_miss_val,
            'PercentMissing': percent_miss_val,
        })
    return pd.DataFrame(records, columns=['ColumnName', 'TotalMissingVals', 'PercentMissing'])

miss_df = find_missing_percent(train_data)
# Columns with missing values
print(f"Number of columns with missing values: {len(miss_df[miss_df['PercentMissing'].gt(0.0)])}")
display(miss_df[miss_df['PercentMissing'].gt(0.0)])


In [None]:
def nullval(data):
    """
    Fill the missing values of the raw passenger frame with fixed placeholders.

    Spending columns are filled with 0, the categorical/boolean columns and
    ``Name`` with the string ``"Unknown"``, and ``Cabin`` with ``'U//U'`` so it
    still has the three ``/``-separated parts the cabin parser expects.
    ``Age`` is deliberately left untouched (its NaNs stay NaN).

    Parameters
    ----------
    data : pandas.DataFrame
        Raw frame containing all of the columns listed below.

    Returns
    -------
    pandas.DataFrame
        A filled copy; the frame passed in is not modified.

    Raises
    ------
    KeyError
        If one of the filled columns is absent from ``data``.
    """
    fill_values = {
        "RoomService": 0,
        "FoodCourt": 0,
        "ShoppingMall": 0,
        "Spa": 0,
        "VRDeck": 0,
        "VIP": "Unknown",
        "CryoSleep": "Unknown",
        "Name": "Unknown",
        "HomePlanet": "Unknown",
        "Destination": "Unknown",
        "Cabin": 'U//U',
    }
    # Work on a copy so the caller's raw frame survives and the cell that
    # calls this function can be re-run safely.
    data = data.copy()
    for column, placeholder in fill_values.items():
        data[column] = data[column].fillna(placeholder)
    # The former `fillna(np.NaN)` on Age was a no-op (and `np.NaN` no longer
    # exists in NumPy 2.0), so Age is simply left as it is.
    return data

In [None]:
# Replace missing values in the training frame with the fixed placeholders from nullval.
train_data = nullval(train_data)

In [None]:
miss_df = find_missing_percent(train_data)
# Columns with missing values
print(f"Number of columns with missing values: {len(miss_df[miss_df['PercentMissing'].gt(0.0)])}")
display(miss_df[miss_df['PercentMissing'].gt(0.0)])

In [None]:
# Column dtypes and non-null counts after the placeholder fill.
train_data.info()

In [None]:
# One independent LabelEncoder per categorical column handled in convert().
(
    name_encoder,
    home_encoder,
    dest_encoder,
    cryo_encoder,
    vip_encoder,
    deck_encoder,
    side_encoder,
) = (LabelEncoder() for _ in range(7))

def _split_cabin(cabin):
    """Split a 'deck/num/side' cabin string into ``(deck, num, side)``.

    A value that is not a three-part string yields ``('U', nan, 'U')`` -- the
    same result as parsing the ``'U//U'`` placeholder. A ``num`` part that is
    not an integer becomes NaN while deck and side are kept.
    """
    parts = cabin.split('/') if isinstance(cabin, str) else []
    if len(parts) != 3:
        return 'U', np.nan, 'U'
    deck, num, side = parts
    try:
        return deck, int(num), side
    except ValueError:
        return deck, np.nan, side


def convert(data):
    """
    Turn the placeholder-filled passenger frame into an all-numeric feature frame.

    * ``PassengerId`` ("group_number") is split into the integer columns
      ``passenger_group`` and ``passenger_number`` and then dropped.
    * ``Cabin`` ("deck/num/side") is split into ``cabin_deck``, ``cabin_num``
      and ``cabin_side`` and then dropped; an unparsable number becomes NaN.
    * ``Name``, ``HomePlanet``, ``CryoSleep``, ``VIP``, ``Destination``,
      ``cabin_deck`` and ``cabin_side`` are label-encoded with the
      module-level encoders.
    * The five spending columns are summed into ``amount`` and dropped.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with no missing values in the columns that get label-encoded.

    Returns
    -------
    pandas.DataFrame
        A converted copy; the frame passed in is not modified.

    Notes
    -----
    NOTE(review): every call re-fits the shared encoders on the frame it is
    given, so integer codes produced by two different calls (e.g. train vs.
    test) are not guaranteed to mean the same category.
    """
    data = data.copy()

    # passenger id
    id_parts = []
    for passenger_id in data["PassengerId"]:
        group, number = map(int, passenger_id.split("_"))
        id_parts.append((group, number))
    data["passenger_group"] = [group for group, _ in id_parts]
    data["passenger_number"] = [number for _, number in id_parts]

    # passenger cabin -- every row always contributes to all three lists, so
    # a malformed cabin can no longer leave them with different lengths.
    cabin_parts = [_split_cabin(cabin) for cabin in data["Cabin"]]
    data["cabin_deck"] = [deck for deck, _, _ in cabin_parts]
    data["cabin_num"] = [num for _, num, _ in cabin_parts]
    data["cabin_side"] = [side for _, _, side in cabin_parts]

    data = data.drop(columns=["PassengerId", "Cabin"])

    # Booleans become strings so the column has a single type alongside the
    # "Unknown" placeholder before it is label-encoded.
    data["CryoSleep"] = data["CryoSleep"].replace({True: 'TRUE', False: 'FALSE'})
    data["VIP"] = data["VIP"].replace({True: 'TRUE', False: 'FALSE'})

    # encoders
    encoders = (
        ("Name", name_encoder),
        ("HomePlanet", home_encoder),
        ("CryoSleep", cryo_encoder),
        ("VIP", vip_encoder),
        ("Destination", dest_encoder),
        ("cabin_deck", deck_encoder),
        ("cabin_side", side_encoder),
    )
    for column, encoder in encoders:
        data[column] = encoder.fit_transform(data[column])

    # total spend across all on-board amenities
    data["amount"] = data["RoomService"]+data["FoodCourt"]+data["ShoppingMall"]+data["Spa"]+data["VRDeck"]
    data = data.drop(columns=["RoomService", "FoodCourt", "ShoppingMall", "Spa", "VRDeck"])

    return data

In [None]:
# Engineer and encode features. convert() removes PassengerId and Cabin from its
# result, so this cell cannot be re-run without reloading and re-filling train_data.
train_data = convert(train_data)

In [None]:
# Column dtypes and non-null counts after feature conversion.
train_data.info()

In [None]:
miss_df = find_missing_percent(train_data)
# Columns with missing values
print(f"Number of columns with missing values: {len(miss_df[miss_df['PercentMissing'].gt(0.0)])}")
display(miss_df[miss_df['PercentMissing'].gt(0.0)])

In [None]:
# Separate the target from the predictors and hold out 20% of the rows for validation.
X = train_data.drop(["Transported"],axis=1)
y = train_data["Transported"]
# A fixed seed keeps the split -- and every accuracy computed from it --
# identical across Restart & Run All.
x_train,x_test,y_train,y_test = train_test_split(X,y,test_size=0.2,random_state=42)

In [None]:
def imputenull(data, n_neighbors=1):
    """
    Fill the remaining NaNs of an all-numeric frame with a KNN imputer.

    A new ``KNNImputer`` is fitted on ``data`` itself on every call, and every
    column of the frame takes part in the imputation.

    Parameters
    ----------
    data : pandas.DataFrame
        All-numeric frame, possibly containing NaNs.
    n_neighbors : int, default 1
        Number of neighbours the imputer uses for each missing value.

    Returns
    -------
    pandas.DataFrame
        Imputed values with the same column labels and the same index as
        ``data`` (all columns come back as floats from the imputer).
    """
    imputer = KNNImputer(n_neighbors=n_neighbors)
    imputed = imputer.fit_transform(data)
    # Keep the original index: rebuilding with a fresh RangeIndex would make
    # the rows silently misalign with any Series still carrying the old index.
    return pd.DataFrame(imputed, columns=data.columns, index=data.index)

In [None]:
# Impute the remaining NaNs in both splits.
# NOTE(review): imputenull fits a fresh imputer on whatever frame it is given, so
# x_test is imputed from its own rows, not with an imputer fitted on x_train.
x_train = imputenull(x_train)
x_test = imputenull(x_test)

In [None]:
# X was not passed through imputenull, so it can still contain NaNs.
X.info()

In [None]:
# Column dtypes and non-null counts of the imputed training split.
x_train.info()

In [None]:
# Pairwise correlations of every column of train_data, drawn as an annotated heatmap.
c = train_data.corr()
plt.figure(figsize=(70, 30))
# Set the font size before the heatmap creates its axes so tick labels pick it up.
plt.rcParams['font.size'] = 35
sns.heatmap(data=c, annot=True, annot_kws={"size": 35})
plt.show()

In [None]:
# Keep the columns whose correlation with the target is negative.
# NOTE(review): columns with a positive correlation to "Transported" are dropped
# by this filter -- confirm that this is the intended selection rule.
features = c.index[c["Transported"] < 0].tolist()

In [None]:
def FeatureSelected(x,y):
    """
    Score every column of ``x`` against ``y`` with a chi-squared test.

    Fits ``SelectKBest(chi2, k=10)`` and returns a frame with the columns
    ``Specs`` (feature name) and ``Score``, sorted by score in descending
    order. As a side effect, pandas' global float display format is set to
    four decimals.
    """
    selector = SelectKBest(score_func=chi2, k=10).fit(x, y)
    pd.options.display.float_format = '{:.4f}'.format
    names = pd.DataFrame(x.columns)
    scores = pd.DataFrame(selector.scores_)
    featureScores = pd.concat([names, scores], axis=1)
    featureScores.columns = ["Specs","Score"]
    return featureScores.sort_values("Score", ascending=False)

In [None]:
# Chi-squared score of each training feature, highest first.
print(FeatureSelected(x_train,y_train))

In [None]:
# Restrict both splits to the selected feature columns.
x_train = x_train.loc[:, features]
x_test = x_test.loc[:, features]

In [None]:
# Training split after feature selection.
x_train.info()

In [None]:
# Hold-out split after feature selection.
x_test.info()

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.naive_bayes import BernoulliNB 
from sklearn import svm

from sklearn import metrics
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

In [None]:
def evaluation(y_test,y_pred):
    """Print the MAE, MSE and RMSE of ``y_pred`` measured against ``y_test``."""
    mae = metrics.mean_absolute_error(y_test, y_pred)
    mse = metrics.mean_squared_error(y_test, y_pred)
    print('Mean Absolute Error:', mae)
    print('Mean Squared Error:', mse)
    print('Root Mean Squared Error:', np.sqrt(mse))

In [None]:
# Registries filled by the model cells below: hold-out accuracy and fitted
# estimator, both keyed by a display name.
accuracy_dict, models = dict(), dict()

In [None]:
from sklearn.naive_bayes import BernoulliNB 
# Bernoulli naive Bayes baseline; fit() returns the estimator itself.
naive = BernoulliNB().fit(x_train, y_train)
y_pred = naive.predict(x_test)

accuracy_dict["Navier Bayes"] = accuracy_score(y_test, y_pred)
print(accuracy_dict["Navier Bayes"])
models["Navier Bayes"] = naive

In [None]:
from sklearn.neighbors import KNeighborsClassifier
# k-nearest-neighbours classifier with default settings.
k = KNeighborsClassifier().fit(x_train, y_train)
result = k.predict(x_test)

accuracy_dict["KNN"] = accuracy_score(y_test, result)
print(accuracy_dict["KNN"])
models["KNN"] = k

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler

# Standardise the features (scaler fitted on the training split only) and fit
# a logistic regression on the scaled matrix.
sc = StandardScaler()
X_train, X_test = sc.fit_transform(x_train), sc.transform(x_test)
Y_train, Y_test = y_train, y_test
lr = LogisticRegression().fit(X_train, Y_train)
Y_pred = lr.predict(X_test)

accuracy_dict["Logistic"] = accuracy_score(Y_test, Y_pred)
print(accuracy_dict["Logistic"])
models["Logistic"] = lr

In [None]:
from sklearn.tree import DecisionTreeClassifier
# Decision tree. The estimator is randomised internally, so a fixed seed is
# needed for the tree and its score to be reproducible between runs.
d = DecisionTreeClassifier(random_state=42)
d.fit(x_train, y_train)
y_pred = d.predict(x_test)

accuracy_dict["Decision Tree"] = accuracy_score(y_test, y_pred)
print(accuracy_dict["Decision Tree"])
models["Decision Tree"] = d

In [None]:
from sklearn.ensemble import RandomForestClassifier
# Random forest. Bootstrapping and feature sub-sampling are random, so a fixed
# seed is needed for the score to be reproducible between runs.
rf = RandomForestClassifier(random_state=42)
rf.fit(x_train, y_train)
y_pred = rf.predict(x_test)

accuracy_dict["Random Forest"] = accuracy_score(y_test, y_pred)
print(accuracy_dict["Random Forest"])
models["Random Forest"] = rf

In [None]:
from sklearn.ensemble import GradientBoostingClassifier
# Gradient boosting. The estimator accepts a random_state; fixing it keeps the
# fitted model and its score reproducible between runs.
gbc = GradientBoostingClassifier(random_state=42)
gbc.fit(x_train, y_train)
y_pred = gbc.predict(x_test)

accuracy_dict["Gradient Boost"] = accuracy_score(y_test, y_pred)
print(accuracy_dict["Gradient Boost"])
models["Gradient Boost"] = gbc

In [None]:
from sklearn.svm import SVC
# Support-vector classifier with default settings.
svc = SVC().fit(x_train, y_train)
y_pred = svc.predict(x_test)

accuracy_dict["SVM"] = accuracy_score(y_test, y_pred)
print(accuracy_dict["SVM"])
models["SVM"] = svc

In [None]:
from sklearn.preprocessing import StandardScaler
import tensorflow as tf

# Standardise the features; the scaler is fitted on the training split only.
sc = StandardScaler()
X_train = sc.fit_transform(x_train)
X_test = sc.transform(x_test)
Y_train = y_train
Y_test = y_test

# Fully connected binary classifier: four ReLU hidden layers and a sigmoid output.
ann = tf.keras.models.Sequential()
ann.add(tf.keras.layers.Dense(units=200,activation="relu"))
ann.add(tf.keras.layers.Dense(units=100,activation="relu"))
ann.add(tf.keras.layers.Dense(units=20,activation="relu"))
ann.add(tf.keras.layers.Dense(units=10,activation="relu"))
ann.add(tf.keras.layers.Dense(units=1,activation="sigmoid"))
ann.compile(optimizer="adam",loss="binary_crossentropy",metrics=['accuracy'])
# No steps_per_epoch: the hard-coded 250 was unrelated to len(X_train)/batch_size,
# so "epochs" did not correspond to passes over the data. Keras derives the
# step count from the array length when it is omitted.
ann.fit(X_train,Y_train,batch_size=32,epochs=100)
# The single sigmoid unit yields an (n, 1) array of probabilities; threshold
# at 0.5 and flatten to a 1-D label vector.
Y_pred = (ann.predict(X_test) > 0.5).ravel()
ann.summary()

accuracy_dict["ANN"] = accuracy_score(Y_test,Y_pred)
print(accuracy_dict["ANN"])
models["ANN"] = ann

In [None]:
from keras.wrappers.scikit_learn import KerasClassifier
from keras.utils import np_utils

# for modeling
from keras.models import Sequential
from keras.layers import Dense, Dropout
from keras.callbacks import EarlyStopping

# Train on the scaled training matrix (X_train / Y_train from the scaler cell
# above). The full frame X was never imputed or scaled and can still contain
# NaNs, which would make the loss NaN.
model = Sequential()
model.add(Dense(16, input_shape=(X_train.shape[1],), activation='relu')) # Add an input shape! (features,)
model.add(Dense(32, activation='relu'))
model.add(Dense(64, activation='relu'))
model.add(Dense(128, activation='relu'))
model.add(Dense(1, activation='sigmoid'))
model.summary() 
model.compile(optimizer='Adam', loss='binary_crossentropy', metrics=['accuracy'])

# early stopping callback
# This callback stops the training when the validation accuracy has not
# improved for 10 consecutive epochs, and restores the best weights seen.
es = EarlyStopping(monitor='val_accuracy', mode='max', patience=10, restore_best_weights=True)

# now we just update our model fit call
history = model.fit(X_train, np.asarray(Y_train, dtype="float32"),
                    callbacks=[es],
                    epochs=100, # you can set this to a big number!
                    batch_size=32,
                    validation_split=0.2,
                    shuffle=True,
                    verbose=1)

In [None]:
# Hold-out accuracy of every model registered so far.
accuracy_dict

In [None]:
def Preprocessing(data):
    """
    Run the full feature pipeline on a raw frame.

    Applies ``nullval``, ``convert`` and ``imputenull`` in that order and
    returns only the columns listed in the module-level ``features``.
    """
    prepared = imputenull(convert(nullval(data)))
    return prepared[features]

In [None]:
# Raw competition test set before preprocessing.
test_data.info()

In [None]:
# Overwrites the raw test frame with its preprocessed feature matrix; re-running
# this cell requires reloading test_data first.
test_data = Preprocessing(test_data)
test_data.info()

In [None]:
# Pick the model with the highest hold-out accuracy.
l = max(accuracy_dict, key=accuracy_dict.get)
best_model = models[l]

# "Logistic" and "ANN" were fitted on StandardScaler-transformed features, so
# they must be given scaled inputs here as well; every other registered model
# was fitted on the unscaled frame.
if l in ("Logistic", "ANN"):
    model_input = sc.transform(test_data)
else:
    model_input = test_data

final_predict = best_model.predict(model_input)
if l == "ANN":
    # The Keras model returns an (n, 1) array of probabilities, not labels.
    final_predict = (final_predict > 0.5).ravel()

In [None]:
# Start from the sample submission and overwrite its target column with our predictions.
submission = pd.read_csv("/kaggle/input/spaceship-titanic/sample_submission.csv").assign(
    Transported=final_predict
)

In [None]:
# index=False: without it the row index is written as an extra unnamed first
# column, which is not part of the submission format.
submission.to_csv("submission.csv", index=False)

In [None]:
# Two-column submission frame: passenger ids from the sample file plus our predictions.
my_submission = pd.DataFrame(
    {
        'PassengerId': submission["PassengerId"],
        'Transported': final_predict,
    }
)
# Any filename works; submission.csv is used here.
my_submission.to_csv('submission.csv', index=False)

In [None]:
# Final check of the submission frame's columns and dtypes.
submission.info()