In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily build the full path of every file under the Kaggle input directory
# and print them one per line.
input_paths = (
    os.path.join(root, name)
    for root, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Read the train and test CSV files into DataFrames, then preview the first
# four training rows with a colour gradient applied.
df_train, df_test = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../input/spaceship-titanic/train.csv',
        '../input/spaceship-titanic/test.csv',
    )
)
df_train.head(4).style.background_gradient()

In [None]:
# Report (rows, columns) of both frames, one per line.
print(
    f'Train Data Shape: {df_train.shape}',
    f'Test Data Shape: {df_test.shape}',
    sep='\n',
)

In [None]:
# Summary statistics of the training frame, shown with a colour gradient.
train_summary = df_train.describe()
train_summary.style.background_gradient()

In [None]:
# Keep the target column aside; it is dropped from the features below.
Transported_data = df_train['Transported']

In [None]:
# Stack the train features (target removed) on top of the test rows so both
# go through the same preprocessing. The original row labels are kept
# (no ignore_index), so labels repeat between the two parts.
train_features = df_train.drop(columns='Transported')
df_combined = pd.concat([train_features, df_test], axis=0)
print(f'Shape of the combined dataset: {df_combined.shape}')

In [None]:
# Number of missing values per column in the combined frame.
df_combined.isna().sum()

In [None]:
# Work on a copy so the untouched combined frame stays available.
df_combined1 = df_combined.copy()

# Columns stored with dtype 'object' are treated as categorical; every other
# column is treated as numerical.
is_object_col = df_combined1.dtypes == 'object'

categorical = df_combined1.columns[is_object_col]
print(f'Columns with categorical data:\n {categorical}\n')

numerical = df_combined1.columns[~is_object_col]
print(f'Columns with numerical data:\n {numerical}')

In [None]:
# Show how the mode of HomePlanet looks when rendered as text, before and
# after trimming. Note that str.strip('0 ') removes any run of '0' and ' '
# characters from BOTH ends of the string, not just the leading index label.
x = df_combined1['HomePlanet'].mode()
y = x.to_string()
y_converted_to_string = y.strip('0 ')

print(f'String before: {y}')
print(f'String after: {y_converted_to_string}')

In [None]:
# Impute missing values in the combined frame.
#
# Categorical columns (except CryoSleep, which is handled in its own cell)
# are filled with their most frequent value. The value is taken directly with
# .mode().iloc[0] so it keeps its original type. The previous approach went
# through to_string()/strip('0 '), which produced text: a boolean-valued
# column would have been filled with the string 'False', and a column with
# several modes would have been filled with a multi-line string.
#
# The result of fillna is assigned back to the column instead of using
# inplace=True on a column selected from the frame, because that chained form
# is not guaranteed to modify the frame (see pandas Copy-on-Write).
for col in categorical:
    if col == 'CryoSleep':
        continue
    most_frequent = df_combined1[col].mode().iloc[0]
    df_combined1[col] = df_combined1[col].fillna(most_frequent)

# Numerical columns are filled with their median.
for col in numerical:
    df_combined1[col] = df_combined1[col].fillna(df_combined1[col].median())

df_combined1.isnull().sum()

In [None]:
# Most frequent CryoSleep value (used to decide the fill value below).
df_combined1['CryoSleep'].mode()

In [None]:
# Fill missing CryoSleep values with False. The filled column is assigned
# back explicitly: calling fillna(inplace=True) on a column pulled out of the
# frame is not guaranteed to update the frame itself.
df_combined1['CryoSleep'] = df_combined1['CryoSleep'].fillna(False)

In [None]:
# Re-check the per-column count of missing values after imputation.
df_combined1.isna().sum()

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import cufflinks as cf
import plotly.express as px

# from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
# init_notebook_mode(connected=True)

# NOTE(review): presumably switches cufflinks/plotly to offline mode so
# figures render inside the notebook — confirm against the cufflinks docs.
cf.go_offline()

In [None]:
# Build a plotting frame from the training part of the imputed data.
# - The row count comes from df_train instead of a hard-coded 8693.
# - .copy() makes this an independent frame, so adding columns to it does not
#   write into a slice of df_combined1 (avoids SettingWithCopyWarning).
# - The target is attached positionally; it has exactly one value per
#   training row.
df_visualize = df_combined1.iloc[:len(df_train)].copy()
df_visualize['Transported'] = Transported_data.to_numpy()

In [None]:
# Horizontal histogram of HomePlanet counts, coloured by Transported.
home_planet_colors = {True: "MediumPurple", False: "lightblue"}
fig = px.histogram(
    df_visualize,
    y='HomePlanet',
    color='Transported',
    color_discrete_map=home_planet_colors,
    width=700,
    height=400,
)
fig.update_layout(
    template='plotly_dark',
    title='Original Planet and Transported People',
    font={'family': "PT Sans", 'size': 14},
)
fig

In [None]:
# Horizontal histogram of CryoSleep counts, coloured by Transported.
cryosleep_colors = {True: "seagreen", False: "yellowgreen"}
fig = px.histogram(
    df_visualize,
    y='CryoSleep',
    color='Transported',
    color_discrete_map=cryosleep_colors,
    width=700,
    height=400,
)
fig.update_layout(
    template='plotly_dark',
    title='Cryosleep and Transported People Analysis',
    font={'family': "PT Sans", 'size': 14},
    yaxis_title='Was the passenger in Cryosleep?',
    xaxis_title='Number of People',
)

fig

In [None]:
# Horizontal histogram of Destination counts, coloured by Transported.
destination_colors = {True: "saddlebrown", False: "lightcoral"}
fig = px.histogram(
    df_visualize,
    y='Destination',
    color='Transported',
    color_discrete_map=destination_colors,
    width=700,
    height=400,
)
fig.update_layout(
    template='plotly_dark',
    title='Destination and Transported people analysis',
    font={'family': "PT Sans", 'size': 14},
    yaxis_title='Destination',
    xaxis_title='Number of People',
)

fig

In [None]:
# 'Age', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck'
# Box plot of Age split by Transported, with every point drawn alongside.
age_colors = {True: "MediumPurple", False: "lightblue"}
fig = px.box(
    df_visualize,
    y='Age',
    color='Transported',
    color_discrete_map=age_colors,
    points='all',
    title='Distribution of the Age',
    width=700,
    height=400,
)

fig.update_layout(template='plotly_dark', font={'family': 'Sans', 'size': 17})
# Shrink the individual point markers so they do not hide the boxes.
fig.update_traces(marker={'size': 0.75})
fig

In [None]:
# Total spend across all five amenity columns.
# The sum is wrapped in parentheses so it can span several lines. Without
# them, the assignment ended after the first line and the remaining three
# terms formed a separate expression whose value was discarded, so only
# RoomService + Spa was stored.
df_visualize['Recreational_activities'] = (
    df_visualize['RoomService']
    + df_visualize['Spa']
    + df_visualize['FoodCourt']
    + df_visualize['ShoppingMall']
    + df_visualize['VRDeck']
)

In [None]:
# Box plot of the Recreational_activities column split by Transported, with
# every point drawn alongside. The title now names the plotted column; it
# previously said "Room Service", which disagreed with both the data and the
# x-axis label.
recreation_colors = {True: 'orchid', False: "lightblue"}
fig = px.box(
    df_visualize,
    x='Recreational_activities',
    color='Transported',
    color_discrete_map=recreation_colors,
    points='all',
    title='Distribution of the expenditure on Recreational Activities',
    width=700,
    height=400,
)

fig.update_layout(
    template='plotly_dark',
    font=dict(family='Sans', size=17),
    xaxis_title='Expenditure on Recreational Activities',
)
# Shrink the individual point markers so they do not hide the boxes.
fig.update_traces(marker=dict(size=0.75))
fig

In [None]:
# Preview the first two rows of the imputed frame with a colour gradient.
combined_preview = df_combined1.head(2)
combined_preview.style.background_gradient()

In [None]:
# Save the full passenger ids of the test rows for the submission file.
# The split point comes from df_train instead of a hard-coded 8693.
Passenger_Id_of_Test_data = df_combined1['PassengerId'].iloc[len(df_train):]

# Keep only the first four characters of each id (presumably the travel-group
# prefix — confirm against the dataset description). The vectorised .str
# accessor replaces the per-element lambda.
df_combined1['PassengerId'] = df_combined1['PassengerId'].str[:4]

In [None]:
# First ten shortened passenger ids.
df_combined1['PassengerId'].head(10)

In [None]:
# Remove the columns that are not used as model features, then preview.
unused_columns = ['Name', 'Cabin', 'PassengerId']
df_combined1.drop(columns=unused_columns, inplace=True)
df_combined1.head(2)

In [None]:
# One-hot encode the remaining categorical columns. The three columns that
# were dropped from the frame are removed from the column list first, and the
# first level of each encoded column is dropped (drop_first=True).
dummy_columns = categorical.drop(labels=['Name', 'Cabin', 'PassengerId'])
df_final = pd.get_dummies(
    df_combined1,
    columns=dummy_columns,
    drop_first=True,
)

In [None]:
# First five rows of the encoded feature frame.
df_final.head(5)

In [None]:
# (rows, columns) of the encoded feature frame.
df_final.shape

In [None]:
# Training part of the encoded frame: the first len(df_train) rows, since the
# train rows were concatenated first. Replaces the hard-coded 8693.
train_data = df_final.iloc[:len(df_train)]

In [None]:
# Feature matrix and target vector for modelling.
X, y = train_data, Transported_data

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 25% of the training rows for validation; the fixed random_state
# makes the split repeatable.
split_parts = train_test_split(
    train_data,
    y,
    test_size=0.25,
    random_state=12,
)
X_train, X_test, y_train, y_test = split_parts

In [None]:
from sklearn.linear_model import LogisticRegression
# Fit a logistic regression (fit returns the estimator itself) and report its
# mean accuracy on the held-out split.
model1 = LogisticRegression(max_iter=10000).fit(X_train, y_train)
model1.score(X_test, y_test)

In [None]:
from sklearn.svm import SVC
# Fit a support-vector classifier with default settings (fit returns the
# estimator itself) and report its mean accuracy on the held-out split.
model2 = SVC().fit(X_train, y_train)
model2.score(X_test, y_test)

In [None]:
# Test part of the encoded frame: every row after the first len(df_train)
# rows. Replaces the hard-coded 8693.
test_data = df_final.iloc[len(df_train):]
# Predict the target for the test rows with the SVC model.
predictions = model2.predict(test_data)

In [None]:
# Build the submission table by pairing ids and predictions positionally.
# The earlier concat(axis=1) aligned the two pieces on their row labels,
# which only lined up because the test rows still carried their original
# 0-based labels; constructing the frame from raw values removes that
# hidden dependency and the follow-up column rename.
final_pred = pd.DataFrame({
    'PassengerId': Passenger_Id_of_Test_data.to_numpy(),
    'Transported': predictions,
})

In [None]:
# First five rows of the submission table.
final_pred.head(5)

In [None]:
# Write the submission file without the row index.
submission_path = '/kaggle/working/submission.csv'
final_pred.to_csv(submission_path, index=False)