In [None]:
import os
# List every file available under the Kaggle input directory.
for folder, _subdirs, file_names in os.walk('/kaggle/input'):
    for file_name in file_names:
        print(os.path.join(folder, file_name))
        
%matplotlib inline
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Prepare data

In [None]:
# Load the competition's training and test splits.
df_train, df_test = (
    pd.read_csv('/kaggle/input/spaceship-titanic/train.csv'),
    pd.read_csv('/kaggle/input/spaceship-titanic/test.csv'),
)

In [None]:
# Preview the first rows of the raw training data.
df_train.head()

# EDA 

In [None]:
# Check the training set: column names, non-null counts and dtypes.
df_train.info()

In [None]:
# Count the missing values in each column of the training data.
df_train.isna().sum()

In [None]:
# Count the missing values in each column of the test data.
df_test.isna().sum()

In [None]:
#Handling Missing Data
#Impute missing values with each column's mode (its most frequent value)

In [None]:
from sklearn.impute import SimpleImputer

In [None]:
#Train & Test columns
# Remember the column labels so they can be restored after imputation.
train_columns, test_columns = df_train.columns, df_test.columns

In [None]:
#Imputing values
# Fill every missing value with the most frequent value of its column.
# The test set is filled with statistics learned from the TRAINING features
# (fit on train, transform test) instead of re-fitting on the test set, so both
# sets are imputed consistently.
# Assumes every test column also exists in the training frame.
feature_imputer = SimpleImputer(strategy="most_frequent").fit(df_train[test_columns])
df_test = feature_imputer.transform(df_test)

# The training frame (including the target column) is imputed with its own modes.
# Both results are NumPy arrays, so the column labels are lost at this point.
imputer = SimpleImputer(strategy="most_frequent")
df_train = imputer.fit_transform(df_train)

In [None]:
# The imputer returned a NumPy ndarray (column labels are gone);
# preview only the first few rows instead of printing the whole array.
df_train[:5]

In [None]:
#Bring columns back
# Rebuild labelled DataFrames from the imputed arrays. The arrays mix strings and
# numbers, so every column would otherwise stay object-dtype; infer_objects()
# restores proper numeric/boolean dtypes where the values allow it.
df_train = pd.DataFrame(df_train,columns=train_columns).infer_objects()
df_test = pd.DataFrame(df_test,columns=test_columns).infer_objects()

In [None]:
# Preview the training data after imputation.
df_train.head(5)

In [None]:
#Separate composite features so each part can be checked for correlation
cabin_columns = ['Deck','Deck Number','Side']
id_columns = ['Passenger Group','Passenger Number']


def split_composite_columns(df):
    """Split 'Cabin' (on '/') and 'PassengerId' (on '_') into component columns.

    Returns a (cabin_parts, id_parts) pair of DataFrames labelled with
    ``cabin_columns`` and ``id_columns`` respectively.
    """
    cabin_parts = df['Cabin'].str.split('/', expand=True)
    cabin_parts.columns = cabin_columns
    id_parts = df['PassengerId'].str.split('_', expand=True)
    id_parts.columns = id_columns
    return cabin_parts, id_parts


# Apply the same split to the train and test sets
sepr_cabin, sepr_id = split_composite_columns(df_train)
sepr_cabin_test, sepr_id_test = split_composite_columns(df_test)

# Append the new columns to the main data frames
df_train = pd.concat([df_train, sepr_cabin, sepr_id], axis=1)
df_test = pd.concat([df_test, sepr_cabin_test, sepr_id_test], axis=1)


In [None]:
#Drop the raw composite columns now that their parts have been extracted
raw_composite_columns = ['PassengerId', 'Cabin']
df_train = df_train.drop(columns=raw_composite_columns)
df_test = df_test.drop(columns=raw_composite_columns)

In [None]:
# Preview the training data with the split Cabin / PassengerId columns.
df_train.head(5)

**Visualization**

In [None]:
# Summarise the object-dtype columns (count, unique values, top value, frequency).
df_train.describe(include=['O'])

In [None]:
# Bar plot of the mean Transported value for each HomePlanet
sns.catplot(data=df_train, x="HomePlanet", y="Transported", kind="bar")

In [None]:
# Bar plot of the mean Transported value for each CryoSleep state
sns.catplot(data=df_train, x="CryoSleep", y="Transported", kind="bar")

In [None]:
# Bar plot of the mean Transported value for each Destination
sns.catplot(data=df_train, x="Destination", y="Transported", kind="bar")

In [None]:
# Bar plot of the mean Transported value for VIP vs. non-VIP passengers
sns.catplot(data=df_train, x="VIP", y="Transported", kind="bar")

In [None]:
# Bar plot of the mean Transported value for each cabin Deck
sns.catplot(data=df_train, x="Deck", y="Transported", kind="bar")

In [None]:
# Bar plot of the mean Transported value for each cabin Side
sns.catplot(data=df_train, x="Side", y="Transported", kind="bar")

In [None]:
# Bar plot of the mean Transported value for each Passenger Number within a group
sns.catplot(data=df_train, x="Passenger Number", y="Transported", kind="bar")

In [None]:
#Data preprocessing
# Encoding categorical data
from sklearn.preprocessing import OrdinalEncoder,LabelEncoder

In [None]:
# Categorical features (ordinal-encoded before modelling)
cat_columns = [
    'HomePlanet',
    'CryoSleep',
    'Destination',
    'VIP',
    'Deck',
    'Side',
    'Passenger Number',
]
# Numeric features (used as-is)
num_columns = [
    'Age',
    'RoomService',
    'FoodCourt',
    'ShoppingMall',
    'Spa',
    'VRDeck',
]

In [None]:
# Encoding categorical data
# Fit ONE encoder on the training data and reuse it for the test data so that a
# given category always maps to the same integer in both sets. Fitting a second
# encoder on the test set could assign different codes whenever the two sets do
# not contain exactly the same categories.
# Categories that appear only in the test set are encoded as -1.
ordinal_encoder = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)

encoder_train = ordinal_encoder.fit_transform(df_train[cat_columns])
encoder_train = pd.DataFrame(encoder_train,columns = cat_columns)

encoder_test = ordinal_encoder.transform(df_test[cat_columns])
encoder_test = pd.DataFrame(encoder_test,columns = cat_columns)

In [None]:
# Feature matrices: the encoded categorical columns followed by the numeric columns.
x_train = pd.concat([encoder_train, df_train[num_columns]], axis='columns')
x_test = pd.concat([encoder_test, df_test[num_columns]], axis='columns')

# Target: encode 'Transported' as integer labels, kept as a one-column DataFrame.
# Note: despite its name, `label_encoder` holds the encoded label array, not the encoder.
label_encoder = LabelEncoder().fit_transform(df_train['Transported'])
y_train = pd.DataFrame({'Transported': label_encoder})

# Modeling

In [None]:
from sklearn.ensemble import RandomForestClassifier,GradientBoostingClassifier
from sklearn.naive_bayes import GaussianNB
from catboost import CatBoostClassifier
from sklearn.metrics import accuracy_score,confusion_matrix,ConfusionMatrixDisplay,f1_score
from sklearn.model_selection import train_test_split

In [None]:
# Hold out 25% of the labelled data for validation (fixed seed for reproducibility)
X_train, X_test, Y_train, Y_test = train_test_split(
    x_train,
    y_train,
    test_size=0.25,
    random_state=42,
)

In [None]:
# Train a depth-limited random forest and predict on the validation split.
model_rf = RandomForestClassifier(max_depth=10,random_state=42)
# Y_train is a one-column DataFrame; flatten it to the 1-D target scikit-learn
# expects, which avoids a DataConversionWarning during fit.
model_rf.fit(X_train,Y_train.to_numpy().ravel())
y_pred = model_rf.predict(X_test)

In [None]:
# Score the validation predictions
val_accuracy = accuracy_score(Y_test, y_pred)
val_f1 = f1_score(Y_test, y_pred)
print("Accuracy Score: ", val_accuracy)
print("F1 Score: ", val_f1)

In [None]:
# Predict on the competition test features and wrap the labels in a one-column DataFrame
test_predictions = model_rf.predict(x_test)
y_final = pd.DataFrame({'Transported': test_predictions})

In [None]:
# Inspect the predicted labels for the test set.
y_final

In [None]:
# Re-read the raw test file: 'PassengerId' was dropped from df_test earlier
# but is required in the submission.
Psnger_group_test = pd.read_csv('/kaggle/input/spaceship-titanic/test.csv')

In [None]:
# Pair each PassengerId with its prediction (rows are aligned on the shared index)
submission = pd.concat(
    [Psnger_group_test['PassengerId'], y_final],
    axis='columns',
)

In [None]:
# Convert the 0/1 predictions to booleans for the submission file. astype(bool)
# yields a real boolean column and avoids the deprecated implicit downcasting
# that replace() relies on when swapping integers for booleans.
submission['Transported'] = submission['Transported'].astype(bool)

In [None]:
# Display the submission indexed by PassengerId. set_index returns a new frame
# and the result is not assigned, so `submission` itself is left unchanged.
submission.set_index('PassengerId')

In [None]:
# Write the submission file without the row index.
submission.to_csv('submission.csv',index=False)