In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
# Kaggle input locations of the Spaceship Titanic CSV files.
train_string = "/kaggle/input/spaceship-titanic/train.csv"
test_string = "/kaggle/input/spaceship-titanic/test.csv"

# `train` / `test` stay untouched for the modelling pipeline; `train_copy` is an
# independent (deep) copy used for exploration, so EDA columns never leak into `train`.
# Copying the loaded frame avoids parsing the same CSV from disk a second time.
train = pd.read_csv(train_string)
train_copy = train.copy()
test = pd.read_csv(test_string)

## Exploratory Data Analysis

In [None]:
# Mean of every numeric column per Transported class.
# numeric_only=True is needed because the frame also holds non-numeric columns;
# on pandas >= 2.0 averaging those raises a TypeError instead of silently dropping them.
train_copy.groupby("Transported").mean(numeric_only=True).reset_index()

From this, we can see that Age, RoomService, Spa, and VRDeck all differ noticeably between the two Transported groups, so they are likely to be important features.

In [None]:
# x/y must be passed by keyword: seaborn >= 0.12 accepts only `data` positionally.
ax = sns.barplot(data=train_copy, x='HomePlanet', y='Transported')
ax.set_title('Mean Transported by HomePlanet')
plt.show()

The bar plot above shows that HomePlanet has a large impact on the Transported status, so it will be part of our model.

In [None]:
# x/y must be passed by keyword: seaborn >= 0.12 accepts only `data` positionally.
ax = sns.barplot(data=train_copy, x='CryoSleep', y='Transported')
ax.set_title('Mean Transported by CryoSleep')
plt.show()

CryoSleep has a great impact on the Transported status. This feature will definitely be part of our model. 

In [None]:
# x/y must be passed by keyword: seaborn >= 0.12 accepts only `data` positionally.
ax = sns.barplot(data=train_copy, x='Destination', y='Transported')
ax.set_title('Mean Transported by Destination')
plt.show()

Destination also has some impact on the Transported status. We will probably include it in our model as well.

In [None]:
# x/y must be passed by keyword: seaborn >= 0.12 accepts only `data` positionally.
ax = sns.barplot(data=train_copy, x='VIP', y='Transported')
ax.set_title('Mean Transported by VIP')
plt.show()

The VIP feature does not look that important, so we will not include it in our model.

## Feature Engineering

In [None]:
# Mean of every numeric column per Transported class (used to decide how to group the spend columns).
# numeric_only=True is needed because the frame also holds non-numeric columns;
# on pandas >= 2.0 averaging those raises a TypeError instead of silently dropping them.
train_copy.groupby("Transported").mean(numeric_only=True).reset_index()

The means of RoomService, Spa, and VRDeck differ significantly between transported and non-transported passengers. We can combine these three columns into a single Luxury feature that holds the total luxury spending.

The other spending features, FoodCourt and ShoppingMall, can be combined into a Basic feature.

In [None]:
# Preview the first rows only instead of rendering the whole frame.
train_copy.head()

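We can extract whether the passenger was travelling alone or with family from the PassengerId: passengers whose ids share the same prefix before the underscore belong to the same group.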

We can extract the cabin deck, deck number, and side from the Cabin feature.

Names are unique, so we cannot extract anything from the feature

In [None]:
# Split the Cabin string into its parts with regular expressions.
# Raw strings keep "\d" from being treated as an (invalid) string escape sequence,
# and expand=False returns a Series, which is what a single-column assignment needs.
cabin = train_copy['Cabin']
train_copy['Deck'] = cabin.str.extract(r'([A-Z])/', expand=False)      # capital letter directly before a "/"
# NOTE(review): this captures exactly ONE digit (the first digit that is directly
# followed by "/"), so multi-digit numbers are reduced to their last digit.
# Use r'(\d+)/' if the full number is wanted - the Deck_No plot would then need binning.
train_copy['Deck_No'] = cabin.str.extract(r'(\d)/', expand=False)
train_copy['Side'] = cabin.str.extract(r'/([A-Z])', expand=False)      # capital letter directly after a "/"

In [None]:
# x/y must be passed by keyword: seaborn >= 0.12 accepts only `data` positionally.
ax = sns.barplot(data=train_copy, x='Deck', y='Transported')
ax.set_title('Mean Transported by Deck')
plt.show()

The variation in the bar heights shows that Deck is an important feature, so we will include it in the model.

In [None]:
# x/y must be passed by keyword: seaborn >= 0.12 accepts only `data` positionally.
ax = sns.barplot(data=train_copy, x='Deck_No', y='Transported')
ax.set_title('Mean Transported by Deck_No')
plt.show()

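Because the bar heights are almost the same for all bars, we will not use this feature in the model. (Note that Deck_No, as extracted above, holds only a single digit — the one directly before a `/` in Cabin — rather than the full number.)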

In [None]:
# x/y must be passed by keyword: seaborn >= 0.12 accepts only `data` positionally.
ax = sns.barplot(data=train_copy, x='Side', y='Transported')
ax.set_title('Mean Transported by Side')
plt.show()

There is some difference in the bar heights, so we will use Side as one of the features in the model. 

In [None]:
# Group id = the part of PassengerId before the underscore.
# Vectorised on purpose: the previous row-by-row `train_copy['Group'][i] = ...` was a
# chained assignment (SettingWithCopyWarning; writes nothing under pandas copy-on-write).
train_copy['Group'] = train_copy['PassengerId'].str.split('_').str[0]

# Size of the group each row belongs to; one value_counts() lookup replaces the
# quadratic list.count() call per row.
group_sizes = train_copy['Group'].map(train_copy['Group'].value_counts())

# A passenger is "Alone" when nobody else shares the group id, otherwise "Family".
train_copy["Support"] = group_sizes.gt(1).map({True: "Family", False: "Alone"})

In [None]:
# x/y must be passed by keyword: seaborn >= 0.12 accepts only `data` positionally.
ax = sns.barplot(data=train_copy, x='Support', y='Transported')
ax.set_title('Mean Transported by Support')
plt.show()

Alone vs. Family is an important feature, so we will use it in our model.

## Data Processing

In [None]:
def process(df, test_or_train, drop_luxury_components=False):
    """Build the model-ready feature frame from a raw passenger frame.

    The input frame is NOT modified; every derived value is computed on the side.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw data containing PassengerId, HomePlanet, CryoSleep, Destination, Cabin,
        Age, RoomService, FoodCourt, ShoppingMall, Spa and VRDeck (plus Transported
        when ``test_or_train`` is true).
    test_or_train : bool
        True for training data (the Transported column is carried over into the
        result), False for test data (no Transported column is read).
    drop_luxury_components : bool, default False
        When True, RoomService, Spa and VRDeck are removed from the result so that
        only their sum (Luxury) remains. The default keeps them, which is what this
        function has always returned (the earlier ``final_df.drop(...)`` call
        discarded its result and therefore never removed anything).

    Returns
    -------
    pandas.DataFrame
        Columns, in order: [Transported], Age, RoomService, Basic, Spa, VRDeck,
        the one-hot columns for Deck, Side, Support, HomePlanet, Destination and
        CryoSleep, and finally Luxury.
    """
    # --- Alone vs Family -------------------------------------------------------
    # Passengers whose PassengerId shares the text before "_" form one group.
    group_id = df["PassengerId"].str.split("_").str[0]
    group_size = group_id.map(group_id.value_counts())
    support = group_size.gt(1).map({True: "Family", False: "Alone"})

    # --- Missing numeric values -> column mean ---------------------------------
    # The means are taken from the frame passed in, so train and test are each
    # imputed with their own means.
    numeric_columns = ["Age", "RoomService", "FoodCourt", "ShoppingMall", "Spa", "VRDeck"]
    filled = df[numeric_columns].fillna(df[numeric_columns].mean())

    # --- Assemble the features (insertion order defines the column order) -------
    features = {
        "HomePlanet": df["HomePlanet"],
        "CryoSleep": df["CryoSleep"],
        "Destination": df["Destination"],
    }
    if test_or_train:
        features["Transported"] = df["Transported"]
    features.update({
        "Support": support,
        "Age": filled["Age"],
        "RoomService": filled["RoomService"],
        "Basic": filled["FoodCourt"] + filled["ShoppingMall"],
        "Spa": filled["Spa"],
        "VRDeck": filled["VRDeck"],
        # Deck: capital letter directly before a "/"; Side: capital letter directly after a "/".
        "Deck": df["Cabin"].str.extract(r"([A-Z])/", expand=False),
        "Side": df["Cabin"].str.extract(r"/([A-Z])", expand=False),
    })
    final_df = pd.DataFrame(features)

    # --- One-hot encoding ------------------------------------------------------
    encoded = ["Deck", "Side", "Support", "HomePlanet", "Destination", "CryoSleep"]
    final_df = pd.get_dummies(final_df, columns=encoded, prefix=encoded)

    # --- Luxury = total spend on RoomService, Spa and VRDeck -------------------
    final_df["Luxury"] = filled["RoomService"] + filled["Spa"] + filled["VRDeck"]
    if drop_luxury_components:
        final_df = final_df.drop(columns=["Spa", "RoomService", "VRDeck"])

    return final_df

In [None]:
# Engineered training features; the Transported target is kept (test_or_train=True).
new_df = process(df=train, test_or_train=True)

## Model Development

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
# Fixed seed so the forest (and therefore the CV scores and the submission) is reproducible.
RANDOM_STATE = 42

cv = KFold(n_splits=10)

# Split the target off the feature frame. Later cells use `new_df` as the feature
# matrix, so the column has to leave `new_df` itself. The guard keeps this cell
# re-runnable: on a second run the column is already gone and `y` is kept as is.
if "Transported" in new_df.columns:
    y = new_df.pop("Transported")

model = RandomForestClassifier(
    bootstrap=True,
    n_estimators=1600,
    min_samples_split=2,
    min_samples_leaf=4,
    max_features='sqrt',
    max_depth=10,
    random_state=RANDOM_STATE,
)

# k-fold cross validation
scores = cross_val_score(model, new_df, y, scoring='accuracy',
                         cv=cv, n_jobs=-1)
print(scores)
print(f"mean accuracy: {scores.mean():.4f}")

In [None]:
# Train the final model on the complete training set (features in new_df, target in y).
model.fit(X=new_df, y=y)

In [None]:
import matplotlib.pyplot as plt
# Feature importances of the fitted forest, sorted so the most important feature is on top.
# The figure size is set on this figure only, instead of changing the global rcParams.
importances = pd.Series(model.feature_importances_, index=new_df.columns).sort_values()
fig, ax = plt.subplots(figsize=(12.0, 16))
ax.barh(importances.index, importances.values)
ax.set_xlabel("Feature importance")
ax.set_title("Random forest feature importances")
plt.show()

In [None]:
# Engineer the test features and align them to the columns the model was trained on.
# One-hot encoding is done per frame, so a category missing from (or only present in)
# the test set would otherwise give a different column set and make predict() fail;
# missing dummy columns are filled with 0 and unknown ones are dropped.
process_test = process(test, False).reindex(columns=new_df.columns, fill_value=0)
prediction = model.predict(process_test)

# Submission file: one row per test passenger with the predicted Transported value.
final_submission = pd.DataFrame({
    "PassengerId": test["PassengerId"],
    "Transported": prediction,
})
final_submission.to_csv("submission.csv", index=False)