# Spaceship Titanic Kaggle Project

In [None]:
import pandas as pd

In [None]:
import numpy as np
import matplotlib.pyplot as plt

## Importing Data

In [None]:
# Read the training set from the local data/ directory into a DataFrame.
df = pd.read_csv("data/train.csv")

In [None]:
# Display the loaded frame.
df

In [None]:
# (n_rows, n_columns) of the training frame.
df.shape

## Studying Data

In [None]:
# Class balance of the target column.
df["Transported"].value_counts()

There are 4378 people (50.36%) who were transported and 4315 people (49.64%) who were not

In [None]:
# Bar chart of the target class counts, one colour per class.
df["Transported"].value_counts().plot(kind="bar", color=["salmon", "lightblue"])

In [None]:
# Column dtypes and non-null counts.
df.info()

In [None]:
# Number of missing values per column.
df.isna().sum()

In [None]:
# Summary statistics of the frame.
df.describe()

In [None]:
# Per-column medians, ignoring missing values.
# numeric_only=True limits the reduction to numeric/bool columns: pandas >= 2.0
# raises TypeError when asked for the median of the string columns instead of
# silently skipping them as older versions did.
df.median(axis=0, skipna=True, numeric_only=True)

In [None]:
# Histogram of passenger ages.
df.Age.plot.hist()

The above histogram shows that most people in our dataset are 16 to 30 years of age

In [None]:
# Count of distinct values in every column; kept in a variable for later display.
unique_values = df.nunique()
unique_values

In [None]:
# Distinct HomePlanet values.
df["HomePlanet"].unique()

In [None]:
# Contingency table: passengers per HomePlanet, split by Transported.
pd.crosstab(df.HomePlanet, df.Transported)

In [None]:
# Visualizing the crosstab as a grouped bar chart
pd.crosstab(df["HomePlanet"], df["Transported"]).plot(
    kind="bar",
    color=["lightcoral", "aquamarine"],
    figsize=(10, 6),
)
# Label the axes, then add the title and the legend
plt.xlabel("Home Planet")
plt.ylabel("Frequency")
plt.title("Transported people according to Home Planet")
plt.legend(["Not Transported", "Transported"])
# Keep the planet names horizontal; the trailing semicolon hides the cell output
plt.xticks(rotation=0);

In [None]:
# Re-display the per-column distinct counts.
unique_values

In [None]:
# Contingency table: passengers per HomePlanet, split by CryoSleep.
pd.crosstab(df.HomePlanet, df.CryoSleep)

In [None]:
# Grouped bar chart of CryoSleep status per home planet
pd.crosstab(df["HomePlanet"], df["CryoSleep"]).plot(
    kind="bar",
    color=["salmon", "lightblue"],
    figsize=(10, 6),
)
# Title and axis labels
plt.title("Number of passengers that are in Cryo Sleep according to Home Planet")
plt.xlabel("Home Planet")
plt.ylabel("Frequency")
# Keep the planet names horizontal; the trailing semicolon hides the cell output
plt.xticks(rotation=0);

In [None]:
# Re-display the per-column distinct counts.
unique_values

In [None]:
# Correlation matrix
# Only numeric/bool columns can be correlated. They are selected explicitly
# because pandas >= 2.0 raises on the string columns instead of dropping them,
# and select_dtypes behaves the same on every pandas version.
corr_matrix = df.select_dtypes(include=["number", "bool"]).corr()
corr_matrix

In [None]:
import seaborn as sns

# Annotated heatmap of the correlation matrix on a 15x10 inch figure
fig, ax = plt.subplots(figsize=(15, 10))
ax = sns.heatmap(
    corr_matrix,
    ax=ax,
    cmap="YlGnBu",
    fmt=".2f",
    annot=True,
    linewidths=0.5,
)

## Manipulating Data
so that it can be used to train a model

In [None]:
# Number of missing values per column.
df.isna().sum()

In [None]:
# First rows of the HomePlanet column.
df["HomePlanet"].head()

In [None]:
# Converting PassengerId to numeric
# Work on a copy: a plain `df_mod = df` only aliases the raw frame, so every
# later preprocessing step would silently overwrite `df` as well.
df_mod = df.copy()
df_mod['PassengerId'] = df_mod['PassengerId'].astype(str).astype(int)
df_mod

In [None]:
# Column dtypes and non-null counts after the PassengerId conversion.
df_mod.info()

In [None]:
# Converting string values into category values
string_columns = [
    name
    for name, column in df_mod.items()
    if pd.api.types.is_string_dtype(column)
]
for name in string_columns:
    # Store each string column as an ordered categorical
    df_mod[name] = df_mod[name].astype("category").cat.as_ordered()

In [None]:
# Column dtypes and non-null counts after the category conversion.
df_mod.info()

In [None]:
# Checking if there is any string column
for name, column in df_mod.items():
    if not pd.api.types.is_string_dtype(column):
        continue
    print(name)

In [None]:
# Saving data
# NOTE(review): this writes to the working directory, while the later save and
# both CSV reads use the data/ folder -- confirm the location is intended.
df_mod.to_csv("modified-train-data.csv", index=False)

In [None]:
# Number of missing values per column.
df_mod.isnull().sum()

## Filling missing values

In [None]:
# Checking for numeric data types
for name, column in df_mod.items():
    if not pd.api.types.is_numeric_dtype(column):
        continue
    print(name)

In [None]:
# Checking which numeric columns have null values
for name, column in df_mod.items():
    if pd.api.types.is_numeric_dtype(column) and column.isna().any():
        print(name)

In [None]:
# Median of each numeric feature column, printed one per line
for column in ("Age", "RoomService", "FoodCourt", "ShoppingMall", "Spa", "VRDeck"):
    print(df_mod[column].median())

In [None]:
# Mean of each numeric feature column, printed one per line
for column in ("Age", "RoomService", "FoodCourt", "ShoppingMall", "Spa", "VRDeck"):
    print(df_mod[column].mean())

In [None]:
# Filling missing numeric rows with --median
def fill_num_val(df):
    """Impute the numeric columns of ``df`` in place with their median.

    For every numeric column that contains at least one null, a boolean
    ``<column>_is_missing`` column marking the originally-null rows is
    added, and the nulls are replaced by that column's median. Numeric
    columns without nulls and non-numeric columns are left untouched.
    Returns ``None``.
    """
    # Snapshot the columns first: new indicator columns are added in the loop.
    for name, column in list(df.items()):
        if not pd.api.types.is_numeric_dtype(column):
            continue
        null_mask = column.isna()
        if not null_mask.any():
            continue
        df[name + "_is_missing"] = null_mask
        df[name] = column.fillna(column.median())

In [None]:
# Median-impute the numeric columns of the training frame in place.
fill_num_val(df_mod)

In [None]:
# Checking which numeric columns have null values
def check_num_values(df):
    """Print the name of every numeric column of ``df`` that contains nulls."""
    for name, column in df.items():
        if pd.api.types.is_numeric_dtype(column) and column.isna().any():
            print(name)

In [None]:
# Prints any numeric column that still contains nulls after the imputation.
check_num_values(df_mod)

In [None]:
# Display the frame, including the added *_is_missing indicator columns.
df_mod

In [None]:
# Number of missing values per column.
df_mod.isna().sum()

## Converting categorical values into numbers

In [None]:
# Checking categorical variables
def check_cat_var(df):
    """Print the name of every column of ``df`` whose dtype is not numeric."""
    for name, column in df.items():
        if pd.api.types.is_numeric_dtype(column):
            continue
        print(name)

In [None]:
# List the non-numeric columns of the training frame.
check_cat_var(df_mod)

In [None]:
# Filling categorical null values
def fill_cat_null_values(df):
    """Replace every non-numeric column of ``df`` with integer codes, in place.

    For each non-numeric column a boolean ``<column>_is_missing`` column is
    added (whether or not the column has nulls), and the column itself is
    replaced by its categorical codes shifted up by one, so that nulls
    (code -1) become 0 and the categories start at 1. Returns ``None``.
    """
    non_numeric = [
        name
        for name, column in df.items()
        if not pd.api.types.is_numeric_dtype(column)
    ]
    for name in non_numeric:
        column = df[name]
        df[name + "_is_missing"] = column.isna()
        df[name] = pd.Categorical(column).codes + 1

In [None]:
# Encode every non-numeric column of the training frame in place.
fill_cat_null_values(df_mod)

In [None]:
# NOTE(review): HomePlanet was already replaced by integer codes in the previous
# cell, so this shows the codes of those integers, not of the original labels.
pd.Categorical(df_mod["HomePlanet"]).codes

In [None]:
# Column dtypes and non-null counts after the encoding.
df_mod.info()

In [None]:
# First rows of the encoded frame.
df_mod.head()

In [None]:
# Number of missing values per column.
df_mod.isna().sum()

In [None]:
# Saving modified data
# Write the imputed and encoded training frame, without the index column.
df_mod.to_csv("data/modified-data.csv", index=False)
print("Data Saved")

## Modelling

In [None]:
# First rows of the frame that will be used for modelling.
df_mod.head()

In [None]:
# The PassengerId column (converted to int earlier in the notebook).
df_mod["PassengerId"]

In [None]:
# df_train = df_mod.iloc[:6950]
# df_val = df_mod[6950:]
# df_train.shape, df_val.shape

In [None]:
# # Building training dataset
# X_train = df_train.drop("Transported", axis=1)
# y_train = df_train.Transported

# # Building validation dataset
# X_valid = df_val.drop("Transported", axis = 1)
# y_valid = df_val.Transported

# # Determining shape
# X_train.shape, y_train.shape, X_valid.shape, y_valid.shape

In [None]:
from sklearn.model_selection import train_test_split

# Features are every column except the target.
X = df_mod.drop("Transported", axis=1)
y = df_mod["Transported"]
# Hold out 20% of the rows for evaluation. random_state pins the shuffle so
# the scores below are reproducible instead of changing on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)
X_train.shape, y_train.shape, X_test.shape, y_test.shape

In [None]:
from sklearn.ensemble import RandomForestClassifier
# Baseline 1: random forest with default hyperparameters.
clf = RandomForestClassifier()
clf.fit(X_train, y_train)

In [None]:
# Score of the random forest on the held-out split.
rfc_score = clf.score(X_test, y_test)
rfc_score

In [None]:
from sklearn.neighbors import KNeighborsClassifier
# Baseline 2: k-nearest neighbours with default settings.
# NOTE(review): the features are not scaled anywhere before this
# distance-based model -- confirm that is intended.
neigh = KNeighborsClassifier()
neigh.fit(X_train, y_train)

In [None]:
# Score of k-nearest neighbours on the held-out split.
neigh_score = neigh.score(X_test, y_test)
neigh_score

In [None]:
from sklearn.linear_model import LogisticRegression
# Baseline 3: logistic regression with default settings.
lr = LogisticRegression()
lr.fit(X_train, y_train)

In [None]:
# Score of logistic regression on the held-out split.
lr_score = lr.score(X_test, y_test)
lr_score

In [None]:
from sklearn.svm import SVC
# Baseline 4: support vector classifier with default settings.
svc = SVC()
svc.fit(X_train, y_train)

In [None]:
# Score of the support vector classifier on the held-out split.
svc_score = svc.score(X_test, y_test)
svc_score

In [None]:
# Collect the held-out score of each baseline under its display label
model_scores = dict(
    zip(
        ("Random Forest Classifier", "KNeighborsClassifier", "Logistic Regression", "SVC Score"),
        (rfc_score, neigh_score, lr_score, svc_score),
    )
)
model_scores

In [None]:
# Single "accuracy" row with one column per model
model_compare = pd.DataFrame([model_scores], index=["accuracy"])
# Transposed so that every model gets its own bar; the semicolon hides the cell output
model_compare.T.plot.bar();

Since the most accurate model is RandomForestClassifier(), we will tune its hyperparameters.

## GridSearchCV

In [None]:
# Search space for the random forest: 6 * 5 * 9 * 10 * 3 = 8,100 combinations.
grid = {
    "n_estimators" : [10, 100, 200, 500, 1000, 1200],
    "max_depth" : [None, 5, 10, 20, 30],
    "min_samples_split" : np.arange(2,20,2),  # 2, 4, ..., 18
    "min_samples_leaf" : np.arange(1, 20, 2),  # 1, 3, ..., 19
    "max_features" : [0.5, 1, "sqrt"]
}

In [None]:
%%time
from sklearn.model_selection import GridSearchCV
# Seed NumPy's global random generator before fitting.
np.random.seed(42)
# Rebinds `clf` (previously the baseline forest) to a new, unfitted forest
# that uses all cores and is limited to max_samples=1000.
clf = RandomForestClassifier(n_jobs=-1,
                            max_samples=1000)
# Exhaustive search over `grid` with 5-fold cross-validation, i.e. five fits
# per parameter combination; verbose=2 enables progress logging.
gs_clf = GridSearchCV(estimator=clf,
                     param_grid=grid,
                     cv=5,
                     verbose=2)
gs_clf.fit(X_train, y_train)

In [None]:
# Parameter combination selected by the grid search.
gs_clf.best_params_

In [None]:
# Score of the tuned forest on the held-out split.
gs_clf.score(X_test, y_test)

In [None]:
# Held-out predictions from the tuned forest.
y_preds = gs_clf.predict(X_test)

In [None]:
from sklearn.metrics import accuracy_score
# Accuracy of those predictions, computed explicitly.
accuracy_score(y_test, y_preds)

In [None]:
# Search space for logistic regression.
# A plain penalty x solver product contains pairs that LogisticRegression
# rejects (e.g. lbfgs with l1, liblinear with no penalty), and elasticnet
# needs an l1_ratio. Those candidates fail at fit time instead of being
# searched, so the grid is a list of sub-grids holding only valid pairs.
grid_lr = [
    {"solver": ["lbfgs", "newton-cg"], "penalty": ["l2", None]},
    {"solver": ["liblinear"], "penalty": ["l1", "l2"]},
    {"solver": ["saga"], "penalty": ["l1", "l2", None]},
    # elasticnet is only supported by saga; 0.5 is an even L1/L2 mix --
    # extend the list to search other mixes.
    {"solver": ["saga"], "penalty": ["elasticnet"], "l1_ratio": [0.5]},
]

In [None]:
# Rebinds `lr` (previously the fitted baseline) to a new, unfitted estimator.
lr = LogisticRegression(n_jobs=-1)
# Search over `grid_lr` with 5-fold cross-validation; verbose=2 enables
# progress logging.
gs_lr = GridSearchCV(estimator=lr,
                    param_grid=grid_lr,
                    cv=5,
                    verbose=2)
gs_lr.fit(X_train, y_train)

In [None]:
# Score of the tuned logistic regression on the held-out split.
gs_lr.score(X_test, y_test)

In [None]:
# Baseline scores again, for comparison with the tuned models.
model_scores

In [None]:
# Parameter combination selected by the logistic-regression search.
gs_lr.best_params_

# Working on test data

In [None]:
# Read the test set from the local data/ directory and display it.
test_df = pd.read_csv('data/test.csv')
test_df

In [None]:
# Number of missing values per column in the test set.
test_df.isna().sum()

In [None]:
# Checking num values
check_num_values(test_df)

In [None]:
# Median-impute the numeric columns of the test frame in place.
# NOTE(review): the medians are computed from the test data itself, not taken
# from the training data -- confirm that is intended.
fill_num_val(test_df)

In [None]:
# Prints any numeric column that still contains nulls after the imputation.
check_num_values(test_df)

In [None]:
# List the non-numeric columns that still need encoding.
check_cat_var(test_df)

In [None]:
# Encode the non-numeric columns in place.
# NOTE(review): the codes are derived from the test values alone, so the same
# label may map to a different integer than it did in the training frame.
fill_cat_null_values(test_df)

In [None]:
# Display the preprocessed test frame.
test_df

In [None]:
# Side-by-side look at the training features and the test frame.
X_train.head()

In [None]:
test_df.head()

In [None]:
# Column lists of both frames, to spot any mismatch before predicting.
test_df.columns

In [None]:
X_train.columns

In [None]:
# fill_cat_null_values treated the string PassengerId as a category, which
# added this indicator column that the training features do not have.
test_df = test_df.drop('PassengerId_is_missing', axis = 1)
# It also replaced PassengerId with category codes, whereas the training frame
# converted the raw id straight to int. Re-derive the id the same way from the
# raw file so the model sees the encoding it was trained on. No rows were
# dropped or reordered, so the values line up by position.
test_df['PassengerId'] = (
    pd.read_csv('data/test.csv')['PassengerId'].astype(str).astype(int).to_numpy()
)

In [None]:
# Predict the test set with the tuned logistic regression.
# NOTE(review): the text above names the random forest as the most accurate
# model, yet the predictions come from gs_lr -- confirm which model is intended.
test_preds = gs_lr.predict(test_df)

In [None]:
test_preds

In [None]:
# Re-read the raw test file to recover the original PassengerId strings,
# since test_df no longer holds them after the encoding step.
getting_pass_id = pd.read_csv('data/test.csv')

In [None]:
getting_pass_id['PassengerId']

In [None]:
# Start an empty frame for the predictions.
df_preds = pd.DataFrame()

In [None]:
# Pair each original passenger id with its prediction.
df_preds["PassengerId"] = getting_pass_id["PassengerId"]
df_preds["Transported"] = test_preds
df_preds

In [None]:
# Write the predictions to disk without the index column.
df_preds.to_csv('data/test_preds_2.csv', index=False)