# **Configuring**


### *Import libraries*

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from plotly.subplots import make_subplots
import plotly.graph_objects as go
import plotly.express as px

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import StratifiedKFold, train_test_split

### *Reading files*

In [None]:
# Load the competition's train and test CSV files (train is read first).
train_data, test_data = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../input/spaceship-titanic/train.csv',
        '../input/spaceship-titanic/test.csv',
    )
)

# **EDA**

### *Checking train and test files*

In [None]:
# Preview the first rows of the test set to check its column layout
# (as the last expression of the cell, the result is rendered by the notebook).
test_data.head()

In [None]:
# Dimensions of the test set: shape[0] is the row count, shape[1] the column count.
# The labels name the test data because the values are read from `test_data`.
print(f'Number of rows in test data: {test_data.shape[0]}')
print(f'Number of columns in test data: {test_data.shape[1]}')

In [None]:
# Summary statistics of the test set, using the pandas `describe` defaults.
test_data.describe()

In [None]:
# Number of missing entries in each test column, largest count first.
print("Missing values in the test file")
print(test_data.isna().sum().sort_values(ascending=False))

In [None]:
# Preview the first rows of the training set to check its column layout.
train_data.head()

In [None]:
# Dimensions of the training set: shape[0] is the row count, shape[1] the column count.
print(f'Number of rows in train data: {train_data.shape[0]}')
print(f'Number of columns in train data: {train_data.shape[1]}')

In [None]:
# Summary statistics of the training set, using the pandas `describe` defaults.
train_data.describe()

In [None]:
# Number of missing entries in each training column, largest count first.
print("Missing values in the train file")
print(train_data.isna().sum().sort_values(ascending = False))

### *Checking null values*

In [None]:
def _null_count_bar(null_counts):
    """Build a horizontal bar trace with one bar per row of *null_counts*.

    *null_counts* is a one-column DataFrame (column label ``0``) holding the
    number of missing values, indexed by the name of the column it describes.
    """
    return go.Bar(
        x=null_counts[0],
        y=null_counts.index,
        orientation="h",
        marker=dict(
            # One color index per bar, so the color list always matches the
            # number of columns being plotted instead of a fixed count.
            color=list(range(len(null_counts))),
            line_color='rgb(0,0,0)',
            line_width=2,
            coloraxis="coloraxis",
        ),
    )


# Missing-value count per column, largest first.
test_null = pd.DataFrame(test_data.isna().sum())
test_null = test_null.sort_values(by=0, ascending=False)

# Remove the target column by name before counting (presumably it is absent
# from the test file, so this keeps both panels on the same feature columns).
# Slicing off the last row after sorting is not reliable: when several columns
# tie on the lowest count, the row that ends up last need not be the target.
train_null = pd.DataFrame(
    train_data.drop(columns=["Transported"], errors="ignore").isna().sum()
)
train_null = train_null.sort_values(by=0, ascending=False)

# Side-by-side panels: train on the left, test on the right.
fig = make_subplots(rows=1,
                    cols=2,
                    column_titles=["Train Data", "Test Data"],
                    x_title="Missing Values")

fig.add_trace(_null_count_bar(train_null), row=1, col=1)
fig.add_trace(_null_count_bar(test_null), row=1, col=2)

# Kept as the final expression so the notebook renders the returned figure.
fig.update_layout(showlegend=False, title_text="Distribution of null values in the column", title_x=0.5)

### *Distribution of Age*

In [None]:
# Work on copies so the tagging column never reaches the modeling frames.
train_age, test_age = train_data.copy(), test_data.copy()
# Tag every row with the split it came from; the histogram is colored by it.
train_age["type"], test_age["type"] = "Train", "Test"
ageDf = pd.concat([train_age, test_age])

# Age histogram colored by split, with a box plot drawn in the margin.
fig = px.histogram(
    ageDf,
    x="Age",
    color="type",
    nbins=100,
    marginal="box",
    template="plotly_white",
    color_discrete_sequence=['#58D68D', '#DE3163'],
)
fig.update_layout(title="Distribution of Age", title_x=0.5)
fig.show()

### *Target Distribution*


In [None]:
TARGET = "Transported"

# One row per target class with its frequency. `value_counts` orders rows by
# frequency, so the row position says nothing about which class it holds.
target_df = pd.DataFrame(train_data[TARGET].value_counts()).reset_index()
target_df.columns = [TARGET, 'count']

fig = px.bar(data_frame=target_df,
             x=TARGET,
             y='count')
fig.update_traces(marker_color=['#58D68D', '#DE3163'],
                  marker_line_color='rgb(0,0,0)',
                  marker_line_width=2)
fig.update_layout(title="Target Distribution",
                  template="plotly_white",
                  title_x=0.5)

# Pick each class by its value, not by row position, so the "= 0" / "= 1"
# labels always describe the class they name regardless of which one is more
# frequent. NOTE(review): assumes the target is boolean or 0/1 - confirm.
total_rows = train_data.shape[0]
not_transported_count = target_df.loc[target_df[TARGET] == 0, 'count'].sum()
transported_count = target_df.loc[target_df[TARGET] == 1, 'count'].sum()
print("\033[94mPercentage of Transported = 0: {:.2f} %".format(not_transported_count * 100 / total_rows))
print("\033[94mPercentage of Transported = 1: {:.2f} %".format(transported_count * 100 / total_rows))
fig.show()

### *Preparing data for modeling*

In [None]:
RANDOM_STATE = 12
FOLDS = 5
STRATEGY = 'median'

# Filling null values
# Numeric columns: the imputer is fitted on the training set only, and the
# same fitted statistics are then applied to both train and test.
imputer_cols = ["Age", "FoodCourt", "ShoppingMall", "Spa", "VRDeck", "RoomService"]
imputer = SimpleImputer(strategy=STRATEGY)
imputer.fit(train_data[imputer_cols])
train_data[imputer_cols] = imputer.transform(train_data[imputer_cols])
test_data[imputer_cols] = imputer.transform(test_data[imputer_cols])

# HomePlanet: missing values get the placeholder category 'Z'.
# Assign the result back instead of calling fillna(inplace=True) on the
# selected column: the in-place call acts on an intermediate object, which
# pandas warns about and which leaves the frame unchanged under Copy-on-Write.
train_data["HomePlanet"] = train_data["HomePlanet"].fillna('Z')
test_data["HomePlanet"] = test_data["HomePlanet"].fillna('Z')

In [None]:
# Columns to convert to integer codes with `label_encoder`.
# NOTE(review): "Cabin" is encoded here but dropped again in the next cell
# before modeling, so encoding it looks unnecessary - confirm before removing.
label_cols = ["HomePlanet", "CryoSleep","Cabin", "Destination" ,"VIP"]
def label_encoder(train_data, test_data, columns):
    """Replace the given columns with integer codes shared by both frames.

    Each column is cast to ``str`` (so missing values become their string
    form) and encoded with a single ``LabelEncoder`` fitted on the values of
    both frames together. Fitting one encoder per column, rather than one per
    frame, guarantees that a category maps to the same integer in train and
    test and that categories present in only one frame still get a code.

    Args:
        train_data: Training DataFrame; modified in place.
        test_data: Test DataFrame; modified in place.
        columns: Names of the columns to encode; each must exist in both frames.

    Returns:
        The same ``(train_data, test_data)`` objects, with the listed columns
        replaced by their integer codes.
    """
    for col in columns:
        train_data[col] = train_data[col].astype(str)
        test_data[col] = test_data[col].astype(str)
        # Fit on the union of both splits so the code assignment is identical.
        encoder = LabelEncoder().fit(pd.concat([train_data[col], test_data[col]]))
        train_data[col] = encoder.transform(train_data[col])
        test_data[col] = encoder.transform(test_data[col])
    return train_data, test_data

# Encode the columns listed in `label_cols` in both frames and rebind the returned frames.
train_data ,test_data = label_encoder(train_data,test_data ,label_cols)

In [None]:
# Remove the Name and Cabin columns from both frames, in place.
train_data.drop(columns=["Name", "Cabin"], inplace=True)
test_data.drop(columns=["Name", "Cabin"], inplace=True)

# **Modeling**

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Feature columns fed to the model, and the training target.
features = ["Age", "FoodCourt", "ShoppingMall", "Spa", "VRDeck" ,"RoomService", "Destination", "HomePlanet", "CryoSleep"]
y = train_data[TARGET]

# NOTE(review): get_dummies is applied to each split independently; if any of
# these columns were still categorical, the two frames could end up with
# different columns - confirm they are all numeric at this point.
X = pd.get_dummies(train_data[features])
X_test = pd.get_dummies(test_data[features])

# Fit a random forest on the training features, then predict the test rows.
model = RandomForestClassifier(n_estimators=100, max_depth=10, random_state=2).fit(X, y)
predictions = model.predict(X_test)

# Write the submission file: one prediction per test PassengerId.
output = pd.DataFrame({'PassengerId': test_data['PassengerId'], 'Transported': predictions})
output.to_csv('submission.csv', index=False)