In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import os
# Walk the Kaggle input mount and print the full path of every file found there.
for folder, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))


In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.pipeline import make_pipeline
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import SGDClassifier
import seaborn as sns
from sklearn.compose import make_column_transformer
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder
from sklearn import preprocessing
from sklearn import metrics

from sklearn.datasets import fetch_openml
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import mean_absolute_error

import sys

# Ignore every Python warning for the rest of the session, unless warning options
# were already supplied to the interpreter (sys.warnoptions is non-empty).
# NOTE(review): a blanket "ignore" may also hide deprecation warnings from the
# plotting/modelling libraries used below — consider narrowing the filter.
if not sys.warnoptions:
    import warnings
    warnings.simplefilter("ignore")

<h1>Spaceship Titanic Kaggle</h1>

In [None]:
from IPython.display import Image
# Display the JPEG stored under ../input/data-spaceship as this cell's output.
Image("../input/data-spaceship/space-photo-03.jpg")

### Solving the cosmic mystery of the Spaceship Titanic means understanding why about half of the passengers were transported and the others were not.
### The challenge is to predict which passengers were transported.
### Thanks to all the interesting notebooks I read beforehand, to the <a href="https://lazypredict.readthedocs.io/en/latest/readme.html">lazypredict</a> documentation, and to <a href="https://scikit-learn.org/stable/auto_examples/inspection/plot_permutation_importance.html#sphx-glr-auto-examples-inspection-plot-permutation-importance-py">sklearn</a> for all
### the useful information.

## GET AND READ THE DATA

In [None]:
from IPython.display import clear_output
# Install/upgrade lazypredict, which provides the LazyClassifier imported in the next cell.
# NOTE(review): both installs are unpinned, so a re-run may pull different versions;
# consider `%pip install -q <pkg>==<version>` so the kernel's own environment is targeted.
!pip install -U lazypredict
# NOTE(review): pandas has already been imported in the cells above, so an upgrade here
# presumably only takes effect after a kernel restart — confirm it is needed.
!pip install -U pandas

# Remove the pip log from the cell output.
clear_output()

In [None]:
import lazypredict
from lazypredict.Supervised import LazyClassifier

In [None]:
# Load the competition train/test CSV files into DataFrames.
# Both frames are modified by the preprocessing cells further down.
df_train=pd.read_csv('../input/spaceship-titanic/train.csv')
df_test=pd.read_csv('../input/spaceship-titanic/test.csv')

In [None]:
# Preview the first rows of the raw training frame.
df_train.head()

In [None]:
# Column dtypes and non-null counts of the raw training frame.
df_train.info()

In [None]:

# Summary statistics from describe(), rendered with a coloured Styler (blue background, red text).
df_train.describe().style.set_properties(**{'background-color': '#85C1E9','color': '#C0392B'})

In [None]:
# Count the passengers in each Transported class and wrap the counts in a one-column DataFrame.
# NOTE(review): pandas >= 2.0 names the value_counts() result 'count', so the column may be
# 'count' rather than 'Transported' — confirm against the installed pandas version.
transported=pd.DataFrame(df_train['Transported'].value_counts())
transported

In [None]:
# Pie chart of the target balance.
# The wedge sizes and the wedge labels are both read from the same value_counts()
# Series, so each wedge is always labelled with its own class. This also avoids
# looking the counts up by column name, which differs between pandas versions
# ('Transported' before 2.0, 'count' from 2.0 on).
transported_counts = df_train['Transported'].value_counts()
plt.figure(figsize=(4,4))
labels = [str(value) for value in transported_counts.index]
plt.pie(transported_counts.values, colors=['#22D8B4', '#8AED6F'], labels=labels, autopct='%1.1f%%', radius=2,
        startangle=90)
plt.title('Transported or not', fontsize=20, x=0.54, y=1.4)
plt.show()

## Some visualization

In [None]:
# Age histograms (bin width 3) in a grid: one column per HomePlanet, one row per Transported value.
# NOTE(review): sns.displot returns a FacetGrid, so the name `ax` is misleading here.
sns.set_theme(style="whitegrid")
ax=sns.displot(
    df_train, x="Age", col="HomePlanet", row="Transported",
    color='#0B606F',
    binwidth=3, height=3, facet_kws=dict(margin_titles=True),
)
# Figure-level title, placed above the grid (y > 1).
ax.fig.suptitle('Age passenger per Home planet', x=0.54, y=1.1, fontsize=30)
plt.show()

In [None]:
# Line plot of the number of training rows for each distinct Age value.
plt.figure(figsize=(10,7))
plt.plot(df_train.groupby(['Age']).size(), label="Age");
plt.legend();
plt.show()

In [None]:
# Summary statistics of the Age column only, shown with the same coloured Styler as above.
df_view=pd.DataFrame(df_train['Age'].describe())
df_view.style.set_properties(**{'background-color': '#85C1E9',
                           'color': '#C0392B'})


In [None]:
# Overlaid Age distributions (density histogram + KDE curve) for transported vs. not
# transported passengers.
# sns.distplot is deprecated; histplot(kde=True, stat="density") is the replacement
# recommended by seaborn for the same histogram-plus-KDE picture.
plt.figure(figsize=(10,7))
sns.histplot(df_train[df_train.Transported==True]["Age"], kde=True, stat="density",
             alpha=0.4, label="Transported");
sns.histplot(df_train[df_train.Transported==False]["Age"], kde=True, stat="density",
             alpha=0.4, label="Not Transported");
plt.legend();
plt.show()

In [None]:
# Age histograms (bin width 3) in a grid: one column per Destination, one row per Transported value.
# NOTE(review): sns.displot returns a FacetGrid, so the name `ax` is misleading here.
sns.set_theme(style="whitegrid")
ax=sns.displot(
    df_train, x="Age", col="Destination", row="Transported",
    color='#F53A24',
    binwidth=3, height=3, facet_kws=dict(margin_titles=True),
)
# Figure-level title, placed above the grid (y > 1).
ax.fig.suptitle('Age passenger per Destination', x=0.54, y=1.1, fontsize=30)
plt.show()

In [None]:
# Age histograms (bin width 3) in a grid: one column per Destination, one row per CryoSleep value.
# NOTE(review): sns.displot returns a FacetGrid, so the name `ax` is misleading here.
sns.set_theme(style="whitegrid")
ax=sns.displot(
    df_train, x="Age", col="Destination", row="CryoSleep",
    color='#34495E',
    binwidth=3, height=3, facet_kws=dict(margin_titles=True),
)
# Figure-level title, placed above the grid (y > 1).
ax.fig.suptitle('Destination Age passenger and CryoSleep', x=0.54, y=1.1, fontsize=30)
plt.show()

In [None]:
# Age histograms (bin width 3) in a grid: one column per HomePlanet, one row per CryoSleep value.
# NOTE(review): sns.displot returns a FacetGrid, so the name `ax` is misleading here.
sns.set_theme(style="whitegrid")
ax=sns.displot(
    df_train, x="Age", col="HomePlanet", row="CryoSleep",
    color='#C0392B',
    binwidth=3, height=3, facet_kws=dict(margin_titles=True),
)
# Figure-level title, placed above the grid (y > 1).
ax.fig.suptitle('Home planet and Age passenger per CryoSleep', x=0.54, y=1.1, fontsize=30)
plt.show()


In [None]:
# Overlaid Age distributions (density histogram + KDE curve) for passengers in and out
# of CryoSleep.
# sns.distplot is deprecated; histplot(kde=True, stat="density") is the replacement
# recommended by seaborn for the same histogram-plus-KDE picture.
# The explicit == comparisons are kept on purpose: rows where CryoSleep is missing
# match neither mask and are left out of both curves.
plt.figure(figsize=(10,7))
sns.histplot(df_train[df_train.CryoSleep==False]["Age"], kde=True, stat="density",
             alpha=0.4, label="Not CryoSleep");
sns.histplot(df_train[df_train.CryoSleep==True]["Age"], kde=True, stat="density",
             alpha=0.4, label="CryoSleep");
plt.suptitle('CryoSleep or not', x=0.54, y=1, fontsize=30)
plt.legend();
plt.show()

In [None]:
# Age histograms (bin width 3) in a grid: one column per HomePlanet, one row per Destination.
# NOTE(review): sns.displot returns a FacetGrid, so the name `ax` is misleading here.
sns.set_theme(style="whitegrid")
ax=sns.displot(
    df_train, x="Age", col="HomePlanet", row="Destination",
    color='#17A589',
    binwidth=3, height=3, facet_kws=dict(margin_titles=True),
)
# Figure-level title, placed above the grid (y > 1).
ax.fig.suptitle('Home planet and Destination', x=0.54, y=1.1, fontsize=30)
plt.show()


## Starting Preprocessing

In [None]:
# Drop the PassengerId and Name columns from both frames before modelling.
# (Cabin is kept here; it is split into deck/num/side in the next cells.)
# The submission cell reads its ids from sample_submission.csv, not from df_test.
df_train=df_train.drop(['PassengerId', 'Name'], axis=1)
df_test=df_test.drop(['PassengerId', 'Name'], axis=1)


In [None]:
# Split Cabin on '/' into three new columns (deck, num, side), then drop the original column.
# NOTE(review): `num` stays a string after the split and is later treated as a categorical feature.
df_train[['deck', 'num','side']] = df_train['Cabin'].str.split('/', expand=True)
df_train=df_train.drop(['Cabin'],axis=1)
df_train.head()

In [None]:
# Same Cabin split as for the training frame, applied to the test frame.
df_test[['deck', 'num','side']] = df_test['Cabin'].str.split('/', expand=True)
df_test=df_test.drop(['Cabin'],axis=1)
df_test.head()

In [None]:
# Map deck letters to integers in alphabetical order (A=0 ... G=6, T=7); values not in the mapping are left unchanged.
# NOTE(review): this mapping is duplicated in the next cell for df_test — keep the two in sync.
df_train['deck']=df_train['deck'].replace({'A':0, 'B':1, 'C':2, 'D':3, 'E':4, 'F':5, 'G':6, 'T':7})

In [None]:
# Same deck-letter-to-integer mapping as used for the training frame (A=0 ... G=6, T=7).
df_test['deck']=df_test['deck'].replace({'A':0, 'B':1, 'C':2, 'D':3, 'E':4, 'F':5, 'G':6, 'T':7})

In [None]:
# Re-check dtypes and non-null counts after the Cabin split and the deck mapping.
df_train.info()

In [None]:
# Preview the training frame with the new deck/num/side columns.
df_train.head()

In [None]:
# NOTE(review): duplicate of the info() call two cells above — df_train has not changed in between.
df_train.info()

In [None]:
# Column groups used by the imputers, encoders, scaler and ColumnTransformers below.
numerical_features=['Age', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck' ]
categorical_features=['HomePlanet', 'Destination','CryoSleep','deck', 'num', 'side','VIP']
# NOTE(review): All_columns is not referenced anywhere else in the visible notebook.
All_columns= df_train[numerical_features + categorical_features]

## Imputer to manage nan values

In [None]:
# ---- Numerical columns: fill gaps with the column mean learned on the training frame ----
imputer = SimpleImputer(strategy='mean', missing_values=np.nan)
df_train[numerical_features] = imputer.fit_transform(df_train[numerical_features])
# The test frame reuses the statistics fitted on the training frame.
df_test[numerical_features] = imputer.transform(df_test[numerical_features])

# ---- Categorical columns: fill gaps with the most frequent training value ----
imputer = SimpleImputer(strategy='most_frequent', missing_values=np.nan)
df_train[categorical_features] = imputer.fit_transform(df_train[categorical_features])
df_test[categorical_features] = imputer.transform(df_test[categorical_features])


## Verify nan values

In [None]:
# Remaining missing values per column after imputation.
df_train.isna().sum()

## Label encoder and Standard Scaler

In [None]:
###########Label Encoder And Standard Scaler###############
def label_encoder(train, test, columns):
    """Integer-encode the given columns of ``train`` and ``test`` with one shared mapping.

    Each column is cast to ``str`` and every distinct string is assigned its
    position in the sorted list of values found in *either* frame. Using one
    vocabulary for both frames guarantees that a given category receives the
    same code in ``train`` and in ``test``. Fitting a separate encoder per
    frame does not give that guarantee: the codes drift apart as soon as one
    frame lacks a category that the other one has.

    Parameters
    ----------
    train, test : pandas.DataFrame
        Frames to encode. Both are modified in place.
    columns : iterable of str
        Names of the columns to encode; each must exist in both frames.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        The same ``train`` and ``test`` objects, with the listed columns
        replaced by int64 codes.
    """
    for col in columns:
        train_values = train[col].astype(str)
        test_values = test[col].astype(str)
        # Sorted union of both frames: same ordering rule as a label encoder,
        # but shared so the codes are comparable across frames.
        classes = sorted(set(train_values) | set(test_values))
        train[col] = pd.Categorical(train_values, categories=classes).codes.astype("int64")
        test[col] = pd.Categorical(test_values, categories=classes).codes.astype("int64")
    return train, test

# Integer-encode every categorical column of both frames.
df_train , df_test = label_encoder(df_train, df_test ,categorical_features)
##############StandardScaler#############
# StandardScaler is already imported in the imports cell at the top of the notebook.
sc = StandardScaler()
# Learn mean/std on the training frame only ...
df_train[numerical_features] = sc.fit_transform(df_train[numerical_features])
# ... and reuse them for the test frame. Re-fitting on the test frame would scale
# it with different statistics than the data the models are trained on.
df_test[numerical_features] = sc.transform(df_test[numerical_features])


In [None]:
# Preview the training frame after encoding and scaling.
df_train.head()

## Evaluation model with lazyclassifier

In [None]:
# Feature matrix: every column except the target. Target vector: the Transported column.
X = df_train.drop(columns=['Transported'])
y = df_train.loc[:, 'Transported']

In [None]:
# Hold out half of the rows (test_size=.5) for scoring the LazyClassifier benchmark.
X_train, X_test, y_train, y_test = train_test_split(X, y,test_size=.5,random_state =123)

# Alternative stratified 80/20 split, currently disabled:
#X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8, test_size=0.2,stratify=y,random_state=142)
# classifiers='all' — presumably lazypredict's full classifier list; see its docs.
clf = LazyClassifier(verbose=0,ignore_warnings=True, custom_metric=None,classifiers='all')
# fit() returns a table of per-model scores and a predictions object.
models, predictions = clf.fit(X_train , X_test , y_train , y_test)
# Show the score table ordered by Accuracy, best first (the sorted copy is displayed, not stored).
models.sort_values(by=['Accuracy'], ascending=False)

In [None]:
# Line plot of the Accuracy column of the LazyClassifier score table, in the table's own row order.
plt.figure(figsize=(17,10))
sns.lineplot(data=models.Accuracy, palette="tab10", linewidth=2.5)
# Rotate the tick labels so the model names stay readable.
plt.xticks(rotation=90)
plt.show()

## First evaluation with randomForestClassifier for test

In [None]:
# Feature matrix: every column except the target. Target vector: the Transported column.
X = df_train.drop(columns=['Transported'])
y = df_train.loc[:, 'Transported']

# Stratified 80/20 split used by the random-forest pipeline.
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    train_size=0.8, test_size=0.2,
    stratify=y,
    random_state=1243,
)

In [None]:
# Random-forest baseline wrapped in a preprocessing pipeline.
# NOTE(review): the categorical columns were already integer-encoded and all NaNs were
# already imputed in the cells above, so the one-hot encoder works on the integer codes
# and the mean imputer should have nothing left to fill.
categorical_encoder = OneHotEncoder(handle_unknown="ignore")
numerical_pipe = Pipeline([("imputer", SimpleImputer(strategy="mean"))])

# NOTE(review): this assignment shadows the `preprocessing` module imported from sklearn
# in the imports cell.
preprocessing = ColumnTransformer(
    [
        ("cat", categorical_encoder, categorical_features),
        ("num", numerical_pipe, numerical_features),
    ]
)

rf = Pipeline(
    [
        ("preprocess", preprocessing),
        ("classifier", RandomForestClassifier(random_state=12)),
    ]
)
rf.fit(X_train, y_train)
# Pipeline.score reports the classifier's mean accuracy on the given split.
print("RF train accuracy: %0.3f" % rf.score(X_train, y_train))
print("RF test accuracy: %0.3f" % rf.score(X_test, y_test))

## AdaBoost baseline, then LGBMClassifier score and features

In [None]:
# Feature matrix: every column except the target. Target vector: the Transported column.
X = df_train.drop(columns=['Transported'])
y = df_train.loc[:, 'Transported']
# Stratified 80/20 split shared by the AdaBoost and LightGBM cells below.
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    train_size=0.8, test_size=0.2,
    stratify=y,
    random_state=32,
)

In [None]:
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import AdaBoostClassifier

# AdaBoost baseline with the library's default weak learner.
# `base_estimator=None` only restated the default, and the keyword has been removed
# from recent scikit-learn releases, so it is left out to keep the cell runnable.
clf = AdaBoostClassifier(n_estimators=100, learning_rate=1.0, random_state=0)
clf.fit(X_train, y_train)
print(clf.score(X_test, y_test)) # mean accuracy
y_pred = clf.predict(X_test)
# Confusion table: true labels as rows, predicted labels as columns.
pd.crosstab(y_test, y_pred, colnames=['Predicted'])

In [None]:
import lightgbm as lgb
# LightGBM classifier on the 80/20 split.
# The original call passed both n_estimators=10000 and num_iterations=80. LightGBM
# documents num_iterations as an alias of n_estimators, so the two values conflicted;
# the alias value (80) appears to be the one LightGBM uses (TODO confirm), so the
# number of boosting rounds is now stated once.
model=lgb.LGBMClassifier(boosting_type='gbdt',
                         n_estimators=80,
                         num_leaves=32,
                         max_bin=10,
                         learning_rate=0.05,
                         class_weight = None,
                         min_child_samples = 20,
                         subsample_for_bin = 10000,
                         n_jobs=-1,
                         random_state=1234).fit(X_train, y_train)
print(model.score(X_test, y_test)) # mean accuracy
y_pred = model.predict(X_test)
# Confusion table: true labels as rows, predicted labels as columns.
pd.crosstab(y_test, y_pred, colnames=['Predicted'])

In [None]:
# Re-fit the same model while recording log-loss on both splits, so the learning
# curves can be plotted further down.
# Progress is printed every 20 rounds through the log_evaluation callback, because
# the `verbose` argument of fit() was removed in LightGBM 4.
model.fit(X_train, y_train,
          eval_set=[(X_test, y_test), (X_train, y_train)],
          eval_metric='logloss',
          callbacks=[lgb.log_evaluation(period=20)])

In [None]:
# `model` is the LightGBM classifier, so label the scores accordingly (they were printed as "RF").
print("LGBM train accuracy: %0.3f" % model.score(X_train, y_train))
print("LGBM test accuracy: %0.3f" % model.score(X_test, y_test))

In [None]:
# Table of the fitted model's feature importances, smallest first so the largest bar is drawn on top.
df_importance = (
    pd.DataFrame({'feature': list(X_train.columns), 'importance': model.feature_importances_})
    .sort_values(by=['importance'], ascending=True)
    .reset_index(drop=True)
)
plt.figure(figsize=(7,7))
# Horizontal bars: one per feature, length = importance.
ax = plt.barh(y=df_importance.feature, width=df_importance.importance, color='r')
plt.title('Most important features')
plt.show()

In [None]:
# Plot the evaluation metric recorded during the fit that was given eval_set.
# NOTE(review): `lgb` must still be the lightgbm module here; the grid-search cell below rebinds that name.
lgb.plot_metric(model)

In [None]:
# Confusion matrix of the LightGBM model on the held-out split.
# metrics.plot_confusion_matrix was removed from scikit-learn;
# ConfusionMatrixDisplay.from_estimator is its documented replacement.
metrics.ConfusionMatrixDisplay.from_estimator(model, X_test, y_test, cmap='Blues_r')

In [None]:
# Text classification report (sklearn.metrics.classification_report) for the held-out split.
print(metrics.classification_report(y_test,model.predict(X_test)))

In [None]:
import time
start = time.time()
from sklearn.model_selection import GridSearchCV
import lightgbm as lgb

# Keep the estimator under its own name: assigning it to `lgb` would overwrite the
# lightgbm module alias and break any earlier cell that is re-run afterwards.
lgbm_estimator = lgb.LGBMClassifier()

# 5 * 3 * 4 * 3 * 3 = 540 parameter combinations, each cross-validated — this cell is slow.
parameters = {'num_leaves':[20,40,60,80,100], 'min_child_samples':[5,10,15],'max_depth':[-1,5,10,20],
             'learning_rate':[0.05,0.1,0.2],'reg_alpha':[0,0.01,0.03]}

# `clf` is reused by the pipeline cell that follows, so the name is kept.
clf = GridSearchCV(lgbm_estimator, parameters, scoring='accuracy')
clf.fit(X=X_train, y=y_train)

# Report the outcome and the cost of the search.
print(clf.best_params_)
print('Best cross-validated accuracy: %0.3f' % clf.best_score_)
print('Held-out accuracy: %0.3f' % clf.score(X_test, y_test))
print('Execution time is: %0.1f s' % (time.time() - start))

In [None]:
# Wrap the grid search from the previous cell in the same preprocessing pipeline as the random forest.
categorical_encoder = OneHotEncoder(handle_unknown="ignore")
numerical_pipe = Pipeline([("imputer", SimpleImputer(strategy="mean"))])

# NOTE(review): this assignment shadows the `preprocessing` module imported from sklearn
# in the imports cell.
preprocessing = ColumnTransformer(
    [
        ("cat", categorical_encoder, categorical_features),
        ("num", numerical_pipe, numerical_features),
    ]
)

# NOTE(review): on the right-hand side `clf` is still the GridSearchCV object, so fitting this
# pipeline runs the whole grid search a second time, now on the one-hot encoded features.
# Because `clf` is rebound to the pipeline, running this cell twice nests a pipeline inside a
# pipeline — the cell is not idempotent.
clf= Pipeline(
    [
        ("preprocess", preprocessing),
        ("classifier", clf),
    ]
)

clf.fit(X_train, y_train)

In [None]:
# `clf` is the pipeline around the LightGBM grid search, so label the scores accordingly
# (they were printed as "RF").
print("LGBM grid-search train accuracy: %0.3f" % clf.score(X_train, y_train))
print("LGBM grid-search test accuracy: %0.3f" % clf.score(X_test, y_test))

In [None]:
# Predict on the preprocessed test frame with the current `clf` (the grid-search pipeline at this point).
prediction=clf.predict(df_test)
# Start from the sample submission file and overwrite only its target column.
# assumes sample_submission.csv rows are in the same order as test.csv — TODO confirm
submission = pd.read_csv("../input/spaceship-titanic/sample_submission.csv")
submission["Transported"] = prediction
submission.to_csv("submission.csv", index=False)

## Using AdaBoostClassifier

## Evaluation

In [None]:
# NOTE(review): make_classification is imported but not used in the visible notebook.
from sklearn.datasets import make_classification
from sklearn.ensemble import AdaBoostClassifier


# Rebuild features/target and make a stratified split for the AdaBoost pipeline.
# No train/test sizes are passed, so train_test_split's default proportions apply.
X=df_train.drop(['Transported'], axis=1)
y=df_train['Transported']
X_train, X_test, y_train, y_test = train_test_split(X, y,stratify=y, random_state=0)




## In a pipeline

In [None]:
# AdaBoost (100 estimators) behind the same one-hot / mean-impute preprocessing as the earlier pipelines.
categorical_encoder = OneHotEncoder(handle_unknown="ignore")
numerical_pipe = Pipeline([("imputer", SimpleImputer(strategy="mean"))])

# NOTE(review): this assignment shadows the `preprocessing` module imported from sklearn
# in the imports cell.
preprocessing = ColumnTransformer(
    [
        ("cat", categorical_encoder, categorical_features),
        ("num", numerical_pipe, numerical_features),
    ]
)

# Rebinds `clf`, replacing the grid-search pipeline from the previous section.
clf = Pipeline(
    [
        ("preprocess", preprocessing),
        ("classifier", AdaBoostClassifier(n_estimators=100, random_state=0)),
    ]
)

clf.fit(X_train, y_train)

## Features Influence

## Score and Submission

In [None]:
# `clf` is the AdaBoost pipeline here, so label the scores accordingly (they were printed as "RF").
print("AdaBoost train accuracy: %0.3f" % clf.score(X_train, y_train))
print("AdaBoost test accuracy: %0.3f" % clf.score(X_test, y_test))

In [None]:
# Class balance of the submitted predictions.
# NOTE(review): `submission` was filled by the LightGBM grid-search pipeline earlier; the AdaBoost
# pipeline fitted in this section is never used to predict on df_test.
submission['Transported'].value_counts()

In [None]:
# Preview the first rows of the submission frame.
submission.head()