In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier, export_graphviz
from IPython.display import SVG
from graphviz import Source
import itertools
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.model_selection import train_test_split, cross_val_score, RandomizedSearchCV
from sklearn.metrics import accuracy_score, roc_auc_score, recall_score, confusion_matrix
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import RandomOverSampler
from imblearn.pipeline import Pipeline
import xgboost as xgb
import warnings
warnings.filterwarnings("ignore")

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the Kaggle input directory.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for file_name in files:
        full_path = os.path.join(root, file_name)
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Load the three competition CSVs: the training data, the test data, and the
# sample submission (its PassengerId column is reused when the submission is built).
df = pd.read_csv("/kaggle/input/spaceship-titanic/train.csv")
df_test = pd.read_csv("/kaggle/input/spaceship-titanic/test.csv")
df_sub = pd.read_csv("/kaggle/input/spaceship-titanic/sample_submission.csv")

In [None]:
# Preview the first rows of the training frame.
df.head()

In [None]:
# Column dtypes and non-null counts of the training frame.
df.info()

In [None]:
# Missing values per column in the raw training frame.
df.isnull().sum()

In [None]:
# Impute missing numeric values with each column's median.
# `numeric_only=True` restricts the median to numeric columns; without it,
# pandas >= 2.0 tries to take the median of the string columns and raises TypeError.
# Non-numeric columns are left untouched here.
df = df.fillna(df.median(numeric_only=True))

In [None]:
# Re-check missing values per column after the median fill.
df.isnull().sum()

In [None]:
# Drop every row that still contains a missing value in any column.
df = df.dropna()

In [None]:
# Confirm that no missing values remain after dropping incomplete rows.
df.isnull().sum()

# Exploratory Data Analysis

In [None]:
# Row count per home planet (non-null `Transported` entries in each group).
homeplanet_counts = (
    df.groupby("HomePlanet")[["Transported"]]
    .count()
    .reset_index()
)
fig = px.bar(
    homeplanet_counts,
    x='HomePlanet',
    y='Transported',
    height=400,
    text_auto=True,
)
fig.show()

In [None]:
# Row count per destination, largest group first.
destination_counts = df.groupby("Destination")[["Transported"]].count().reset_index()
destination_counts = destination_counts.sort_values(["Transported"], ascending=False)
fig = px.bar(
    destination_counts,
    x='Destination',
    y='Transported',
    height=400,
    text_auto=True,
    color_discrete_sequence=px.colors.qualitative.Dark24,
)
fig.show()

In [None]:
# Mean age for each value of `Transported`, drawn as pie slices.
mean_age_by_outcome = (
    df.groupby(["Transported"])[["Age"]]
    .mean()
    .reset_index()
)
fig = px.pie(mean_age_by_outcome, names='Transported', values='Age', height=400)
fig.show()

In [None]:
# Average RoomService spend per home planet, highest first.
roomservice_by_planet = df.groupby(["HomePlanet"])[["RoomService"]].mean().reset_index()
roomservice_by_planet = roomservice_by_planet.sort_values(["RoomService"], ascending=False)
fig = px.bar(
    roomservice_by_planet,
    x='HomePlanet',
    y='RoomService',
    height=400,
    text_auto=True,
)
fig.show()



In [None]:
# Average ShoppingMall spend per home planet, highest first.
shoppingmall_by_planet = (
    df.groupby(["HomePlanet"])[["ShoppingMall"]]
    .mean()
    .reset_index()
    .sort_values(["ShoppingMall"], ascending=False)
)
fig = px.bar(
    shoppingmall_by_planet,
    x='HomePlanet',
    y='ShoppingMall',
    height=400,
    text_auto=True,
    color_discrete_sequence=px.colors.qualitative.Bold,
)
fig.show()

In [None]:
# Average Spa spend per home planet, highest first.
spa_by_planet = (
    df.groupby(["HomePlanet"])[["Spa"]]
    .mean()
    .reset_index()
    .sort_values(["Spa"], ascending=False)
)
fig = px.bar(
    spa_by_planet,
    x='HomePlanet',
    y='Spa',
    height=400,
    text_auto=True,
    color_discrete_sequence=px.colors.qualitative.Dark2,
)
fig.show()

In [None]:
# Average VRDeck spend per home planet, highest first.
vrdeck_by_planet = (
    df.groupby(["HomePlanet"])[["VRDeck"]]
    .mean()
    .reset_index()
    .sort_values(["VRDeck"], ascending=False)
)
fig = px.bar(
    vrdeck_by_planet,
    x='HomePlanet',
    y='VRDeck',
    height=400,
    text_auto=True,
    color_discrete_sequence=px.colors.qualitative.Pastel,
)
fig.show()

In [None]:
# Distribution of passenger ages.
fig = px.histogram(
    df,
    x="Age",
    color_discrete_sequence=px.colors.qualitative.T10,
)
fig.show()

In [None]:
# Split `Name` into first and last name on the first space only.
# `n=1` caps the split at two parts, so a name containing extra spaces still
# yields exactly two columns; an unbounded split would produce three or more
# columns and make this two-column assignment fail.
df[['first_name', 'last_name']] = df['Name'].str.split(pat=' ', n=1, expand=True)

# Show a preview rather than the whole frame.
df.head()

In [None]:
# Ten most frequent first names (row count per name, descending).
first_name_counts = df.groupby("first_name")[["Transported"]].count().reset_index()
top_first_names = first_name_counts.sort_values(["Transported"], ascending=False).head(10)
fig = px.bar(
    top_first_names,
    x="first_name",
    y="Transported",
    color_discrete_sequence=px.colors.qualitative.Pastel2,
    text_auto=True,
)
fig.show()

In [None]:
# Ten most frequent last names (row count per name, descending).
last_name_counts = df.groupby("last_name")[["Transported"]].count().reset_index()
top_last_names = last_name_counts.sort_values(["Transported"], ascending=False).head(10)
fig = px.bar(
    top_last_names,
    x="last_name",
    y="Transported",
    color_discrete_sequence=px.colors.qualitative.Pastel1,
    text_auto=True,
)
fig.show()


In [None]:
# The name columns were only needed for the plots above; remove them before modelling.
df = df.drop(columns=["Name", "first_name", "last_name"])

In [None]:
# List the columns whose dtype is not numeric; these are the ones encoded in the next cell.
df.select_dtypes(exclude=np.number).columns

In [None]:
# Replace every non-numeric training column with its integer category codes,
# printing the code -> category mapping of each column as it is converted.
non_numeric_columns = df.select_dtypes(exclude=np.number).columns
for coluna in non_numeric_columns:
    as_category = df[coluna].astype("category")
    code_to_label = dict(enumerate(as_category.cat.categories))
    print(code_to_label)
    df[coluna] = as_category.cat.codes

In [None]:
# Candidate models and resampling strategies compared by `train_clfs`.
# Every estimator/sampler that accepts a seed is given the same `random_state`
# already used for the train/test split, so repeated runs produce the same
# resampled data and the same fitted models.
RANDOM_STATE = 42

classifiers = {'Logistic Regression' : LogisticRegression(),
               'KNN': KNeighborsClassifier(),
               'Decision Tree': DecisionTreeClassifier(random_state=RANDOM_STATE),
               'Random Forest': RandomForestClassifier(random_state=RANDOM_STATE),
               'AdaBoost': AdaBoostClassifier(random_state=RANDOM_STATE)}
samplers = {'Random_under_sampler': RandomUnderSampler(random_state=RANDOM_STATE),
            'Random_over_sampler': RandomOverSampler(random_state=RANDOM_STATE)}


In [None]:
def df_split(df, target='Transported'):
    """Split a frame into stratified train/test features and labels.

    Remaining missing values are replaced with the sentinel 999 before the split.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the feature columns and the `target` column.
    target : str, default 'Transported'
        Name of the label column. Previously this argument was ignored and
        'Transported' was hard-coded; the default keeps `df_split(df)` behaving
        exactly as before while making the parameter effective.

    Returns
    -------
    tuple
        (x_train, x_test, y_train, y_test) from an 80/20 split stratified on
        the label, with a fixed random_state of 42.
    """
    filled = df.fillna(999)
    x = filled.drop(columns=[target])
    y = filled[target]
    x_train, x_test, y_train, y_test = train_test_split(
        x, y, test_size=0.2, stratify=y, random_state=42
    )
    return x_train, x_test, y_train, y_test

In [None]:
def train_clfs(df, classifiers, samplers):
    """Train and evaluate every sampler/classifier combination.

    For each sampler (outer loop) and classifier (inner loop) a
    sampler -> classifier pipeline is cross-validated on the training split,
    refit on the whole training split, and scored on the held-out test split.
    Progress and scores are printed as they are computed.

    Parameters
    ----------
    df : pandas.DataFrame
        Encoded frame passed to `df_split`.
    classifiers : dict[str, estimator]
        Display name -> unfitted classifier.
    samplers : dict[str, sampler]
        Display name -> unfitted imblearn sampler.

    Returns
    -------
    df_results_test : pandas.DataFrame
        Test-split ROC_AUC, RECALL and ACCURACY, indexed by
        (classifier name, sampler name).
    modelos : list
        Fitted pipelines, in the same order as the rows of `df_results_test`.
    """
    # Imported locally because the notebook's import cell does not provide them.
    from sklearn.base import clone
    from sklearn.model_selection import cross_validate

    x_train, x_test, y_train, y_test = df_split(df)

    names_samplers = []
    names_clfs = []
    results_test_roc_auc = []
    results_test_recall = []
    results_test_accuracy = []
    modelos = []

    for name_sampler, sampler in samplers.items():
        print(f'Sampler: {name_sampler}\n')
        for name_clf, clf in classifiers.items():
            print(f'Classifier: {name_clf}\n')

            # Clone the steps so each stored pipeline owns its own estimator.
            # Reusing the same instances would make pipelines built for
            # different samplers share one classifier object, and the later
            # fit would silently overwrite the earlier model.
            pipeline = Pipeline([('sampler', clone(sampler)),
                                 ('clf', clone(clf))])

            # One cross-validation run yields all three metrics on the same folds.
            cv_scores = cross_validate(pipeline, x_train, y_train, cv=10,
                                       scoring=['roc_auc', 'recall', 'accuracy'])
            cv_auc = cv_scores['test_roc_auc']
            cv_rec = cv_scores['test_recall']
            cv_acc = cv_scores['test_accuracy']

            pipeline.fit(x_train, y_train)
            modelos.append(pipeline)
            y_pred = pipeline.predict(x_test)

            # ROC AUC needs continuous scores; hard labels collapse the curve
            # to a single threshold. Fall back to labels only if the model
            # exposes neither probabilities nor a decision function.
            if hasattr(pipeline, 'predict_proba'):
                y_score = pipeline.predict_proba(x_test)[:, 1]
            elif hasattr(pipeline, 'decision_function'):
                y_score = pipeline.decision_function(x_test)
            else:
                y_score = y_pred

            test_auc = roc_auc_score(y_test, y_score)
            test_rec = recall_score(y_test, y_pred)
            test_acc = accuracy_score(y_test, y_pred)

            names_samplers.append(name_sampler)
            names_clfs.append(name_clf)
            results_test_roc_auc.append(test_auc)
            results_test_recall.append(test_rec)
            results_test_accuracy.append(test_acc)

            print(f'CV\t-\troc_auc:\t{round(cv_auc.mean(), 3)}')
            print(f'CV\t-\trecall:\t\t{round(cv_rec.mean(), 3)}')
            print(f'CV\t-\taccuracy:\t{round(cv_acc.mean(), 3)}')

            print(f'Test\t-\troc_auc:\t{round(test_auc, 3)}')
            print(f'Test\t-\trecall:\t\t{round(test_rec, 3)}')
            print(f'Test\t-\taccuracy:\t{round(test_acc, 3)}')
            print('\n<-------------------------->\n')

    df_results_test = pd.DataFrame(index=[names_clfs, names_samplers], columns=['ROC_AUC', 'RECALL', 'ACCURACY'])
    df_results_test['ROC_AUC'] = results_test_roc_auc
    df_results_test['RECALL'] = results_test_recall
    df_results_test['ACCURACY'] = results_test_accuracy

    return df_results_test, modelos

In [None]:
# Train and evaluate every sampler/classifier combination; keep the test-split
# score table and the list of fitted pipelines.
df_results_test, modelos_pipeline = train_clfs(df, classifiers, samplers)

In [None]:
# Display the fitted pipelines in the order they were trained (sampler outer loop, classifier inner loop).
modelos_pipeline

In [None]:
# Missing values per column in the raw test frame.
df_test.isnull().sum()

In [None]:
# Impute missing numeric test values with the medians of the matching training
# columns, mirroring the median imputation applied to the training frame.
# Non-numeric columns are deliberately left as NaN: filling them with the int 0
# would add a spurious `0` category to string columns and shift their category
# codes away from the training encoding. `.cat.codes` in the encoding step
# below turns the remaining NaN into -1.
numeric_cols = df_test.select_dtypes(include=np.number).columns.intersection(df.columns)
df_test = df_test.fillna(df[numeric_cols].median())

In [None]:
# Re-check missing values per column in the test frame after the fill step.
df_test.isnull().sum()

In [None]:
# Encode the non-numeric test columns with the SAME category -> code mapping
# used for training. Deriving the categories from the test frame itself only
# matches the training codes when both frames happen to contain identical
# category sets, so the mapping is rebuilt from the training file, cleaned the
# same way as the training frame (numeric median fill, then drop incomplete rows).
train_reference = pd.read_csv("/kaggle/input/spaceship-titanic/train.csv")
train_reference = train_reference.fillna(train_reference.median(numeric_only=True)).dropna()

for coluna in df_test.select_dtypes(exclude=np.number).columns:
    if coluna in train_reference.columns:
        categories = train_reference[coluna].astype("category").cat.categories
    else:
        # Column absent from training: fall back to the test set's own categories.
        categories = df_test[coluna].astype("category").cat.categories

    print(dict(enumerate(categories)))

    # Values not seen in training (and missing values) get the code -1.
    df_test[coluna] = pd.Categorical(df_test[coluna], categories=categories).codes

In [None]:
# Pipelines are stored sampler-major (sampler outer loop, classifier inner loop).
# With the current two samplers and five classifiers, position 9 is the last
# combination: Random_over_sampler + AdaBoost.
modelo_final = modelos_pipeline[9]

# Feature columns, in the order the model was trained on.
df4 = df.drop(columns=['Transported'])

teste_s_na = df_test[df4.columns]

# Dropping rows here would leave fewer predictions than submission ids, so
# fail loudly instead of silently misaligning the two.
if teste_s_na.isnull().any().any():
    raise ValueError("df_test still contains missing values in the model's feature columns")
if len(teste_s_na) != len(df_sub):
    raise ValueError("df_test and df_sub have different numbers of rows")

# NOTE: despite the name, these are predicted class labels, not probabilities.
pred_proba = modelo_final.predict(teste_s_na)

In [None]:
# Display the predicted labels (output of `predict`, despite the variable name).
pred_proba

In [None]:
# Pair each passenger id from the sample submission with its predicted label.
submission_columns = {
    "PassengerId": df_sub["PassengerId"],
    "Transported": pred_proba,
}
submission = pd.DataFrame(submission_columns)

In [None]:
# Build the submission frame, cast the predictions to booleans, write the CSV
# to the working directory, and display the result.
submission = pd.DataFrame(
    {"PassengerId": df_sub["PassengerId"], "Transported": pred_proba}
).astype({"Transported": "bool"})
submission.to_csv('submission.csv', index=False)
submission