In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Recursively print the full path of every file under the Kaggle input mount so
# the dataset locations are visible in the cell output. os.walk yields nothing
# (and raises no error) when the directory does not exist.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
!pip install pycaret


In [None]:
import gc 
# Run a full garbage-collection pass; the value shown as the cell output is the
# number of unreachable objects the collector found.
gc.collect()

In [None]:
import pandas as pd
from pycaret.classification import *

In [None]:
def create_group_features(train_data):
    """Collapse passenger rows into one row of aggregate features per group.

    The input frame is only read; a new frame is built and returned.

    Parameters
    ----------
    train_data : pandas.DataFrame
        Passenger-level rows containing ``PassengerGroup``, ``PassengerNo``,
        ``VIP``, ``CryoSleep``, ``Cabin``, ``Deck``, ``Side``, ``Age``,
        ``RoomService``, ``FoodCourt``, ``ShoppingMall``, ``Spa``, ``VRDeck``,
        ``TotalSpend`` and ``HomePlanet``.

    Returns
    -------
    pandas.DataFrame
        One row per ``PassengerGroup``. Every other column is prefixed with
        ``Group``: ``GroupCount`` (distinct ``PassengerNo``), ``GroupVIP`` and
        ``GroupCryoSleep`` (number of ``True`` values), distinct counts of
        cabin/deck/side/home planet, mean age and mean spends, plus
        ``GroupPct<amenity>`` = mean amenity spend / mean total spend, with
        missing ratios (e.g. 0/0) replaced by 0.
    """
    spend_cols = ['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']

    def count_true(flags):
        # `== True` (not truthiness) so that missing values count as not-True.
        return (flags == True).sum()

    # Insertion order here fixes the column order of the aggregated frame.
    aggregations = {
        'PassengerNo': 'nunique',
        'VIP': count_true,
        'CryoSleep': count_true,
        'Cabin': 'nunique',
        'Deck': 'nunique',
        'Side': 'nunique',
        'Age': 'mean',
    }
    for spend_col in spend_cols:
        aggregations[spend_col] = 'mean'
    aggregations['TotalSpend'] = 'mean'
    aggregations['HomePlanet'] = 'nunique'

    grouped = train_data.groupby('PassengerGroup', as_index=False).agg(aggregations)
    grouped = grouped.rename(columns={'PassengerNo': 'Count'})

    # Share of the group's mean total spend that went to each amenity.
    for spend_col in spend_cols:
        share = grouped[spend_col] / grouped['TotalSpend']
        grouped[f'Pct{spend_col}'] = share.fillna(0)

    # Prefix everything except the join key so the names cannot clash with
    # the passenger-level columns after a merge.
    grouped.columns = [
        name if name == 'PassengerGroup' else f'Group{name}'
        for name in grouped.columns
    ]

    return grouped

In [None]:
def create_features(train_data):
    """Build passenger-level and group-level features from a raw passenger frame.

    The input is copied first, so the caller's DataFrame is left untouched.

    Steps, in order:

    * ``VIP`` / ``CryoSleep`` become plain booleans; missing values become
      ``False``.
    * ``PassengerId`` is split on ``'_'`` into ``PassengerGroup`` (first part)
      and ``PassengerNo`` (second part).
    * Missing ``Cabin`` becomes the placeholder ``'None/None/None'``.
    * Missing amenity spends become 0; ``TotalSpend`` is their sum and
      ``Pct<amenity>`` is each amenity's share of it (0 when nothing was spent).
    * Missing ``Age`` is filled with the median age of the row's ``HomePlanet``.
    * ``Deck`` and ``Side`` are the first and third ``'/'``-separated parts of
      ``Cabin``.
    * The per-group aggregates from ``create_group_features`` are left-joined
      on ``PassengerGroup``.

    Parameters
    ----------
    train_data : pandas.DataFrame
        Raw rows with at least ``PassengerId``, ``VIP``, ``CryoSleep``,
        ``Cabin``, ``Age``, ``HomePlanet``, ``RoomService``, ``FoodCourt``,
        ``ShoppingMall``, ``Spa`` and ``VRDeck``.

    Returns
    -------
    pandas.DataFrame
        A new frame: the input columns, the engineered passenger columns and
        the ``Group*`` columns.
    """
    # Copy so that featurising a frame never alters the caller's object.
    train_data = train_data.copy()

    # Casting NaN straight to bool yields True, which would mark every passenger
    # with a missing flag as VIP / in cryosleep. Go through the nullable boolean
    # dtype so missing values can be filled with False before the final cast.
    for flag_col in ['VIP', 'CryoSleep']:
        train_data[flag_col] = (
            train_data[flag_col].astype('boolean').fillna(False).astype(bool)
        )

    id_parts = train_data['PassengerId'].str.split('_')
    train_data['PassengerGroup'] = id_parts.str[0]
    train_data['PassengerNo'] = id_parts.str[1]

    # Placeholder keeps the three-part "deck/num/side" shape for missing cabins.
    train_data['Cabin'] = train_data['Cabin'].fillna('None/None/None')

    spend_cols = ['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']
    train_data[spend_cols] = train_data[spend_cols].fillna(0)
    train_data['TotalSpend'] = train_data[spend_cols].sum(axis=1)
    for spend_col in spend_cols:
        # 0/0 for passengers who spent nothing gives NaN; report that as a 0 share.
        share = train_data[spend_col] / train_data['TotalSpend']
        train_data[f'Pct{spend_col}'] = share.fillna(0)

    # Rows whose HomePlanet is missing belong to no group, so their Age stays
    # NaN here (the modelling step performs its own imputation).
    planet_median_age = train_data.groupby('HomePlanet')['Age'].transform('median')
    train_data['Age'] = train_data['Age'].fillna(planet_median_age)

    # `.str[i]` yields NaN instead of raising when a cabin has fewer parts.
    cabin_parts = train_data['Cabin'].astype(str).str.split('/')
    train_data['Deck'] = cabin_parts.str[0]
    train_data['Side'] = cabin_parts.str[2]

    df_group_features = create_group_features(train_data)

    return pd.merge(train_data, df_group_features, on='PassengerGroup', how='left')


In [None]:
# Load the competition's training and test CSVs from the Kaggle input mount.
train=pd.read_csv('/kaggle/input/spaceship-titanic/train.csv')
test=pd.read_csv('/kaggle/input/spaceship-titanic/test.csv')

In [None]:
# Engineer features for both splits: `data` (from train) feeds the PyCaret setup
# below and `data1` (from test) is what gets scored for the submission.
data=create_features(train)
data1=create_features(test)
# Bare expression: renders the engineered training frame as the cell output.
data

In [None]:
# Summary statistics for every column; include='all' also covers the
# non-numeric columns, not just the numeric ones.
data.describe(include='all')


In [None]:
# NOTE(review): no cell in this notebook imports pandas_profiling, so this
# install looks unused — confirm before keeping it.
pip install -U pandas-profiling


In [None]:
import missingno
# Plot missingno's matrix view of `data` to show where values are missing.
missingno.matrix(data)

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns 
# Bar chart of GroupSide against GroupDeck, split by HomePlanet.
# `factorplot` was renamed to `catplot`, and its `size` argument to `height`,
# in seaborn 0.9; the old spellings are deprecated and absent from newer releases.
sw = sns.catplot(x = 'GroupDeck', y = 'GroupSide', hue = 'HomePlanet', data = data, kind = 'bar', palette='Set2', height=7.5)
sw.despine(left = False)
plt.ylabel('cabin side ')

In [None]:
# Count of rows per GroupVIP value (GroupVIP is the number of VIP passengers in
# the row's passenger group, as computed by create_group_features).
sns.countplot(x='GroupVIP',data=data,palette="mako")


In [None]:
def reduce_mem_usage(data=None, verbose=True):
    """Downcast numeric columns of ``data`` in place to narrower dtypes.

    Only columns whose dtype is one of int16/int32/int64/float16/float32/float64
    are touched. Integer columns are cast to the narrowest of int8, int16,
    int32 or int64 whose range contains the column's min and max. Float columns
    are cast to float16 or float32 when min and max lie strictly inside that
    type's finite range, otherwise to float64.

    NOTE(review): the float narrowing checks range only, not precision, so
    values stored as float16/float32 may be rounded.

    NOTE: this notebook defines ``reduce_mem_usage`` a second time further
    down; when cells run top to bottom, that later definition replaces this one.

    Parameters
    ----------
    data : pandas.DataFrame, optional
        Frame to shrink (modified in place). When omitted, the module-level
        variable ``data`` is looked up at call time rather than being frozen
        when the function is defined.
    verbose : bool, default True
        Print the memory usage after optimisation and the percentage saved.

    Returns
    -------
    pandas.DataFrame
        The same ``data`` object, after downcasting.

    Raises
    ------
    ValueError
        If ``data`` is omitted and no module-level ``data`` exists.
    """
    if data is None:
        # Keeps no-argument calls working without binding a stale object at
        # definition time.
        data = globals().get('data')
        if data is None:
            raise ValueError("reduce_mem_usage() needs a DataFrame: pass `data` explicitly")

    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    start_mem = data.memory_usage().sum() / 1024**2

    for col in data.columns:
        col_type = data[col].dtype
        if col_type not in numerics:
            continue

        c_min = data[col].min()
        c_max = data[col].max()

        # Comparing the first three characters is what separates 'int64' from
        # 'float64'; a longer slice never equals 'int' and would send every
        # integer column down the float branch.
        if str(col_type)[:3] == 'int':
            for candidate in (np.int8, np.int16, np.int32, np.int64):
                bounds = np.iinfo(candidate)
                # Inclusive bounds: a column holding exactly -128..127 fits int8.
                if bounds.min <= c_min and c_max <= bounds.max:
                    data[col] = data[col].astype(candidate)
                    break
        else:
            for candidate in (np.float16, np.float32):
                bounds = np.finfo(candidate)
                if c_min > bounds.min and c_max < bounds.max:
                    data[col] = data[col].astype(candidate)
                    break
            else:
                # Also reached when min/max are NaN (empty or all-missing column).
                data[col] = data[col].astype(np.float64)

    end_mem = data.memory_usage().sum() / 1024**2
    if verbose:
        saved_pct = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
        print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
        print('Decreased by {:.1f}%'.format(saved_pct))

    return data


In [None]:
import gc
# Run a full garbage-collection pass to release unreferenced objects.
gc.collect()

In [None]:
# data.isnull().sum()

In [None]:
# Per-column dtypes of `data`; the next cell selects float64 and int64 columns
# from it as the numeric features.
data.dtypes


In [None]:
# Fixed seed passed to PyCaret as `session_id` so the train/hold-out split, the
# CV folds and model training give the same results on every run.
SEED = 42

# Every float64 and int64 column is declared numeric explicitly.
num_cols = list(data.select_dtypes('float64').columns) + list(data.select_dtypes('int64').columns) 
clf=setup(data=data,target='Transported',imputation_type='iterative',train_size = 0.99,
          fold_strategy = 'stratifiedkfold',
          fold = 5,
          fold_shuffle = True,
          numeric_features = num_cols,
          ignore_low_variance=True,
          remove_multicollinearity = True,
          normalize = True,
          normalize_method = 'robust',
          data_split_stratify = True,
          session_id = SEED,
          ignore_features = ['PassengerNo', 'Name', 'PassengerId', 'PassengerGroup', 'Cabin'],
          silent = True)
 #use_gpu=True

In [None]:
# Number of missing values in each column of `data`.
data.isnull().sum()


In [None]:
# Per-column dtypes of `data`, displayed again after the PyCaret setup cell.
data.dtypes


In [None]:
import gc 
# Run a full garbage-collection pass to release unreferenced objects.
gc.collect()

In [None]:
# Column overview with memory footprint; memory_usage='deep' makes pandas
# measure the actual contents of object columns instead of estimating.
data.info(memory_usage='deep')


In [None]:
def reduce_mem_usage(data):
    """Shrink a DataFrame in place by narrowing column dtypes.

    * Signed-integer columns are cast to the narrowest of int8, int16, int32
      or int64 whose range contains the column's min and max.
    * Float columns are cast to float16 or float32 when min and max lie
      strictly inside that type's finite range, otherwise to float64.
    * Object and pandas string columns are cast to ``category``.
    * Every other dtype (bool, category, datetime, nullable extension types,
      unsigned integers, ...) is left as it is, which also makes the function
      safe to call again on an already-reduced frame.

    NOTE(review): the float narrowing checks range only, not precision, so
    values stored as float16/float32 may be rounded.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to shrink; its columns are replaced in place.

    Returns
    -------
    pandas.DataFrame
        The same ``data`` object, after conversion. Memory usage before and
        after, and the percentage saved, are printed.
    """
    start_mem = data.memory_usage().sum() / 1024**2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))

    for col in data.columns:
        col_type = data[col].dtype
        is_numpy_dtype = isinstance(col_type, np.dtype)

        if is_numpy_dtype and col_type.kind == 'i':
            # Dispatch on the dtype kind: a string-prefix test such as
            # str(col_type)[:14] == 'int' can never match 'int64', which would
            # silently turn integer columns into floats.
            c_min = data[col].min()
            c_max = data[col].max()
            for candidate in (np.int8, np.int16, np.int32, np.int64):
                bounds = np.iinfo(candidate)
                if bounds.min <= c_min and c_max <= bounds.max:
                    data[col] = data[col].astype(candidate)
                    break
        elif is_numpy_dtype and col_type.kind == 'f':
            c_min = data[col].min()
            c_max = data[col].max()
            for candidate in (np.float16, np.float32):
                bounds = np.finfo(candidate)
                if c_min > bounds.min and c_max < bounds.max:
                    data[col] = data[col].astype(candidate)
                    break
            else:
                # Also reached when min/max are NaN (empty or all-missing column).
                data[col] = data[col].astype(np.float64)
        elif col_type == object or isinstance(col_type, pd.StringDtype):
            data[col] = data[col].astype('category')
        # Anything else (e.g. bool) already has a compact representation;
        # pushing it through the float branch would change its type.

    end_mem = data.memory_usage().sum() / 1024**2
    saved_pct = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    print('Decreased by {:.1f}%'.format(saved_pct))

    return data
import pandas as pd
import numpy as np
import gc
# Reload the raw training CSV and downcast it. This rebinds `data`: from here on
# `data` holds the raw file's columns, not the frame returned by
# create_features(train) that was passed to setup() earlier.
data  = pd.read_csv('/kaggle/input/spaceship-titanic/train.csv')
data= reduce_mem_usage(data)
gc.collect()

In [None]:
# Missing values per column. When cells run top to bottom, `data` is by now the
# reloaded raw training file rather than the engineered frame.
data.isnull().sum()


In [None]:
# Presumably PyCaret's models() from the star import: displays the estimators
# available to the experiment — TODO confirm.
models()


In [None]:
# Cross-validate only the three listed estimators and keep the top performers.
# NOTE(review): n_select=4 asks for four models but only three are included, so
# `best_model` is presumably a list of at most three models — confirm.
best_model = compare_models(n_select = 4,include =['catboost','lightgbm','gbc'])    
#The Kappa statistic (or value) is a metric that compares an Observed Accuracy with an Expected Accuracy (random chance).

In [None]:
# Train a CatBoost model, then search hyperparameters for 10 iterations;
# choose_better=True asks PyCaret to hand back the untuned model if tuning does
# not improve on it.
catboost = tune_model(create_model('catboost'), choose_better = True, n_iter = 10)


In [None]:
# Score the engineered test frame with the tuned model. The submission cell
# reads the predictions from the `Label` column of the returned frame.
pipe=predict_model(catboost,data1)
# Bare expression: renders the prediction frame as the cell output.
pipe

In [None]:
# Load the competition's sample submission file.
# NOTE(review): `submit_1` is not referenced by any later cell in this notebook.
submit_1=pd.read_csv('/kaggle/input/spaceship-titanic/sample_submission.csv')


In [None]:
# Keep the passenger id and the predicted label, rename the label to the
# competition's target column, and write the submission file without the index.
df_sub = pipe[['PassengerId', 'Label']].rename(columns={'Label': 'Transported'})
df_sub.to_csv('submission.csv', index=False)

In [None]:
# Display the PassengerId column of the submission frame as a quick sanity check.
df_sub.PassengerId
