In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the Kaggle input directory.
for dirname, _, filenames in os.walk('/kaggle/input'):
    full_paths = (os.path.join(dirname, filename) for filename in filenames)
    for full_path in full_paths:
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

In [None]:
# Read the test split; df_test1 is an independent copy that later cells do not modify
# until the predictions are attached to it.
TEST_CSV = '../input/spaceship-titanic/test.csv'
df_test = pd.read_csv(TEST_CSV)
df_test1 = df_test.copy(deep=True)
df_test.head(5)

In [None]:
# Number of missing values per column in the test split, largest first.
test_null_counts = df_test.isna().sum()
test_null_counts.sort_values(ascending=False)

In [None]:
# Read the training split; df_train1 is an independent copy of the frame as loaded.
TRAIN_CSV = '../input/spaceship-titanic/train.csv'
df_train = pd.read_csv(TRAIN_CSV)
df_train1 = df_train.copy(deep=True)
df_train.head(5)

In [None]:
# Number of missing values per column in the training split, largest first.
train_null_counts = df_train.isna().sum()
train_null_counts.sort_values(ascending=False)

In [None]:
# Summary statistics of the training columns that describe() includes by default.
df_train.describe()

In [None]:
# Keep the target column in its own Series; it is removed from df_train further down
# and is needed again when the model is trained.
y = df_train.loc[:, 'Transported']
pd.unique(y)

**Overview of Data**

In [None]:
# Name of the label column and the seed reserved for steps that involve randomness.
Target = 'Transported'
RANDOM_STATE = 12

# PassengerId is not used as a feature. Re-assigning the result (instead of
# inplace=True) together with errors='ignore' keeps this cell safe to re-run:
# a second execution no longer raises KeyError for the already-removed column.
df_train = df_train.drop(columns=['PassengerId'], errors='ignore')
df_test = df_test.drop(columns=['PassengerId'], errors='ignore')

# Every remaining training column except the label.
Features = [col for col in df_train.columns if col != Target]

In [None]:
# describe() of every column but the last one, one row per feature, ordered by
# standard deviation (largest first) and rendered with colour cues.
train_summary = df_train.iloc[:, :-1].describe().T
train_summary = train_summary.sort_values(by='std', ascending=False)
train_summary_styled = train_summary.style.background_gradient(cmap='GnBu')
train_summary_styled = train_summary_styled.bar(subset=["max"], color='teal')
train_summary_styled.bar(subset=["mean"], color='green')

**Null value distribution**

In [None]:
# Missing-value map of the training split: one row per passenger, one column per field.
cmap = sns.cubehelix_palette(light=1, as_cmap=True, reverse=True)
fig, ax = plt.subplots(figsize=(20, 10))
sns.heatmap(df_train.isna(), cmap=cmap, ax=ax)

In [None]:
# Missing-value map of the test split: one row per passenger, one column per field.
cmap = sns.cubehelix_palette(light=1, as_cmap=True, reverse=True)
fig, ax = plt.subplots(figsize=(20, 10))
sns.heatmap(df_test.isna(), cmap=cmap, ax=ax)

In [None]:
# Classify each feature by its number of distinct values across train and test:
# fewer than 25 -> categorical, 25 or more -> continuous. Cabin and Name are
# treated as text and excluded from both groups.
text_features = ['Cabin', 'Name']
df_all = pd.concat([df_train[Features], df_test[Features]], axis=0)
distinct_counts = df_all.nunique()
del df_all  # only the per-column counts are needed from here on

cat_features = []
cont_features = []
for feature in Features:
    if feature in text_features:
        continue
    if distinct_counts[feature] < 25:
        cat_features.append(feature)
    else:
        cont_features.append(feature)

print('Total number of features: ', len(Features))
print('Number of categorical features:', len(cat_features))
print('Number of continuos features:', len(cont_features))
print('Number of text features:', len(text_features))

# Pie chart of how many features fall into each group.
labels = ['Categorical', 'Continuos', "Text"]
values = [len(cat_features), len(cont_features), len(text_features)]
colors = ['#DE3163', 'yellow', 'teal']
explode = (0.1, 0.01, 0.01)
plt.pie(x=values, labels=labels, autopct='%1.1f%%', colors=colors, explode=explode)
plt.show()

**Feature distribution of continuous data**

In [None]:
# Overlay the Age histograms of both splits; the extra "type" column tells
# plotly which split each row came from.
train_age = df_train.assign(type="Train")
test_age = df_test.assign(type="Test")
ageDf = pd.concat([train_age, test_age])

fig = px.histogram(
    data_frame=ageDf,
    x='Age',
    color='type',
    color_discrete_sequence=['teal', 'red'],
    marginal='box',
    nbins=100,
    template="plotly_white",
)
fig.update_layout(title='Distribution of Age', title_x=0.5)
fig.show()

**Target Distribution**
  Observations on the target distribution:

* There are two target values - `False` and `True`.
* Both target values are almost equally distributed.

In [None]:
# Bar chart of how many passengers fall into each target class,
# followed by the same counts as a table.
target_counts = df_train['Transported'].value_counts()
ax = target_counts.plot(kind='bar', color=['cyan', 'pink'])
ax.set_xlabel('Transported')
ax.set_ylabel('Count')
ax.set_title('Target Distribution')
plt.show()
df_train.groupby('Transported')['Transported'].count()

In [None]:
# Share of each target class, computed from the data rather than from
# hand-copied counts so the figures stay correct if the input changes.
x = df_train.shape[0]
class_counts = df_train['Transported'].value_counts()
# .get(..., 0) covers the case where one class is absent from the data.
print('Percentage of Transported = False-->', (class_counts.get(False, 0) / x) * 100, '\n')
print('Percentage of Transported = True -->', (class_counts.get(True, 0) / x) * 100, '\n')

**Correlation matrix**

In [None]:
# Pairwise correlation of the numeric and boolean columns.
# The columns are selected explicitly because DataFrame.corr() in pandas >= 2.0
# no longer skips text columns silently and raises on them instead.
numeric_train = df_train.select_dtypes(include=['number', 'bool'])
fig = px.imshow(numeric_train.corr(), text_auto=True, aspect="auto", color_continuous_scale="viridis")
fig.show()

Judging by the correlation matrix, the data does not show multicollinearity.

**Before handling missing values, we concatenate the two datasets (train and test) into one.**

In [None]:
# Remove the label from the training frame and stack train on top of test so the
# missing-value handling below is applied to both at once.
# errors='ignore' plus re-assignment keeps the cell re-runnable: executing it a
# second time no longer raises KeyError for the already-removed column.
df_train = df_train.drop(columns=['Transported'], errors='ignore')
df_mix = pd.concat([df_train, df_test], ignore_index=True)
print('Shape', df_mix.shape)
df_mix.head()

## Handling missing values

* Age

In [None]:
# Mean age over all passengers in the combined frame.
df_mix.loc[:, 'Age'].mean()

In [None]:
# Mean age per home planet.
df_mix.groupby('HomePlanet')['Age'].mean()

In [None]:
# Fill each missing age with the mean age of that passenger's home planet.
# fillna is given a row-aligned Series here: passing a scalar to fillna on the
# whole column (as a per-row loop would) overwrites every gap with one value and
# ignores the planet entirely.
overall_mean_age = df_mix['Age'].mean()
mean_age_by_planet = df_mix.groupby('HomePlanet')['Age'].mean()
planet_age_for_row = df_mix['HomePlanet'].map(mean_age_by_planet)
df_mix['Age'] = df_mix['Age'].fillna(planet_age_for_row)
# Rows whose HomePlanet is itself missing have no planet mean; use the overall mean.
df_mix['Age'] = df_mix['Age'].fillna(overall_mean_age)

In [None]:
# Missing-value map of the combined frame at this point in the notebook.
cmap = sns.cubehelix_palette(light=1, as_cmap=True, reverse=True)
fig, ax = plt.subplots(figsize=(20, 10))
sns.heatmap(df_mix.isna(), cmap=cmap, ax=ax)

* VIP

In [None]:
# Sub-frame used to study VIP status, and the mean spend per amenity for each VIP value.
vip_spend_cols = ['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']
df_vip = df_mix[['VIP', 'CryoSleep', 'Cabin'] + vip_spend_cols]
df_vip.groupby('VIP')[vip_spend_cols].mean()

In [None]:
# Rows where at least one amenity spend is below its threshold
# (a row qualifies as soon as any single comparison holds).
under_any_threshold = (
    df_vip['RoomService'].lt(486)
    | df_vip['FoodCourt'].lt(1794)
    | df_vip['ShoppingMall'].lt(273)
    | df_vip['Spa'].lt(932)
    | df_vip['VRDeck'].lt(1207)
)
m = df_vip.loc[under_any_threshold]

In [None]:
# Count of rows in the selection above whose VIP flag is missing.
# Author's note: this came to 296, and those passengers are treated as non-VIP.
m['VIP'].isna().sum()

In [None]:
# Number of missing VIP flags in the combined frame.
df_mix['VIP'].isna().sum()

In [None]:
# Treat passengers with an unknown VIP flag as non-VIP.
# The fill value is the boolean False, not the string 'False': the column holds real
# booleans, and the later {True: 1, False: 0} mapping turns a string into NaN.
df_mix['VIP'] = df_mix['VIP'].fillna(False)

# Missing-value map after the VIP fill.
plt.figure(figsize=(20, 10))
cmap = sns.cubehelix_palette(light=1, as_cmap=True, reverse=True)
sns.heatmap(df_mix.isnull(), cmap=cmap)

* Missing values for RoomService, Spa, ShoppingMall, FoodCourt, VRDeck

In [None]:
# Number of missing RoomService values in the combined frame.
df_mix['RoomService'].isna().sum()

In [None]:
# Mean spend per amenity for each home planet.
amenity_cols = ['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']
df_mix.groupby(['HomePlanet'])[amenity_cols].mean()

In [None]:
# Fill each missing amenity spend with the mean spend of that passenger's home planet.
# The planet means are looked up per row; calling fillna with a scalar on the whole
# column inside a row loop would fill every gap on the first iteration with the
# values of whichever planet the first row belongs to.
spend_cols = ['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']
spend_mean_by_planet = df_mix.groupby('HomePlanet')[spend_cols].mean()
for spend_col in spend_cols:
    overall_mean = df_mix[spend_col].mean()
    planet_mean_for_row = df_mix['HomePlanet'].map(spend_mean_by_planet[spend_col])
    df_mix[spend_col] = (
        df_mix[spend_col]
        .fillna(planet_mean_for_row)
        # Rows with a missing HomePlanet have no planet mean; use the overall mean.
        .fillna(overall_mean)
    )

In [None]:
# Missing-value map of the combined frame at this point in the notebook.
cmap = sns.cubehelix_palette(light=1, as_cmap=True, reverse=True)
fig, ax = plt.subplots(figsize=(20, 10))
sns.heatmap(df_mix.isna(), cmap=cmap, ax=ax)

**We will drop Name since it will not help us decide whether a passenger is transported or not.**

In [None]:
# Remove the Name column. errors='ignore' plus re-assignment keeps the cell
# re-runnable: a second execution no longer raises KeyError.
df_mix = df_mix.drop(columns=['Name'], errors='ignore')

* Missing values for CryoSleep

In [None]:
# Author's assumption: a passenger in cryosleep does not use room service.
# So a missing CryoSleep flag is set to True only on rows whose own RoomService
# spend is zero. The value is the boolean True (not the string 'True') so the later
# {True: 1, False: 0} mapping recognises it.
cryo_missing = df_mix['CryoSleep'].isna()
no_room_service = df_mix['RoomService'] == 0
df_mix.loc[cryo_missing & no_room_service, 'CryoSleep'] = True
# Gaps on rows with non-zero RoomService are left untouched here; the mode-based
# fill of object columns further down takes care of them.

# Missing-value map after the CryoSleep fill.
plt.figure(figsize=(20, 10))
cmap = sns.cubehelix_palette(light=1, as_cmap=True, reverse=True)
sns.heatmap(df_mix.isnull(), cmap=cmap)

In [None]:
# Columns stored with dtype object, each printed with its number of distinct values.
column_dtypes = df_mix.dtypes
only_object = column_dtypes[column_dtypes == 'object']
for i in only_object.index:
    print(i, df_mix[i].nunique())

In [None]:
#filling the object value with mode
for i in only_object.index :
    print(i ,df_mix[i].mode()[0])
    df_mix[i] = df_mix[i].fillna(df_mix[i].mode()[0])

In [None]:
# Remaining missing values per column in the combined frame.
df_mix.isna().sum()

**All missing values have been filled**

In [None]:
from lightgbm import LGBMClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score
from tqdm import tqdm
import time
import warnings
# Suppress every Python warning from here on (no category or module filter is given).
warnings.filterwarnings('ignore')

In [None]:
# Cabin is not used as a feature. errors='ignore' plus re-assignment keeps the
# cell re-runnable.
df_mix = df_mix.drop(columns=['Cabin'], errors='ignore')

# Convert the boolean flags to 0/1.
# Imputed values may arrive as the strings 'True'/'False' rather than real booleans;
# mapping only the bool keys would turn each of those rows into NaN, so both
# spellings are accepted. Anything else (including NaN) still maps to NaN.
BOOL_CODES = {True: 1, False: 0, 'True': 1, 'False': 0}
cat_bool = ['CryoSleep', 'VIP']
for i in cat_bool:
    df_mix[i] = df_mix[i].map(lambda value: BOOL_CODES.get(value, np.nan))

In [None]:
# Column dtypes of the combined frame after the boolean flags were mapped.
df_mix.dtypes

In [None]:
# changing all object data types into int64 format
cat_val = ['HomePlanet' , 'Destination','CryoSleep' , 'VIP']
from sklearn.preprocessing import LabelEncoder
for i in cat_val:
    le = LabelEncoder()
    df_mix[i] = le.fit_transform(df_mix[i])
    

    
    
    
print(df_mix.dtypes)
df_mix.head(20)

In [None]:
# Again seperating the two dataset
print(len(df_train1))
train = df_mix.iloc[:len(df_train1) , :]
test = df_mix.iloc[len(df_train1): , :]

In [None]:
# Shapes of df_train and df_test (not of the re-split `train`/`test` frames).
print(df_train.shape,df_test.shape)

In [None]:
%%time
from lightgbm import LGBMClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score
from tqdm import tqdm
lgb_params = {
    'objective' : 'binary',
    'n_estimators' :50,
    'learning_rate' : 0.08
}
lgb_predictions = 0
lgbm = LGBMClassifier(**lgb_params)
skf = StratifiedKFold(n_splits = 5 , shuffle = True)
for fold , (train_idx , valid_idx) in tqdm(enumerate(skf.split(train , y))):
    print(10*"=", f"Fold={fold+1}", 10*"=")
    X_train, X_valid = train.iloc[train_idx] , train.iloc[valid_idx]
    y_train , y_valid = y.iloc[train_idx] , y.iloc[valid_idx]
    model = lgbm.fit(X_train , y_train)
    preds_valid = model.predict(X_valid)
    acc = accuracy_score(y_valid,  preds_valid)
    print(f"Fold={fold+1}, Accuracy score: {acc:.2f}%")

In [None]:
# Predicted target labels for the test rows, using the fitted `lgbm` classifier.
prediction = lgbm.predict(test)

In [None]:
# Attach the predictions to the untouched copy of the test split, which still has PassengerId.
df_test1 = df_test1.assign(Transported=prediction)
df_test1.head(5)

In [None]:
# Two-column frame with the passenger id and its predicted label.
submission_columns = ['PassengerId', 'Transported']
submission = df_test1[submission_columns]
submission