In [1]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import FunctionTransformer
from sklearn.pipeline import Pipeline, make_pipeline
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression

In [2]:
# Load the labelled training data; 'train.csv' is resolved relative to the
# notebook's working directory.
df = pd.read_csv('train.csv')

In [3]:
# Number of missing entries in each column of the raw training frame.
df.isna().sum()

PassengerId       0
HomePlanet      201
CryoSleep       217
Cabin           199
Destination     182
Age             179
VIP             203
RoomService     181
FoodCourt       183
ShoppingMall    208
Spa             183
VRDeck          188
Name            200
Transported       0
dtype: int64

In [4]:
# Column dtypes and non-null counts for the raw training frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8693 entries, 0 to 8692
Data columns (total 14 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   PassengerId   8693 non-null   object 
 1   HomePlanet    8492 non-null   object 
 2   CryoSleep     8476 non-null   object 
 3   Cabin         8494 non-null   object 
 4   Destination   8511 non-null   object 
 5   Age           8514 non-null   float64
 6   VIP           8490 non-null   object 
 7   RoomService   8512 non-null   float64
 8   FoodCourt     8510 non-null   float64
 9   ShoppingMall  8485 non-null   float64
 10  Spa           8510 non-null   float64
 11  VRDeck        8505 non-null   float64
 12  Name          8493 non-null   object 
 13  Transported   8693 non-null   bool   
dtypes: bool(1), float64(6), object(7)
memory usage: 891.5+ KB


In [5]:
# Preview the first five rows.
df.head()

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,False
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,False
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines,True


In [6]:
# Work on a copy so the raw frame `df` stays available for later cells.
df1 = df.copy()

In [7]:
# Feature frame = training data without the target column.
# Derived from `df` rather than from `df1` itself so the cell can be re-run
# safely: dropping 'Transported' from a frame that has already lost it raises
# KeyError.
df1 = df.drop(columns=['Transported'])

In [8]:
# Numeric columns; their missing values are imputed with the column mean.
numerical1 = [
    'Age',
    'RoomService',
    'FoodCourt',
    'ShoppingMall',
    'Spa',
    'VRDeck',
]

# Categorical columns; their missing values are imputed with the most
# frequent value.
categorical1 = [
    'HomePlanet',
    'CryoSleep',
    'Destination',
    'VIP',
]

In [9]:
# Mean of every numeric column, in the same order as `numerical1`; these are
# the replacement values for missing numeric entries.
num_fill = [
    df1[name].mean()
    for name in numerical1
]

In [10]:
# Inspect the numeric fill values (ordered like `numerical1`).
num_fill

[28.82793046746535,
 224.687617481203,
 458.07720329024676,
 173.72916912197996,
 311.1387779083431,
 304.8547912992357]

In [11]:
# Most frequent value of every categorical column, in the same order as
# `categorical1`. value_counts() already returns counts in descending order
# (and ignores NaN), so the first index entry is the mode; no extra sort is
# needed.
cat1_fill = [df1[col].value_counts().index[0] for col in categorical1]

In [12]:
# Inspect the categorical fill values (ordered like `categorical1`).
cat1_fill

['Earth', False, 'TRAPPIST-1e', False]

In [13]:
def fill_missing_values(data, fill_values=None, cabin_fill='F/82/S'):
    """Return a copy of ``data`` with missing values imputed.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to impute. It must contain a ``Cabin`` column and every column
        named in ``fill_values``. The input frame is left untouched.
    fill_values : dict, optional
        Mapping of column name -> replacement value. When omitted it is built
        from the notebook-level lists: ``numerical1`` paired with ``num_fill``
        and ``categorical1`` paired with ``cat1_fill``.
    cabin_fill : str, optional
        Replacement for missing ``Cabin`` entries (default ``'F/82/S'``).

    Returns
    -------
    pandas.DataFrame
        A new frame with the listed columns and ``Cabin`` imputed.
    """
    if fill_values is None:
        fill_values = {
            **dict(zip(numerical1, num_fill)),
            **dict(zip(categorical1, cat1_fill)),
        }

    # Copy first: the caller's frame must not change as a side effect.
    filled = data.copy()

    # Assign the result back instead of `data[col].fillna(..., inplace=True)`;
    # the chained in-place form is deprecated and does not update the frame
    # when pandas Copy-on-Write is active.
    for col, fill_val in fill_values.items():
        filled[col] = filled[col].fillna(fill_val)

    filled['Cabin'] = filled['Cabin'].fillna(cabin_fill)
    return filled

In [14]:
# Wrap the imputation function so it can be used as a pipeline step.
ft1 = FunctionTransformer(fill_missing_values)

In [15]:
# Apply the imputation step to the training features.
df1 = ft1.fit_transform(df1)

In [16]:
# Re-check missing values after imputation; Name is not imputed.
df1.isna().sum()

PassengerId       0
HomePlanet        0
CryoSleep         0
Cabin             0
Destination       0
Age               0
VIP               0
RoomService       0
FoodCourt         0
ShoppingMall      0
Spa               0
VRDeck            0
Name            200
dtype: int64

In [17]:
def feat_engineering(data):
    """Derive cabin and group features and drop the raw identifier columns.

    New columns:

    * ``Cabin1`` - first character of ``Cabin``
    * ``Cabin2`` - last character of ``Cabin``
    * ``Co_passengers`` - last two characters of ``PassengerId``

    ``Cabin``, ``PassengerId`` and ``Name`` are removed from the result.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing ``Cabin``, ``PassengerId`` and ``Name``. The input
        frame is left untouched.

    Returns
    -------
    pandas.DataFrame
        A new frame with the derived columns appended and the raw columns
        dropped.
    """
    # Copy first: the caller still needs its own PassengerId/Cabin/Name.
    engineered = data.copy()

    # The .str accessor propagates missing values as NaN, whereas
    # `apply(lambda x: x[0])` raises on a non-string (NaN) entry.
    cabin = engineered['Cabin']
    engineered['Cabin1'] = cabin.str[0]
    engineered['Cabin2'] = cabin.str[-1]
    engineered['Co_passengers'] = engineered['PassengerId'].str[-2:]

    return engineered.drop(columns=['Cabin', 'PassengerId', 'Name'])

In [18]:
# Wrap the feature-engineering function so it can be used as a pipeline step.
ft2 = FunctionTransformer(feat_engineering)

In [19]:
# Add Cabin1 / Cabin2 / Co_passengers and drop Cabin, PassengerId and Name.
df1 = ft2.fit_transform(df1)

In [20]:
# Inspect the frame after feature engineering.
df1

Unnamed: 0,HomePlanet,CryoSleep,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Cabin1,Cabin2,Co_passengers
0,Europa,False,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,B,P,01
1,Earth,False,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,F,S,01
2,Europa,False,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,A,S,01
3,Europa,False,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,A,S,02
4,Earth,False,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,F,S,01
...,...,...,...,...,...,...,...,...,...,...,...,...,...
8688,Europa,False,55 Cancri e,41.0,True,0.0,6819.0,0.0,1643.0,74.0,A,P,01
8689,Earth,True,PSO J318.5-22,18.0,False,0.0,0.0,0.0,0.0,0.0,G,S,01
8690,Earth,False,TRAPPIST-1e,26.0,False,0.0,0.0,1872.0,1.0,0.0,G,S,01
8691,Europa,False,55 Cancri e,32.0,False,0.0,1049.0,0.0,353.0,3235.0,E,S,01


In [21]:
# Run the same imputation and feature-engineering steps on a copy of the
# training data that still contains 'Transported', so target rates can be
# computed per category in the following cells.
df2 = ft2.fit_transform(ft1.fit_transform(df.copy()))

In [22]:
# Mean of 'Transported' for each Cabin1 value, as {Cabin1 value: rate}.
# NOTE(review): df2 is built from the full training frame, so these rates use
# the targets of rows that the 5-fold grid search later holds out; the
# cross-validation scores may therefore be optimistic.
dict1 = df2.groupby('Cabin1')['Transported'].mean().to_dict()

In [23]:
# Mean of 'Transported' for each Co_passengers value, as {value: rate}.
# NOTE(review): computed on the full training frame, so it also uses targets
# of rows later held out by the 5-fold grid search.
dict2 = df2.groupby('Co_passengers')['Transported'].mean().to_dict()

In [24]:
def mean_encoding(data, encodings=None):
    """Replace category labels with pre-computed target means.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing every column named in ``encodings``. The input frame
        is left untouched.
    encodings : dict, optional
        Mapping of column name -> {category: encoded value}. When omitted, the
        notebook-level ``dict1`` is used for ``Cabin1`` and ``dict2`` for
        ``Co_passengers``.

    Returns
    -------
    pandas.DataFrame
        A new frame with the listed columns mapped. A category that is absent
        from its mapping becomes NaN.
    """
    if encodings is None:
        encodings = {'Cabin1': dict1, 'Co_passengers': dict2}

    # Copy first so the caller's label columns are not overwritten.
    encoded = data.copy()
    for col, mapping in encodings.items():
        encoded[col] = encoded[col].map(mapping)
    return encoded

In [25]:
# Wrap the mean-encoding function so it can be used as a pipeline step.
ft3= FunctionTransformer(mean_encoding)

In [26]:
# Replace the Cabin1 / Co_passengers labels with their target rates.
# NOTE: run this cell once per kernel session; running it again would map the
# already-encoded numbers through dict1/dict2, whose keys are the original
# labels.
df1 = ft3.fit_transform(df1)

In [27]:
# Inspect the frame after mean encoding.
df1

Unnamed: 0,HomePlanet,CryoSleep,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Cabin1,Cabin2,Co_passengers
0,Europa,False,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,0.734275,P,0.475953
1,Earth,False,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,0.444036,S,0.475953
2,Europa,False,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,0.496094,S,0.475953
3,Europa,False,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,0.496094,S,0.558782
4,Earth,False,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,0.444036,S,0.475953
...,...,...,...,...,...,...,...,...,...,...,...,...,...
8688,Europa,False,55 Cancri e,41.0,True,0.0,6819.0,0.0,1643.0,74.0,0.496094,P,0.475953
8689,Earth,True,PSO J318.5-22,18.0,False,0.0,0.0,0.0,0.0,0.0,0.516217,S,0.475953
8690,Earth,False,TRAPPIST-1e,26.0,False,0.0,0.0,1872.0,1.0,0.0,0.516217,S,0.475953
8691,Europa,False,55 Cancri e,32.0,False,0.0,1049.0,0.0,353.0,3235.0,0.357306,S,0.475953


In [28]:
# Columns handed to the one-hot encoding step.
categorical2 = [
    'HomePlanet',
    'CryoSleep',
    'Destination',
    'VIP',
    'Cabin2',
]

In [29]:
def one_hot_encoding(data, columns=None):
    """Replace the categorical columns with their ``pd.get_dummies`` encoding.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing every column in ``columns``. The input frame is left
        untouched.
    columns : list of str, optional
        Columns to pass through ``pd.get_dummies``. Defaults to the
        notebook-level ``categorical2`` list.

    Returns
    -------
    pandas.DataFrame
        The remaining columns of ``data`` followed by the encoded columns
        (first level of each encoded column dropped).

    Notes
    -----
    ``pd.get_dummies`` derives the output columns from the categories present
    in ``data``, so two frames with different category sets produce different
    columns.
    """
    if columns is None:
        columns = categorical2
    columns = list(columns)

    dummies = pd.get_dummies(data[columns], drop_first=True)
    # drop() returns a new frame, so the caller's frame keeps its columns.
    remaining = data.drop(columns=columns)
    return pd.concat([remaining, dummies], axis=1)

In [30]:
# Wrap the one-hot encoding function so it can be used as a pipeline step.
ft4  = FunctionTransformer(one_hot_encoding)

In [31]:
# One-hot encode the columns listed in `categorical2`.
df1 = ft4.fit_transform(df1)

In [32]:
# Inspect the fully preprocessed feature frame.
df1

Unnamed: 0,Age,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Cabin1,Co_passengers,CryoSleep,VIP,HomePlanet_Europa,HomePlanet_Mars,Destination_PSO J318.5-22,Destination_TRAPPIST-1e,Cabin2_S
0,39.0,0.0,0.0,0.0,0.0,0.0,0.734275,0.475953,False,False,1,0,0,1,0
1,24.0,109.0,9.0,25.0,549.0,44.0,0.444036,0.475953,False,False,0,0,0,1,1
2,58.0,43.0,3576.0,0.0,6715.0,49.0,0.496094,0.475953,False,True,1,0,0,1,1
3,33.0,0.0,1283.0,371.0,3329.0,193.0,0.496094,0.558782,False,False,1,0,0,1,1
4,16.0,303.0,70.0,151.0,565.0,2.0,0.444036,0.475953,False,False,0,0,0,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8688,41.0,0.0,6819.0,0.0,1643.0,74.0,0.496094,0.475953,False,True,1,0,0,0,0
8689,18.0,0.0,0.0,0.0,0.0,0.0,0.516217,0.475953,True,False,0,0,1,0,1
8690,26.0,0.0,0.0,1872.0,1.0,0.0,0.516217,0.475953,False,False,0,0,0,1,1
8691,32.0,0.0,1049.0,0.0,353.0,3235.0,0.357306,0.475953,False,False,1,0,0,0,1


In [33]:
# Single-step pipeline whose step is named 'classifier'; the parameter grid
# addresses this step by that name.
pipe = Pipeline([('classifier',XGBClassifier())])

In [34]:
# Candidate 1: XGBoost, 3 x 3 x 3 hyper-parameter combinations.
xgb_grid = {
    'classifier': [XGBClassifier()],
    'classifier__n_estimators': [50, 100, 200],
    'classifier__max_depth': [1, 3, 5],
    'classifier__learning_rate': [0.1, 0.3, 1],
}

# Candidate 2: LightGBM, 3 x 3 hyper-parameter combinations.
lgbm_grid = {
    'classifier': [LGBMClassifier()],
    'classifier__n_estimators': [50, 100, 200],
    'classifier__learning_rate': [0.03, 0.1, 0.3],
}

# One grid per candidate estimator; each dict is searched independently.
grid_param = [xgb_grid, lgbm_grid]

In [35]:
# Grid search with 5-fold cross-validation over the classifier-only pipeline
# and the grids in `grid_param`.
GScv2 = GridSearchCV(pipe,param_grid=grid_param,cv=5)

In [36]:
# Full pipeline: imputation -> feature engineering -> mean encoding ->
# one-hot encoding -> grid search.
# NOTE: this rebinds `pipe`. GScv2 keeps its reference to the classifier-only
# Pipeline created earlier; from here on `pipe` names the full pipeline.
pipe = make_pipeline(ft1,ft2,ft3,ft4,GScv2)

In [37]:
# Split the raw training frame into features and target, then fit the full
# pipeline (preprocessing + grid search).
X_train = df.drop(columns=['Transported'])
y_train = df['Transported']
pipe.fit(X_train, y_train)

In [38]:
# Load the unlabelled test data; 'test.csv' is resolved relative to the
# notebook's working directory.
df_test  = pd.read_csv('test.csv')

In [39]:
# Keep a separate copy of the test data; PassengerId is read from it when the
# submission frame is built.
df_test1 = df_test.copy()

In [40]:
# Predict on the raw test frame; the pipeline applies the four preprocessing
# steps before the tuned classifier.
pred = pipe.predict(df_test)

In [41]:
# Start the submission frame from the PassengerId column of the untouched
# test copy.
sub = df_test1['PassengerId'].to_frame()

In [42]:
# Store the predictions as booleans. Casting the prediction array directly
# works whether the classifier returns 0/1 or False/True, assigns by position
# (no index alignment through an intermediate DataFrame) and does not depend
# on replace() changing the column dtype.
sub['Transported'] = np.asarray(pred).astype(bool)

In [43]:
# Inspect the submission frame before writing it out.
sub

Unnamed: 0,PassengerId,Transported
0,0013_01,True
1,0018_01,False
2,0019_01,True
3,0021_01,True
4,0023_01,True
...,...,...
4272,9266_02,True
4273,9269_01,True
4274,9271_01,True
4275,9273_01,True


In [44]:
# Write the submission file without the index column.
sub.to_csv('submissionPipe.csv',index = False)