In [61]:
import numpy as np
from sklearn import datasets, metrics, model_selection, svm, linear_model
import pandas as pd
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from unicodedata import category
from category_encoders import TargetEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
import xgboost as xgb
import seaborn as sns

Data loading

In [62]:
# Read the training and test sets from CSV files located in the working directory.
df = pd.read_csv(filepath_or_buffer='train.csv')
df_test = pd.read_csv(filepath_or_buffer='test.csv')


Data analysis:

Getting dataframe info:

In [63]:
# Print the column names, non-null counts and dtypes of the training frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 42100 entries, 0 to 42099
Data columns (total 19 columns):
 #   Column                                Non-Null Count  Dtype  
---  ------                                --------------  -----  
 0   id                                    42100 non-null  int64  
 1   no_of_adults                          42100 non-null  int64  
 2   no_of_children                        42100 non-null  int64  
 3   no_of_weekend_nights                  42100 non-null  int64  
 4   no_of_week_nights                     42100 non-null  int64  
 5   type_of_meal_plan                     42100 non-null  int64  
 6   required_car_parking_space            42100 non-null  int64  
 7   room_type_reserved                    42100 non-null  int64  
 8   lead_time                             42100 non-null  int64  
 9   arrival_year                          42100 non-null  int64  
 10  arrival_month                         42100 non-null  int64  
 11  arrival_date   

In [64]:
# Display the first rows of the training frame as a quick sanity check.
df.head()

Unnamed: 0,id,no_of_adults,no_of_children,no_of_weekend_nights,no_of_week_nights,type_of_meal_plan,required_car_parking_space,room_type_reserved,lead_time,arrival_year,arrival_month,arrival_date,market_segment_type,repeated_guest,no_of_previous_cancellations,no_of_previous_bookings_not_canceled,avg_price_per_room,no_of_special_requests,booking_status
0,0,2,0,0,2,1,0,0,9,2018,1,14,1,1,11,0,67.5,0,0
1,1,2,0,1,2,0,0,0,117,2018,7,29,0,0,0,0,72.25,0,0
2,2,2,0,0,1,0,0,0,315,2018,12,2,0,0,0,0,52.0,0,0
3,3,1,0,0,2,1,0,0,32,2018,12,1,1,0,0,0,56.0,0,0
4,4,2,0,1,0,0,0,0,258,2018,10,16,0,0,0,0,100.0,0,1


In [65]:
# Remove the row identifier so it is not available as a predictor.
# The frames are rebound instead of mutated with inplace=True, and
# errors='ignore' makes the cell idempotent: re-running it after 'id' is
# already gone no longer raises KeyError.
df = df.drop(columns='id', errors='ignore')
df_test = df_test.drop(columns='id', errors='ignore')



Checking for missing values:

In [66]:
# Count the missing values in every column of the training frame.
df.isna().sum()

no_of_adults                            0
no_of_children                          0
no_of_weekend_nights                    0
no_of_week_nights                       0
type_of_meal_plan                       0
required_car_parking_space              0
room_type_reserved                      0
lead_time                               0
arrival_year                            0
arrival_month                           0
arrival_date                            0
market_segment_type                     0
repeated_guest                          0
no_of_previous_cancellations            0
no_of_previous_bookings_not_canceled    0
avg_price_per_room                      0
no_of_special_requests                  0
booking_status                          0
dtype: int64

In [67]:
# Separate predictors from the target by column NAME rather than by position,
# so a change in column order cannot silently select the wrong target.
TARGET = 'booking_status'

X_train = df.drop(columns=TARGET)
y_train = df[TARGET]

# The test file carries no target column; take an independent copy of all of it.
X_test = df_test.copy()

Feature encoding

In [68]:
# Columns treated as categories and one-hot encoded. They appear as integer
# codes in the data preview; calendar fields (month, day of month) are encoded
# the same way so the linear model is not forced to fit a monotone effect.
categorical_features = [
    'market_segment_type',
    'room_type_reserved',
    'type_of_meal_plan',
    'arrival_month',
    'arrival_date',
]

# Every remaining predictor is standardised. They are listed explicitly
# because ColumnTransformer defaults to remainder='drop', which silently
# discards any column not named in a transformer.
# 'arrival_date' is intentionally NOT repeated here: it is already one-hot
# encoded above and must not be fed to the model twice.
numeric_features = [
    'lead_time',
    'avg_price_per_room',
    'no_of_adults',
    'no_of_children',
    'no_of_weekend_nights',
    'no_of_week_nights',
    'arrival_year',
    'no_of_previous_cancellations',
    'no_of_previous_bookings_not_canceled',
    'no_of_special_requests',
    'required_car_parking_space',
    'repeated_guest',
]

preprocessor = ColumnTransformer(transformers=[
    # handle_unknown='ignore' encodes categories unseen during fit as all zeros
    # instead of raising at prediction time.
    ('one_hot_encoder', OneHotEncoder(handle_unknown='ignore'), categorical_features),
    ('scaler', StandardScaler(), numeric_features),
])

Pipeline creation

In [69]:
# Chain the column preprocessing with a logistic-regression classifier so that
# fitting and prediction always apply the same transformations.
# class_weight='balanced' reweights the classes inversely to their frequency;
# max_iter=1000 raises the solver's iteration limit.
pipeline = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        (
            'classifier',
            linear_model.LogisticRegression(max_iter=1000, class_weight='balanced'),
        ),
    ],
)


Pipeline fitting

In [70]:
# Fit every pipeline step on the training predictors and target.
pipeline.fit(X_train, y_train)

0,1,2
,steps,"[('preprocessor', ...), ('classifier', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('one_hot_encoder', ...), ('scaler', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,categories,'auto'
,drop,
,sparse_output,True
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,'balanced'
,random_state,
,solver,'lbfgs'
,max_iter,1000


Prediction

In [71]:
# Predict hard class labels for the test set (displayed only; the result is not stored).
pipeline.predict(X_test)


array([0, 0, 1, ..., 0, 0, 1], shape=(28068,))

Submission

In [72]:
# Build the submission file: ids come from the sample submission, scores are
# the predicted probability of the positive class (booking_status == 1).
sample_submission = pd.read_csv('sample_submission.csv', index_col=False)

# Look up the probability column of class 1 instead of assuming it is index 1.
positive_class_index = list(pipeline.classes_).index(1)
positive_proba = pipeline.predict_proba(X_test)[:, positive_class_index]

# Fail loudly if the test set and the sample submission are not aligned,
# rather than writing a file with mismatched rows.
if len(sample_submission) != len(positive_proba):
    raise ValueError(
        f'sample_submission.csv has {len(sample_submission)} rows but '
        f'{len(positive_proba)} predictions were produced'
    )

# The first column of the sample file is taken as the id; the frame is built
# with its final column names directly, so no later rename is needed.
df_submission = pd.DataFrame({
    'id': sample_submission.iloc[:, 0],
    'booking_status': positive_proba,
})
df_submission.to_csv('submission.csv', index=False)