In [19]:
%cd /content/drive/MyDrive/Hackathon/Hackathon

/content/drive/MyDrive/Hackathon/Hackathon


In [20]:
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

import numpy as np
import pandas as pd
pd.set_option('precision', 3)

# Data Visualisation Libraries
import matplotlib.pyplot as plt
%config InlineBackend.figure_format = 'retina'

!pip install seaborn --upgrade
import seaborn as sns
sns.set_style('darkgrid')

# Statistics
from scipy.stats import chi2_contingency
from imblearn.over_sampling import SMOTE

# Machine Learning
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.model_selection import cross_val_score, cross_val_predict
from sklearn.model_selection import learning_curve

from sklearn.preprocessing import LabelEncoder, StandardScaler

from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, VotingClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier

from sklearn.metrics import accuracy_score, recall_score, precision_score, auc, roc_auc_score, roc_curve
from sklearn.metrics import confusion_matrix

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [21]:
train_df = pd.read_csv('/content/drive/MyDrive/Hackathon/Hackathon/processed_data/MergedTrainData_NaNMean.csv')
test_df = pd.read_csv('/content/drive/MyDrive/Hackathon/Hackathon/processed_data/MergedTestData_NanMean.csv')
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 94379 entries, 0 to 94378
Data columns (total 20 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   Gender                  94379 non-null  int64  
 1   Customer_Type           94379 non-null  float64
 2   Age                     94379 non-null  float64
 3   Travel_Class            94379 non-null  int64  
 4   Travel_Distance         94379 non-null  int64  
 5   Arrival_Delay_in_Mins   94379 non-null  float64
 6   Overall_Experience      94379 non-null  int64  
 7   Seat_Comfort            94379 non-null  float64
 8   Catering                94379 non-null  float64
 9   Platform_Location       94379 non-null  float64
 10  Onboard_Wifi_Service    94379 non-null  float64
 11  Onboard_Entertainment   94379 non-null  float64
 12  Online_Support          94379 non-null  float64
 13  Ease_of_Online_Booking  94379 non-null  float64
 14  Onboard_Service         94379 non-null

In [22]:
train_df.isnull().sum()

Gender                    0
Customer_Type             0
Age                       0
Travel_Class              0
Travel_Distance           0
Arrival_Delay_in_Mins     0
Overall_Experience        0
Seat_Comfort              0
Catering                  0
Platform_Location         0
Onboard_Wifi_Service      0
Onboard_Entertainment     0
Online_Support            0
Ease_of_Online_Booking    0
Onboard_Service           0
Legroom                   0
Baggage_Handling          0
CheckIn_Service           0
Cleanliness               0
Online_Boarding           0
dtype: int64

In [23]:
test_df.isnull().sum()

Gender                    0
Customer_Type             0
Age                       0
Travel_Class              0
Travel_Distance           0
Arrival_Delay_in_Mins     0
Seat_Comfort              0
Catering                  0
Platform_Location         0
Onboard_Wifi_Service      0
Onboard_Entertainment     0
Online_Support            0
Ease_of_Online_Booking    0
Onboard_Service           0
Legroom                   0
Baggage_Handling          0
CheckIn_Service           0
Cleanliness               0
Online_Boarding           0
dtype: int64

In [24]:
train_df.shape

(94379, 20)

In [25]:
from sklearn import model_selection

#add extra one columns
train_df['kfold']=-1
#Distributing the data 5 shares
kfold = model_selection.KFold(n_splits=10, shuffle= True, random_state = 12)
for fold, (train_indicies, valid_indicies) in enumerate(kfold.split(X=train_df)):
    #print(fold,train_indicies,valid_indicies)
    train_df.loc[valid_indicies,'kfold'] = fold

    
print(train_df.kfold.value_counts()) #total data 300000 = kfold split :5 * 60000

#output of train folds data
train_df.to_csv("trainfold_10.csv",index=False)

0    9438
3    9438
8    9438
4    9438
5    9438
6    9438
2    9438
1    9438
7    9438
9    9437
Name: kfold, dtype: int64


In [26]:
#import the data and shape
train = pd.read_csv("/content/drive/MyDrive/Hackathon/Hackathon/trainfold_10.csv")
test = pd.read_csv('/content/drive/MyDrive/Hackathon/Hackathon/processed_data/MergedTestData_NanMean.csv')
sample_submission = pd.read_csv("/content/drive/MyDrive/Hackathon/Hackathon/Sample_Submission_(1).csv")
#preview the train data
print(train.shape,test.shape)
train.sample(3)

(94379, 21) (35602, 19)


Unnamed: 0,Gender,Customer_Type,Age,Travel_Class,Travel_Distance,Arrival_Delay_in_Mins,Overall_Experience,Seat_Comfort,Catering,Platform_Location,...,Onboard_Entertainment,Online_Support,Ease_of_Online_Booking,Onboard_Service,Legroom,Baggage_Handling,CheckIn_Service,Cleanliness,Online_Boarding,kfold
58848,0,0.0,68.0,0,1857,0.0,1,3.0,3.0,3.0,...,3.0,1.0,5.0,5.0,5.0,5.0,4.0,5.0,5.0,8
10476,1,0.0,26.0,1,2787,17.0,0,3.0,2.0,3.0,...,2.0,5.0,5.0,5.0,4.0,2.0,3.0,1.0,5.0,4
3221,0,0.0,40.0,0,2694,0.0,1,4.0,4.0,4.0,...,4.0,4.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,6


In [27]:
train.columns

Index(['Gender', 'Customer_Type', 'Age', 'Travel_Class', 'Travel_Distance',
       'Arrival_Delay_in_Mins', 'Overall_Experience', 'Seat_Comfort',
       'Catering', 'Platform_Location', 'Onboard_Wifi_Service',
       'Onboard_Entertainment', 'Online_Support', 'Ease_of_Online_Booking',
       'Onboard_Service', 'Legroom', 'Baggage_Handling', 'CheckIn_Service',
       'Cleanliness', 'Online_Boarding', 'kfold'],
      dtype='object')

In [32]:
from sklearn.preprocessing import OrdinalEncoder,RobustScaler
from sklearn.metrics import mean_squared_error

#store the final_prediction data and score
final_predictions = []
score= []

#features(categorical and numerical datas separate)
useful_features = [c for c in train.columns if c not in ("Overall_Experience","kfold","Travel_Class", "Onboard_Wifi_Service", "Platform_Location", "Baggage_Handling", "Legroom")]
numerical_cols = ['Gender', 'Customer_Type', 'Age','Travel_Distance',
       'Arrival_Delay_in_Mins','Seat_Comfort','Catering',
       'Onboard_Entertainment', 'Online_Support', 'Ease_of_Online_Booking','Onboard_Service', 'CheckIn_Service',
       'Cleanliness', 'Online_Boarding']
test = test[useful_features]

for fold in range(10):
    xtrain = train[train.kfold != fold].reset_index(drop=True)
    xvalid = train[train.kfold == fold].reset_index(drop=True)
    xtest = test.copy()
    
    ytrain = xtrain.Overall_Experience
    yvalid = xvalid.Overall_Experience
    
    xtrain = xtrain[useful_features]
    xvalid = xvalid[useful_features]
    
    scaler = RobustScaler()
    xtrain[numerical_cols] = scaler.fit_transform(xtrain[numerical_cols])
    xvalid[numerical_cols] = scaler.transform(xvalid[numerical_cols])
    xtest[numerical_cols] = scaler.transform(xtest[numerical_cols])
    
    #Model hyperparameter of XGboostclassifier major parameters
    xgb_params = {
        'learning_rate': 0.001356,
        'max_depth': 6,
        'random_state':12,
        'n_estimators':30000
        }
    
    model= XGBClassifier(**xgb_params,
                       tree_method='gpu_hist',
                       predictor='gpu_predictor',
                       gpu_id=0)
    model.fit(xtrain,ytrain,early_stopping_rounds=1000,eval_set=[(xvalid,yvalid)],verbose=2000)
    preds_valid = model.predict(xvalid)
    
    #Training model apply the test data and predict the output
    test_pre = model.predict(xtest)
    final_predictions.append(test_pre)
    
    #Rootmeansquared output
    rms = mean_squared_error(yvalid,preds_valid,squared=False)
    
    score.append(rms)
    #way of output is display
    print(f"fold:{fold},rmse:{rms}")

#mean of repeation of fold data and identify the  mean and standard deviation 
print(np.mean(score),np.std(score))

[0]	validation_0-error:0.110405
Will train until validation_0-error hasn't improved in 1000 rounds.
[2000]	validation_0-error:0.089956
[4000]	validation_0-error:0.083916
[6000]	validation_0-error:0.076287
[8000]	validation_0-error:0.075122
[10000]	validation_0-error:0.072155
[12000]	validation_0-error:0.071096
Stopping. Best iteration:
[12576]	validation_0-error:0.070566

fold:0,rmse:0.26564223654856756
[0]	validation_0-error:0.109239
Will train until validation_0-error hasn't improved in 1000 rounds.
[2000]	validation_0-error:0.094088
[4000]	validation_0-error:0.084764
[6000]	validation_0-error:0.078618
[8000]	validation_0-error:0.073533
[10000]	validation_0-error:0.070566
Stopping. Best iteration:
[9673]	validation_0-error:0.070248

fold:1,rmse:0.26504326794751365
[0]	validation_0-error:0.112948
Will train until validation_0-error hasn't improved in 1000 rounds.
[2000]	validation_0-error:0.093982
[4000]	validation_0-error:0.082962
[6000]	validation_0-error:0.077029
[8000]	validation_

In [35]:
#prediction of data
preds = np.mean(np.column_stack(final_predictions),axis=1)
print(preds)
sample_submission.Overall_Experience = preds
sample_submission.to_csv("submission1.csv",index=False)
print("success")

[1. 1. 1. ... 0. 1. 0.]
success


In [34]:
sample_submission.head(10)

Unnamed: 0,ID,Overall_Experience
0,99900001,1.0
1,99900002,1.0
2,99900003,1.0
3,99900004,0.0
4,99900005,1.0
5,99900006,0.8
6,99900007,0.0
7,99900008,1.0
8,99900009,0.0
9,99900010,0.0
