In [191]:
import pandas as pd
import xgboost as xgb
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import optuna
import lightgbm as lgb

In [192]:
mobility_train = pd.read_csv('train_Wc8LBpr.csv')
mobility_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 131662 entries, 0 to 131661
Data columns (total 14 columns):
Trip_ID                        131662 non-null object
Trip_Distance                  131662 non-null float64
Type_of_Cab                    111452 non-null object
Customer_Since_Months          125742 non-null float64
Life_Style_Index               111469 non-null float64
Confidence_Life_Style_Index    111469 non-null object
Destination_Type               131662 non-null object
Customer_Rating                131662 non-null float64
Cancellation_Last_1Month       131662 non-null int64
Var1                           60632 non-null float64
Var2                           131662 non-null int64
Var3                           131662 non-null int64
Gender                         131662 non-null object
Surge_Pricing_Type             131662 non-null int64
dtypes: float64(5), int64(4), object(5)
memory usage: 14.1+ MB


In [193]:
mobility_train.head()

Unnamed: 0,Trip_ID,Trip_Distance,Type_of_Cab,Customer_Since_Months,Life_Style_Index,Confidence_Life_Style_Index,Destination_Type,Customer_Rating,Cancellation_Last_1Month,Var1,Var2,Var3,Gender,Surge_Pricing_Type
0,T0005689460,6.77,B,1.0,2.42769,A,A,3.905,0,40.0,46,60,Female,2
1,T0005689461,29.47,B,10.0,2.78245,B,A,3.45,0,38.0,56,78,Male,2
2,T0005689464,41.58,,10.0,,,E,3.50125,2,,56,77,Male,2
3,T0005689465,61.56,C,10.0,,,A,3.45375,0,,52,74,Male,3
4,T0005689467,54.95,C,10.0,3.03453,B,A,3.4025,4,51.0,49,102,Male,2


In [194]:
mobility_test = pd.read_csv('test_VsU9xXK.csv')
mobility_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 87395 entries, 0 to 87394
Data columns (total 13 columns):
Trip_ID                        87395 non-null object
Trip_Distance                  87395 non-null float64
Type_of_Cab                    74237 non-null object
Customer_Since_Months          83429 non-null float64
Life_Style_Index               74068 non-null float64
Confidence_Life_Style_Index    74068 non-null object
Destination_Type               87395 non-null object
Customer_Rating                87395 non-null float64
Cancellation_Last_1Month       87395 non-null int64
Var1                           40606 non-null float64
Var2                           87395 non-null int64
Var3                           87395 non-null int64
Gender                         87395 non-null object
dtypes: float64(5), int64(3), object(5)
memory usage: 8.7+ MB


In [195]:
mobility_train['Type_of_Cab'].unique()

array(['B', nan, 'C', 'E', 'D', 'A'], dtype=object)

In [196]:
mobility_train['Destination_Type'].unique()

array(['A', 'E', 'B', 'C', 'G', 'D', 'F', 'K', 'L', 'H', 'I', 'J', 'M',
       'N'], dtype=object)

In [197]:
mobility_train['Confidence_Life_Style_Index'].unique()

array(['A', 'B', nan, 'C'], dtype=object)

In [198]:
type_of_cab = {'A': 0, 'B': 1, 'C': 2, 'D': 3, 'E': 4}
encode = {'Type_of_Cab': {'A': 0,'B': 1,'C': 2,'D': 3,'E' : 4},
          'Confidence_Life_Style_Index' : {'A': 0,'B': 1,'C': 2},
          'Gender':{'Male': 0,'Female': 1}
         }
          
mobility_train.replace(encode,inplace=True)
mobility_test.replace(encode,inplace=True)

In [199]:
for type in ['Destination_Type']:
    lbl = LabelEncoder()
    lbl.fit(list(mobility_train[type].values) + list(mobility_test[type].values)) 
    mobility_train[type] = lbl.transform(list(mobility_train[type].values))
    mobility_test[type] = lbl.transform(list(mobility_test[type].values))

# mobility_train['Surge_Pricing_Type'] = mobility_train['Surge_Pricing_Type'] - 1

In [200]:
mobility_train['Surge_Pricing_Type'].unique()
mobility_train['Surge_Pricing_Type'] = mobility_train['Surge_Pricing_Type'] -1

In [201]:
mobility_train.head()

Unnamed: 0,Trip_ID,Trip_Distance,Type_of_Cab,Customer_Since_Months,Life_Style_Index,Confidence_Life_Style_Index,Destination_Type,Customer_Rating,Cancellation_Last_1Month,Var1,Var2,Var3,Gender,Surge_Pricing_Type
0,T0005689460,6.77,1.0,1.0,2.42769,0.0,0,3.905,0,40.0,46,60,1,1
1,T0005689461,29.47,1.0,10.0,2.78245,1.0,0,3.45,0,38.0,56,78,0,1
2,T0005689464,41.58,,10.0,,,4,3.50125,2,,56,77,0,1
3,T0005689465,61.56,2.0,10.0,,,0,3.45375,0,,52,74,0,2
4,T0005689467,54.95,2.0,10.0,3.03453,1.0,0,3.4025,4,51.0,49,102,0,1


In [202]:
mobility_test.head()

Unnamed: 0,Trip_ID,Trip_Distance,Type_of_Cab,Customer_Since_Months,Life_Style_Index,Confidence_Life_Style_Index,Destination_Type,Customer_Rating,Cancellation_Last_1Month,Var1,Var2,Var3,Gender
0,T0005689459,9.44,0.0,10.0,2.57438,0.0,1,3.68,2,,46,63,0
1,T0005689462,32.15,1.0,10.0,2.85143,0.0,0,1.59375,0,65.0,49,80,1
2,T0005689463,10.38,2.0,4.0,2.7053,0.0,3,4.505,0,,47,74,0
3,T0005689466,14.94,,6.0,2.48159,2.0,4,4.53,0,63.0,43,54,0
4,T0005689468,32.03,1.0,7.0,2.81598,0.0,10,4.60125,3,96.0,44,56,0


In [203]:
# X = mobility_train.iloc[:,1:-1]
# y = mobility_train.iloc[:,-1]

X = mobility_train.iloc[:,1:-1]
y = mobility_train.iloc[:,-1]

X = X.fillna(-999)
y = y.fillna(-999)
# X['Type_of_Cab'] = X['Type_of_Cab'].astype('category')
X.head()

Unnamed: 0,Trip_Distance,Type_of_Cab,Customer_Since_Months,Life_Style_Index,Confidence_Life_Style_Index,Destination_Type,Customer_Rating,Cancellation_Last_1Month,Var1,Var2,Var3,Gender
0,6.77,1.0,1.0,2.42769,0.0,0,3.905,0,40.0,46,60,1
1,29.47,1.0,10.0,2.78245,1.0,0,3.45,0,38.0,56,78,0
2,41.58,-999.0,10.0,-999.0,-999.0,4,3.50125,2,-999.0,56,77,0
3,61.56,2.0,10.0,-999.0,-999.0,0,3.45375,0,-999.0,52,74,0
4,54.95,2.0,10.0,3.03453,1.0,0,3.4025,4,51.0,49,102,0


In [220]:
def classification_model_lgb_multi(X, y,SEED=123):
    X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2,stratify=y,random_state=SEED)
    print(X_train.head())
    model = xgb_classifier_multi(X_train,y_train,SEED)
#     y_pred_train = model.predict(X_train)
#     accuracy_train = accuracy_score(y_train,y_pred_train)
#     print(accuracy_train)
    y_pred = model.predict(X_test)
    print(y_pred)
    predictions2 = []
    for x in y_pred:
        predictions2.append(np.argmax(x))
    accuracy = accuracy_score(y_test,predictions2)
    print(accuracy)
    return model

def lgb_classifier_multi(X_train,y_train,SEED):
    params = {}
    params['num_leaves'] = 30
    params['num_class'] = 3
    params['learning_rate'] = 0.12
    params['objective'] = 'multiclass'
    params['boosting_type'] = 'gbdt'
    params['metric'] = 'multi_logloss'
    params['bagging_fraction'] = 0.8 #subsample
    params['feature_fraction'] = 0.7 #colsample_bytree
    params['max_depth'] = 8
#     params['categorical_feature'] = "name:'Type_of_Cab',name:'Confidence_Life_Style_Index',name:'Destination_Type',name:'Gender'"
    d_train = lgb.Dataset(X_train, label=y_train)
    lg_cl = lgb.train(params, d_train, 100)
    return lg_cl

In [221]:
model4 = classification_model_lgb_multi(X, y)

       Trip_Distance  Type_of_Cab  Customer_Since_Months  Life_Style_Index  \
96178          55.17          0.0                   10.0           2.81043   
91428           8.02          4.0                    0.0        -999.00000   
14964          31.34          2.0                   10.0           2.70473   
25321          31.52       -999.0                   10.0           2.98292   
42425          19.23          0.0                    5.0           2.23333   

       Confidence_Life_Style_Index  Destination_Type  Customer_Rating  \
96178                          0.0                 0          4.04000   
91428                       -999.0                 8          2.58375   
14964                          1.0                 0          2.26500   
25321                          1.0                 0          3.27625   
42425                          0.0                 0          3.63375   

       Cancellation_Last_1Month   Var1  Var2  Var3  Gender  
96178                         2

In [218]:
mobility_test = mobility_test.fillna(-999)
X_val = mobility_test.iloc[:,1:]
# X_val['Type_of_Cab'] = X_val['Type_of_Cab'].astype('category')

In [222]:
X_val.head()

Unnamed: 0,Trip_Distance,Type_of_Cab,Customer_Since_Months,Life_Style_Index,Confidence_Life_Style_Index,Destination_Type,Customer_Rating,Cancellation_Last_1Month,Var1,Var2,Var3,Gender
0,9.44,0.0,10.0,2.57438,0.0,1,3.68,2,-999.0,46,63,0
1,32.15,1.0,10.0,2.85143,0.0,0,1.59375,0,65.0,49,80,1
2,10.38,2.0,4.0,2.7053,0.0,3,4.505,0,-999.0,47,74,0
3,14.94,-999.0,6.0,2.48159,2.0,4,4.53,0,63.0,43,54,0
4,32.03,1.0,7.0,2.81598,0.0,10,4.60125,3,96.0,44,56,0


In [223]:
predictions = model4.predict(X_val)
predictions3 = []
for x in predictions:
    predictions3.append(np.argmax(x))
predictions4 = [x + 1 for x in predictions3]
# print(predictions4)

In [224]:
submit = pd.DataFrame({'Trip_ID': mobility_test['Trip_ID'], 'Surge_Pricing_Type': predictions4})
submit.to_csv("LGB09.csv", index=False)