In [74]:
import pandas as pd
import numpy as np
import datetime
from sklearn.model_selection import train_test_split
from sklearn.metrics import root_mean_squared_error
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.linear_model import LinearRegression

In [75]:
# Load the raw ride-booking export, then report how many values are missing
# in each column and how many rows the file contains.
data = pd.read_csv('ncr_ride_bookings.csv')
missing_per_column = data.isna().sum()
print(missing_per_column)
print('всего строк: ', data.shape[0])

Date                                      0
Time                                      0
Booking ID                                0
Booking Status                            0
Customer ID                               0
Vehicle Type                              0
Pickup Location                           0
Drop Location                             0
Avg VTAT                              10500
Avg CTAT                              48000
Cancelled Rides by Customer          139500
Reason for cancelling by Customer    139500
Cancelled Rides by Driver            123000
Driver Cancellation Reason           123000
Incomplete Rides                     141000
Incomplete Rides Reason              141000
Booking Value                         48000
Ride Distance                         48000
Driver Ratings                        57000
Customer Rating                       57000
Payment Method                        48000
dtype: int64
всего строк:  150000


In [76]:
# Distinct booking outcomes present in the data.
print(data['Booking Status'].unique())
# Number of bookings made by one sample customer; the ID is compared together
# with the literal double quotes it carries in the CSV.
print((data['Customer ID'] == '"CID9933542"').sum())
# Preview of the first five rows.
data.head()

['No Driver Found' 'Incomplete' 'Completed' 'Cancelled by Driver'
 'Cancelled by Customer']
1


Unnamed: 0,Date,Time,Booking ID,Booking Status,Customer ID,Vehicle Type,Pickup Location,Drop Location,Avg VTAT,Avg CTAT,...,Reason for cancelling by Customer,Cancelled Rides by Driver,Driver Cancellation Reason,Incomplete Rides,Incomplete Rides Reason,Booking Value,Ride Distance,Driver Ratings,Customer Rating,Payment Method
0,2024-03-23,12:29:38,"""CNR5884300""",No Driver Found,"""CID1982111""",eBike,Palam Vihar,Jhilmil,,,...,,,,,,,,,,
1,2024-11-29,18:01:39,"""CNR1326809""",Incomplete,"""CID4604802""",Go Sedan,Shastri Nagar,Gurgaon Sector 56,4.9,14.0,...,,,,1.0,Vehicle Breakdown,237.0,5.73,,,UPI
2,2024-08-23,08:56:10,"""CNR8494506""",Completed,"""CID9202816""",Auto,Khandsa,Malviya Nagar,13.4,25.8,...,,,,,,627.0,13.58,4.9,4.9,Debit Card
3,2024-10-21,17:17:25,"""CNR8906825""",Completed,"""CID2610914""",Premier Sedan,Central Secretariat,Inderlok,13.1,28.5,...,,,,,,416.0,34.02,4.6,5.0,UPI
4,2024-09-16,22:08:00,"""CNR1950162""",Completed,"""CID9933542""",Bike,Ghitorni Village,Khan Market,5.3,19.6,...,,,,,,737.0,48.21,4.1,4.3,UPI


In [77]:
# Convert the Date and Time columns into two numeric features:
#   Day    - day of the year (1-based),
#   Minute - minutes elapsed since midnight (seconds are discarded).
# Each column is parsed once with vectorised pandas accessors instead of a
# per-row lambda that parsed every time string twice.
# NOTE(review): assumes 'YYYY-MM-DD' dates and 'HH:MM:SS' times, as in the data
# preview; any other layout raises here instead of being silently misparsed.
booking_dates = pd.to_datetime(data['Date'], format='%Y-%m-%d')
booking_times = pd.to_datetime(data['Time'], format='%H:%M:%S')
# Cast to int64 so the dtypes match what the previous per-row version produced.
data['Day'] = booking_dates.dt.dayofyear.astype('int64')
data['Minute'] = (booking_times.dt.hour * 60 + booking_times.dt.minute).astype('int64')

In [78]:
# Missing values per column, restricted to bookings whose status is 'Completed'.
data.loc[data['Booking Status'].eq('Completed')].isna().sum()

Date                                     0
Time                                     0
Booking ID                               0
Booking Status                           0
Customer ID                              0
Vehicle Type                             0
Pickup Location                          0
Drop Location                            0
Avg VTAT                                 0
Avg CTAT                                 0
Cancelled Rides by Customer          93000
Reason for cancelling by Customer    93000
Cancelled Rides by Driver            93000
Driver Cancellation Reason           93000
Incomplete Rides                     93000
Incomplete Rides Reason              93000
Booking Value                            0
Ride Distance                            0
Driver Ratings                           0
Customer Rating                          0
Payment Method                           0
Day                                      0
Minute                                   0
dtype: int64

У выполненных заказов нет NaN, кроме полей, связанных с невыполнением заказа.

In [None]:
# Columns that the model will use (features plus the 'Booking Value' target).
need_columns = ['Ride Distance','Vehicle Type','Day','Minute','Driver Ratings','Booking Value']
# Keep only the rows of completed bookings.
is_completed = data['Booking Status'] == 'Completed'
comleted_bookings = data.loc[is_completed]
# Everything outside the needed columns is dropped; the remaining columns keep
# the order they have in the source frame.
drop_columns = [name for name in comleted_bookings.columns if name not in need_columns]
comleted_bookings = comleted_bookings.drop(columns=drop_columns)

In [80]:
class LabelEncoder:
    """Replace text (object/string dtype) columns of a DataFrame with numbers.

    The mode is chosen by ``target_column`` at fit time:

    * ``target_column is None`` - ordinal codes: every distinct value gets its
      position in order of first appearance (0, 1, 2, ...).
    * otherwise - target-based codes: every category is replaced by the mean of
      the target inside that category, centred on the mean of the per-category
      means and divided by their population variance.

    The fitted mapping is stored in ``self.marks`` as
    ``{column: {category: code}}``. Values that have no entry in the mapping
    become NaN in ``transform``.
    """

    def __init__(self):
        # Defined up front so that transform() on an unfitted encoder leaves
        # the data unchanged instead of raising AttributeError.
        self.marks = {}

    @staticmethod
    def _is_text_column(values: pd.Series) -> bool:
        """Return True for object-dtype and pandas string-dtype columns."""
        dtype = values.dtype
        return bool(pd.api.types.is_object_dtype(dtype) or pd.api.types.is_string_dtype(dtype))

    def fit_transform(self, data: pd.DataFrame, encoding_columns = None,target_column = None):
        """Fit the mapping on ``data`` and return an encoded copy of it.

        Parameters mirror ``fit``; the input frame is not modified.
        """
        self.fit(data, encoding_columns, target_column)
        return self.transform(data, encoding_columns)

    def fit(self, data: pd.DataFrame, encoding_columns = None, target_column = None):
        """Learn the category -> number mapping and store it in ``self.marks``.

        Parameters
        ----------
        data : pd.DataFrame
            Frame to learn from; it is only read, never modified.
        encoding_columns : iterable of column labels, optional
            Columns to consider. ``None`` means every column of ``data``.
            Non-text columns in the selection are skipped.
        target_column : column label, optional
            When given, categories are encoded from the target's per-category
            mean; when ``None``, ordinal codes are used.
        """
        marks = {}

        # Identity check: `== None` is element-wise for Index/array arguments
        # and makes the `if` ambiguous.
        if encoding_columns is None:
            encoding_columns = data.columns

        for column in encoding_columns:
            values = data[column]
            if not self._is_text_column(values):
                continue

            if target_column is None:
                # Ordinal codes in order of first appearance.
                marks[column] = {value: code for code, value in enumerate(values.unique())}
                continue

            # Mean target per category. groupby ignores rows whose category is
            # missing, and categories whose target mean is undefined are
            # dropped, so a single NaN cannot turn every code into NaN.
            means = (
                data[target_column]
                .groupby(values, sort=False)
                .mean()
                .dropna()
                .sort_values()
            )
            if means.empty:
                marks[column] = {}
                continue

            per_category = means.to_numpy(dtype=float)
            if per_category.max() == per_category.min():
                # One category, or all categories share the same mean: the
                # centred value is zero and the variance is zero, so emit 0.0
                # instead of dividing by zero.
                marks[column] = {key: 0.0 for key in means.index}
                continue

            # Subtract the mean of the category means and divide by their
            # (population, ddof=0) variance.
            center = per_category.mean()
            spread = per_category.var()
            marks[column] = {
                key: float((value - center) / spread) for key, value in means.items()
            }

        self.marks = marks

    def transform(self, data:pd.DataFrame, encoding_columns = None,):
        """Return a copy of ``data`` with fitted columns replaced by their codes.

        Only columns that are both selected (``encoding_columns``, or all
        columns when ``None``) and present in ``self.marks`` are changed.
        """
        X = data.copy()

        if encoding_columns is None:
            encoding_columns = X.columns

        for column in encoding_columns:
            if column in self.marks:
                X[column] = X[column].map(self.marks[column])
        return X

In [81]:
# Encode the text columns of the completed bookings using the per-category
# mean of 'Booking Value'.
# NOTE(review): the encoder is fitted on the whole frame before the train/test
# split performed later, so test rows contribute to the encoding.
LE = LabelEncoder()
LE.fit(comleted_bookings, target_column='Booking Value')
comleted_bookings = LE.transform(comleted_bookings)
comleted_bookings.head()


Unnamed: 0,Vehicle Type,Booking Value,Ride Distance,Driver Ratings,Day,Minute
2,-0.160707,627.0,13.58,4.9,236,536
3,0.275613,416.0,34.02,4.6,295,1037
4,0.211518,737.0,48.21,4.1,260,1328
5,-0.160707,316.0,4.85,4.1,37,584
6,-0.033631,640.0,41.24,4.0,169,945


In [82]:
# Fixed seed so that the split and the boosted model give the same numbers on
# every Restart & Run All.
RANDOM_STATE = 42

# Features are every column except the 'Booking Value' target.
X = comleted_bookings.drop(columns=['Booking Value'])
y = comleted_bookings['Booking Value']

# Hold out 20% of the rows for evaluation.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=RANDOM_STATE
)
models = {
    'grad_boost': GradientBoostingRegressor(
        n_estimators=100, learning_rate=0.1, random_state=RANDOM_STATE
    ),
    'linear_regression': LinearRegression(),
}

# Reference point for the RMSE values below: always predicting the test mean
# gives an RMSE of roughly the test standard deviation.
print('Среднее целевой переменной:', y_test.mean(),'+-',y_test.std())

# Fit each model on the training part and report its RMSE on the held-out part.
for model_name, model in models.items():
    model.fit(X_train, y_train)
    pred = model.predict(X_test)
    print('RMSE',model_name,':',root_mean_squared_error(y_test,pred))

Среднее целевой переменной: 505.69408602150537 +- 389.495448511916
RMSE grad_boost : 384.9608542190583
RMSE linear_regression : 389.50425111656
