In [1]:
# # Install the library
# !pip install -q "great-expectations==0.18.19"

In [2]:
# create a data context
from great_expectations.data_context import FileDataContext

import pandas as pd
from great_expectations.data_context import FileDataContext


context = FileDataContext.create(project_root_dir='./')

In [None]:
# Load data
df = pd.read_csv('./uber_data_clean.csv')

# create new column unique_transaction_id
df['unique_transaction_id'] = df['booking_id'].astype(str) + '_' + df['date'].astype(str) + '_' + df['time'].astype(str)


context = FileDataContext.create(project_root_dir='./')

#add datasource
datasource_name = 'uber-ride-data'
datasource = context.sources.add_pandas(name=datasource_name)

# add asset
asset_name = 'uber-trips-2024'
asset = datasource.add_dataframe_asset(name=asset_name, dataframe=df)

# set batch request
batch_request = asset.build_batch_request()

# Add expectation 
expectation_suite_name = 'uber_ride_data_validation'
context.add_or_update_expectation_suite(expectation_suite_name)

# create validator
validator = context.get_validator(
    batch_request=batch_request,
    expectation_suite_name=expectation_suite_name
)


validator.head()

Calculating Metrics:   0%|          | 0/1 [00:00<?, ?it/s]

Unnamed: 0,date,time,booking_id,booking_status,customer_id,vehicle_type,pickup_location,drop_location,avg_vtat,avg_ctat,...,cancelled_rides_by_driver,driver_cancellation_reason,incomplete_rides,incomplete_rides_reason,booking_value,ride_distance,driver_ratings,customer_rating,payment_method,unique_transaction_id
0,2024-03-23,12:29:38,"""CNR5884300""",No Driver Found,"""CID1982111""",eBike,Palam Vihar,Jhilmil,8.456352,29.149636,...,1.0,Unknown,1.0,Unknown,508.295912,24.637012,4.230992,4.404584,Unknown,"""CNR5884300""_2024-03-23_12:29:38"
1,2024-11-29,18:01:39,"""CNR1326809""",Incomplete,"""CID4604802""",Go Sedan,Shastri Nagar,Gurgaon Sector 56,4.9,14.0,...,1.0,Unknown,1.0,Vehicle Breakdown,237.0,5.73,4.230992,4.404584,UPI,"""CNR1326809""_2024-11-29_18:01:39"
2,2024-08-23,08:56:10,"""CNR8494506""",Completed,"""CID9202816""",Auto,Khandsa,Malviya Nagar,13.4,25.8,...,1.0,Unknown,1.0,Unknown,627.0,13.58,4.9,4.9,Debit Card,"""CNR8494506""_2024-08-23_08:56:10"
3,2024-10-21,17:17:25,"""CNR8906825""",Completed,"""CID2610914""",Premier Sedan,Central Secretariat,Inderlok,13.1,28.5,...,1.0,Unknown,1.0,Unknown,416.0,34.02,4.6,5.0,UPI,"""CNR8906825""_2024-10-21_17:17:25"
4,2024-09-16,22:08:00,"""CNR1950162""",Completed,"""CID9933542""",Bike,Ghitorni Village,Khan Market,5.3,19.6,...,1.0,Unknown,1.0,Unknown,737.0,48.21,4.1,4.3,UPI,"""CNR1950162""_2024-09-16_22:08:00"


In [4]:
# Expectation 1 : Column `unique_transaction_id` is unique
validator.expect_column_values_to_be_unique('unique_transaction_id')

Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]

{
  "success": true,
  "result": {
    "element_count": 150000,
    "unexpected_count": 0,
    "unexpected_percent": 0.0,
    "partial_unexpected_list": [],
    "missing_count": 0,
    "missing_percent": 0.0,
    "unexpected_percent_total": 0.0,
    "unexpected_percent_nonmissing": 0.0
  },
  "meta": {},
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  }
}

Ekspektasi pada kolom unique_transaction_id succes. Ini menunjukkan bahwa semua nilai di kolom tersebut unik dan tidak ada duplikasi sama sekali

In [5]:
# Expectation 2: customer_rating is between 1 and 5

validator.expect_column_values_to_be_between(
    column='customer_rating', min_value=1, max_value=5
)

Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]

{
  "success": true,
  "result": {
    "element_count": 150000,
    "unexpected_count": 0,
    "unexpected_percent": 0.0,
    "partial_unexpected_list": [],
    "missing_count": 0,
    "missing_percent": 0.0,
    "unexpected_percent_total": 0.0,
    "unexpected_percent_nonmissing": 0.0
  },
  "meta": {},
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  }
}

Ekspektasi pada kolom customer_rating succes. Ini menunjukkan bahwa semua value di kolom tersebut bernilai 1-5

In [6]:
#Expectation 3: ride_distance has a valid data type

validator.expect_column_values_to_be_in_type_list(
    'ride_distance', ['float', 'integer']
)

Calculating Metrics:   0%|          | 0/1 [00:00<?, ?it/s]

{
  "success": true,
  "result": {
    "observed_value": "float64"
  },
  "meta": {},
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  }
}

Ekspektasi pada kolom ride_distance success. Ini menunjukkan kolom tersebut sudah bertipe data float/integer dikarenakan ride distance adalah untuk kolom jarak perjalanan yang perlu diisi angka numerical

In [7]:
#Expectation 4: ride_distance betwwen min 1 max 30

validator.expect_column_mean_to_be_between('ride_distance', min_value=1, max_value=30)

Calculating Metrics:   0%|          | 0/4 [00:00<?, ?it/s]

{
  "success": true,
  "result": {
    "observed_value": 24.637011666666666
  },
  "meta": {},
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  }
}

Ekspektasi pada kolom ride_distance success. Ini menunjukkan kolom tersebut sudah memiliki rata rata minimal 1 km - 30 km value

In [8]:
#Expectation 5: booking_status is a valid status
validator.expect_column_values_to_be_in_set(
    'booking_status', 
    ['Completed', 'Cancelled by Customer', 'Cancelled by Driver', 'No Driver Found', 'Incomplete', 'Pending', 'Scheduled', 'In Transit'] 
)

Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]

{
  "success": true,
  "result": {
    "element_count": 150000,
    "unexpected_count": 0,
    "unexpected_percent": 0.0,
    "partial_unexpected_list": [],
    "missing_count": 0,
    "missing_percent": 0.0,
    "unexpected_percent_total": 0.0,
    "unexpected_percent_nonmissing": 0.0
  },
  "meta": {},
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  }
}

Ekspektasi pada kolom booking_status success. Ini menunjukkan bahwa semua data pada kolom booking_status sudah sesuai dengan daftar status yang valid dengan tidak ada nilai yang di luar daftar tersebut

In [9]:
#Expectation 6: driver_cancellation_reason is not null 

validator.expect_column_values_to_not_be_null(
    'driver_cancellation_reason',
    mostly=0.95, 
    condition_list=[{'condition': 'booking_status', 'op': 'eq', 'value': 'Cancelled by Driver'}]
)

Calculating Metrics:   0%|          | 0/6 [00:00<?, ?it/s]

{
  "success": true,
  "result": {
    "element_count": 150000,
    "unexpected_count": 0,
    "unexpected_percent": 0.0,
    "partial_unexpected_list": []
  },
  "meta": {},
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  }
}

Ekspektasi pada kolom driver_cancellation_reason success. Ini menunjukkan bahwa setiap kali booking_status bernilai 'Cancelled by Driver' maka kolom driver_cancellation_reason tidak boleh kosong

In [10]:
#Expectation 7: booking_value has a minimum value 0

validator.expect_column_min_to_be_between('booking_value', min_value=0)

Calculating Metrics:   0%|          | 0/4 [00:00<?, ?it/s]

{
  "success": true,
  "result": {
    "observed_value": 50.0
  },
  "meta": {},
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  }
}

Ekspektasi pada kolom booking_value success. Ini menunjukkan bahwa nilai terkecil pada kolom booking_value adalah 0 maka tidak ada nilai negatif di dalamnya

In [11]:
validator.save_expectation_suite(discard_failed_expectations=False)