In [2]:
import pandas as pd
import sys
import os

# Add the project directory (the parent of the current working directory) to
# the Python path so the `src` package imported below can be resolved.
# NOTE(review): this relies on the kernel's working directory being one level
# below the project root — confirm when running from a different location.
project_dir = os.path.abspath(os.path.join(os.getcwd(), '..'))
# Guard the append so re-running this cell does not pile up duplicate entries.
if project_dir not in sys.path:
    sys.path.append(project_dir)

# Import from src
from src.data_processing import DataProcessor, DropColumnsStrategy, ConvertToNumericStrategy, ConvertTimeStrategy, ExtractTimeFeatureStrategy, RemoveWeekendData
from src.trip_processing import TripProcessor, ClusterStopsStrategy, CreateTripSegments, FilterTripSegments, RemoveLongDurationTrips
from src.data_ingestion import load_data

In [10]:
def load_data_from_files(file_info_list):
    """Load every described file and stack the results into one DataFrame.

    Args:
        file_info_list: Iterable of dicts, one per input file. Each dict must
            provide 'file_path' and 'file_type'; 'correction_factor' is
            optional.

    Returns:
        A single DataFrame holding the rows of all files in the order the
        files were listed, with a fresh 0..n-1 index (per-file indexes are
        discarded). No sorting is applied here.

    Raises:
        ValueError: If file_info_list yields no entries.
        KeyError: If an entry lacks 'file_path' or 'file_type'.
    """
    frames = [
        load_data(
            file_info['file_path'],
            file_type=file_info['file_type'],
            # A missing factor is forwarded explicitly as None.
            # NOTE(review): this overrides whatever default load_data declares
            # for correction_factor — confirm None is an accepted value.
            correction_factor=file_info.get('correction_factor'),
        )
        for file_info in file_info_list
    ]

    # pd.concat fails with an unhelpful "No objects to concatenate" on an
    # empty list; report the actual cause instead (same exception type).
    if not frames:
        raise ValueError(
            "file_info_list is empty: at least one file description is required"
        )

    return pd.concat(frames, ignore_index=True)

def prepare_data(df):
    """Run a DataFrame through the data-preparation strategy chain.

    The strategies are handed to DataProcessor in this order:
    DropColumnsStrategy (for 'flag', 'speed', 'imei', 'bus_id'),
    ConvertToNumericStrategy (for 'latitude', 'longitude'),
    ConvertTimeStrategy, ExtractTimeFeatureStrategy and RemoveWeekendData.
    What each strategy does is defined in src.data_processing.

    Args:
        df: The DataFrame to prepare.

    Returns:
        Whatever DataProcessor.process returns for the given frame.
    """
    columns_to_drop = ['flag', 'speed', 'imei', 'bus_id']
    numeric_columns = ['latitude', 'longitude']

    preparation_steps = [
        DropColumnsStrategy(columns_to_drop),
        ConvertToNumericStrategy(numeric_columns),
        ConvertTimeStrategy(),
        ExtractTimeFeatureStrategy(),
        RemoveWeekendData(),
    ]
    return DataProcessor(preparation_steps).process(df)

def process_trips(df, eps=0.1, min_samples=5):
    """Run a prepared DataFrame through the trip-processing strategy chain.

    The strategies are handed to TripProcessor in this order:
    ClusterStopsStrategy, CreateTripSegments, FilterTripSegments and
    RemoveLongDurationTrips. What each strategy does is defined in
    src.trip_processing.

    Args:
        df: The prepared DataFrame to derive trips from.
        eps: Forwarded to ClusterStopsStrategy. Defaults to 0.1, the value
            that was previously hard-coded.
            NOTE(review): the unit of eps is not visible here (degrees? km?)
            — confirm against ClusterStopsStrategy before tuning.
        min_samples: Forwarded to ClusterStopsStrategy. Defaults to 5, the
            value that was previously hard-coded.

    Returns:
        Whatever TripProcessor.process returns for the given frame.
    """
    processor = TripProcessor([
        # Clustering parameters are exposed so they can be tuned from the
        # calling cell without editing this function.
        ClusterStopsStrategy(eps=eps, min_samples=min_samples),
        CreateTripSegments(),
        FilterTripSegments(),
        RemoveLongDurationTrips()
    ])
    return processor.process(df)

# Raw monthly location exports (Feb-Sep 2024), all CSV files under
# <project_dir>/data/raw. File names are kept verbatim because the month
# abbreviations are not uniform ('June'/'July' vs. 'Feb', 'Mar', ...).
file_info_list = [
    {'file_path': os.path.join(project_dir, 'data', 'raw', csv_name), 'file_type': 'csv'}
    for csv_name in (
        'location_data_Feb2024.csv',
        'location_data_Mar2024.csv',
        'location_data_Apr2024.csv',
        'location_data_May2024.csv',
        'location_data_June2024.csv',
        'location_data_July2024.csv',
        'location_data_Aug2024.csv',
        'location_data_Sep2024.csv',
    )
]

In [12]:
# Load every file listed in file_info_list into one DataFrame and display it.
combined_df = load_data_from_files(file_info_list)
combined_df


Unnamed: 0,bus_id,route_id,imei,latitude,longitude,speed,time,flag
0,33,10,f2:ab:73:19:59:7,-7.93324,112.60354,10,2024-02-01 06:54:08,1
1,33,10,f2:ab:73:19:59:7,-7.93324,112.60354,10,2024-02-01 06:58:14,1
2,33,10,f2:ab:73:19:59:7,-7.94329,112.61047,10,2024-02-01 07:00:07,1
3,33,10,f2:ab:73:19:59:7,-7.94979,112.61559,10,2024-02-01 07:09:44,0
4,33,10,f2:ab:73:19:59:7,-7.94979,112.61559,10,2024-02-01 07:09:49,1
...,...,...,...,...,...,...,...,...
72313,33,11,f2:ab:73:19:59:7,-7.94331,112.61043,10,2024-09-30 18:14:44,1
72314,33,11,f2:ab:73:19:59:7,-7.94331,112.61043,10,2024-09-30 18:15:04,0
72315,33,11,f2:ab:73:19:59:7,-7.94331,112.61043,10,2024-09-30 18:15:17,0
72316,33,11,f2:ab:73:19:59:7,-7.94331,112.61043,10,2024-09-30 18:15:31,1


In [13]:
# Run the combined frame through prepare_data and display the result.
prepared_df = prepare_data(combined_df)
prepared_df


Unnamed: 0,route_id,latitude,longitude,time,hour,weekday,day_name,month,month_name
0,10,-7.93324,112.60354,2024-02-01 06:54:08,6,3,Thursday,2,February
1,10,-7.93324,112.60354,2024-02-01 06:58:14,6,3,Thursday,2,February
2,10,-7.94329,112.61047,2024-02-01 07:00:07,7,3,Thursday,2,February
3,10,-7.94979,112.61559,2024-02-01 07:09:44,7,3,Thursday,2,February
4,10,-7.94979,112.61559,2024-02-01 07:09:49,7,3,Thursday,2,February
...,...,...,...,...,...,...,...,...,...
69657,15,1.53818,103.62837,2024-08-30 21:28:24,21,4,Friday,8,August
69658,15,1.53818,103.62837,2024-08-30 21:28:42,21,4,Friday,8,August
69659,15,1.53818,103.62837,2024-08-30 21:29:12,21,4,Friday,8,August
69740,15,1.46309,103.76537,2024-08-30 22:05:10,22,4,Friday,8,August


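TODO: Fix trips. The trip output at the end of this notebook contains a segment whose end_time (2024-08-01 03:09:12) precedes its start_time (2024-09-30 18:15:45), giving a negative duration (-87306.55) and joining two far-apart coordinates, so segments appear to be built across unrelated readings.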

In [None]:
# Process trips
# NOTE(review): this cell has no execution count (In [None]), so the trips_df
# shown in the following cell may not have been produced by this call —
# re-run from a fresh kernel to confirm.
trips_df = process_trips(prepared_df)

# Display the processed data
trips_df

In [5]:
# Display the trips frame.
# NOTE(review): this cell's execution count (5) is lower than those of the
# loading/preparation cells above (10-13), so it appears to have been run out
# of order and its output may be stale — re-run after the processing cell.
trips_df

Unnamed: 0,route_id,start_time,end_time,start_hour,end_hour,start_lat,start_lon,end_lat,end_lon,start_stop_id,end_stop_id,duration,distance,segment
2898,11,2024-09-30 18:15:45,2024-08-01 03:09:12,18,3,-7.94331,112.61043,1.66127,103.60342,0,1,-87306.55,1462.282323,"LINESTRING (112.61043 -7.94331, 103.60342 1.66..."
