In [2]:
import pandas as pd
import os
import git
import pyarrow.parquet as pq


def get_git_root():
    """Return the top-level directory of the enclosing Git repository.

    The repository is located by searching the parent directories, and the
    path is the output of ``git rev-parse --show-toplevel``.

    Returns:
        The repository root as reported by Git, or ``None`` (after printing
        a message) when no Git repository is found.
    """
    try:
        # Open the repository, then ask Git itself for the working tree root.
        toplevel = git.Repo(search_parent_directories=True).git.rev_parse(
            "--show-toplevel"
        )
    except git.InvalidGitRepositoryError:
        # No repository was found in the searched directories.
        print("Not a Git repository or an error occurred.")
        return None
    return toplevel

# Get the root folder of the Git repository
root_folder = get_git_root()
if root_folder is None:
    # get_git_root() returns None when no repository is found; stop here with
    # a clear message instead of letting os.path.join fail on None below.
    raise RuntimeError(
        "Could not determine the Git repository root; "
        "run this notebook from inside the project repository."
    )

# Raw input files
dataset_dir = os.path.join(root_folder, "data", "raw")
dataset_file = "yellow_tripdata_2023-01.parquet"
taxi_zone_file = "taxi+_zone_lookup.csv"
dataset_file_path = os.path.join(dataset_dir, dataset_file)
taxi_zone_file_path = os.path.join(dataset_dir, taxi_zone_file)

# Cleaned output files (written by the cleaning step)
dataset_processed_dir = os.path.join(root_folder, "data", "processed")
dataset_processed_file = "processed.csv"
dataset_processed_zone_file = "zone.csv"
dataset_processed_file_path = os.path.join(dataset_processed_dir, dataset_processed_file)
dataset_processed_zone_file_path = os.path.join(dataset_processed_dir, dataset_processed_zone_file)

# Load the trip records (parquet) and the taxi zone lookup table (CSV)
trips = pq.read_table(dataset_file_path)
zone = pd.read_csv(taxi_zone_file_path)
df = trips.to_pandas()
df.head()




Unnamed: 0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID,store_and_fwd_flag,PULocationID,DOLocationID,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,congestion_surcharge,airport_fee
0,2,2023-01-01 00:32:10,2023-01-01 00:40:36,1.0,0.97,1.0,N,161,141,2,9.3,1.0,0.5,0.0,0.0,1.0,14.3,2.5,0.0
1,2,2023-01-01 00:55:08,2023-01-01 01:01:27,1.0,1.1,1.0,N,43,237,1,7.9,1.0,0.5,4.0,0.0,1.0,16.9,2.5,0.0
2,2,2023-01-01 00:25:04,2023-01-01 00:37:49,1.0,2.51,1.0,N,48,238,1,14.9,1.0,0.5,15.0,0.0,1.0,34.9,2.5,0.0
3,1,2023-01-01 00:03:48,2023-01-01 00:13:25,0.0,1.9,1.0,N,138,7,1,12.1,7.25,0.5,0.0,0.0,1.0,20.85,0.0,1.25
4,2,2023-01-01 00:10:29,2023-01-01 00:21:19,1.0,1.43,1.0,N,107,79,1,11.4,1.0,0.5,3.28,0.0,1.0,19.68,2.5,0.0


In [3]:
#Data Preparation
'''
1 Data Inspection: 
    We'll start by inspecting the dataset to see if there are any missing values, duplicates, or inconsistent data. We'll also check if the data types are correct and make sure the dataset is ready for analysis.

2 Data Cleaning: 
    Next, we'll clean the dataset by removing or correcting any errors, inconsistencies, or irrelevant information. This will make the dataset more reliable and accurate.

3 Data Transformation: 
    (Note: this step can also be done with dbt.)
    After cleaning the dataset, we may need to transform the data to make it more useful for analysis. This can include scaling, normalization, or feature engineering.

4 Data Saving: 
    Once we've prepared the data, we'll save it in a new file to avoid overwriting the original dataset. This way, we can always go back to the original dataset if we need to.

'''

# Check data types
def check_data_types(dataframe):
    """Return the dtype of every column of ``dataframe`` (its ``dtypes`` Series)."""
    column_types = dataframe.dtypes
    return column_types

# Check for duplicates
def check_duplicates(dataframe):
    """Return how many rows of ``dataframe`` are flagged by ``duplicated()``."""
    duplicate_mask = dataframe.duplicated()
    return duplicate_mask.sum()

# Check for missing values
def check_missing_values(dataframe):
    """Return a Series with the number of missing values in each column."""
    missing_mask = dataframe.isna()
    return missing_mask.sum()

#check for missing values and return column with missing value
def get_column_with_missing_value(df):
    """Return the labels of the columns of ``df`` that contain missing values.

    The result is a pandas Index; it is empty when no column has a missing
    value.
    """
    null_counts = check_missing_values(df)
    has_missing = null_counts > 0
    return null_counts.loc[has_missing].index


In [5]:
# #1,2-data inspection and cleaning
# Uses `df` (trip records), `zone` (taxi zone lookup), the check_* helpers and
# the processed-file paths defined in the earlier cells.

print(df.shape)

# Count duplicated rows once and reuse the result instead of scanning twice.
duplicate_count = check_duplicates(df)
print(duplicate_count)
if duplicate_count:
    df = df.drop_duplicates().reset_index(drop=True)
    # NOTE(review): trip_id is only created when duplicates were found, so the
    # saved schema depends on the input data - confirm whether it should
    # always be added.
    df['trip_id'] = df.index
    print(df.shape)

# Report missing values per column, then drop every row that has a missing
# value in any of the affected columns.
print(check_missing_values(df))
columns = get_column_with_missing_value(df)
df = df.dropna(subset=columns)
print(df.shape)

# Convert passenger_count and RatecodeID to int. This has to run after the
# rows with missing values are dropped, because NaN cannot be cast to int.
df = df.astype({'passenger_count': int, 'RatecodeID': int})

print(check_data_types(df))

# Some zones have an "Unknown" borough; trips that reference them would give
# bad results in the analysis. Remove those zones from the zone file and drop
# every trip that starts or ends in one of them.
unknown_zone_mask = zone['Borough'] == "Unknown"
location_to_remove = zone.loc[unknown_zone_mask, 'LocationID'].tolist()
new_zone = zone[~unknown_zone_mask]

# Make sure the output directory exists before writing the processed files.
os.makedirs(dataset_processed_dir, exist_ok=True)
new_zone.to_csv(dataset_processed_zone_file_path, index=False)

# Build the pickup/drop-off mask once; show the affected trips, then drop them.
touches_unknown_zone = (
    df['PULocationID'].isin(location_to_remove)
    | df['DOLocationID'].isin(location_to_remove)
)
print(df[touches_unknown_zone])
df = df[~touches_unknown_zone]
print(df.shape)

# Show trips whose RatecodeID is outside 1-6 (e.g. 99). These rows are only
# displayed for inspection here; they are NOT removed from the saved data.
print(df[~df['RatecodeID'].isin([1, 2, 3, 4, 5, 6])])

# Save the cleaned data to the processed data directory.
df.to_csv(dataset_processed_file_path, index=False)
print(df.columns)
print(df.shape)


(2937423, 19)
0
VendorID                 0
tpep_pickup_datetime     0
tpep_dropoff_datetime    0
passenger_count          0
trip_distance            0
RatecodeID               0
store_and_fwd_flag       0
PULocationID             0
DOLocationID             0
payment_type             0
fare_amount              0
extra                    0
mta_tax                  0
tip_amount               0
tolls_amount             0
improvement_surcharge    0
total_amount             0
congestion_surcharge     0
airport_fee              0
dtype: int64
(2937423, 19)
VendorID                          int64
tpep_pickup_datetime     datetime64[us]
tpep_dropoff_datetime    datetime64[us]
passenger_count                   int64
trip_distance                   float64
RatecodeID                        int64
store_and_fwd_flag               object
PULocationID                      int64
DOLocationID                      int64
payment_type                      int64
fare_amount                     float64
extr

In [None]:
#data transformation 
# data modeling and storing

