# Import the necessary libraries

In [None]:
import pandas as pd
import os
import glob

# Dealing with files

In [None]:
# Directory that holds the raw CSV files to be combined.
# The bare name on the last line echoes the value as the cell's output.
folder_path = './data/csv/'
folder_path

In [None]:
# Collect the path of every CSV file in the folder, sorted so the files
# are always processed in the same name-based order
csv_pattern = os.path.join(folder_path, '*.csv')
csv_files = glob.glob(csv_pattern)
csv_files.sort()
csv_files

In [None]:
# Load every CSV file into its own DataFrame.
# Fail early with a clear message when the folder has no CSV files; otherwise
# the later pd.concat call fails with the less helpful
# "No objects to concatenate".
if not csv_files:
    raise FileNotFoundError(f'No CSV files found in {folder_path!r}')

# A comprehension builds the list in one step and avoids binding the name
# `df` to the last per-file frame before the combined frame is created.
df_list = [pd.read_csv(file) for file in csv_files]

In [None]:
# Stack the per-file DataFrames row-wise into one frame; ignore_index=True
# replaces the per-file row labels with a single fresh 0..n-1 index
df = pd.concat(df_list, axis=0, ignore_index=True)

# Drop duplicate rows

In [None]:
# How many rows repeat a ride_id that already appeared earlier in the frame?
df['ride_id'].duplicated().sum()

In [None]:
# Remove rows with a repeated ride_id in place; the default keep='first'
# retains the first row seen for each ID
df.drop_duplicates(subset=['ride_id'], inplace=True)

In [None]:
# Re-run the duplicate count to confirm the clean-up worked (should now be 0)
df['ride_id'].duplicated().sum()

# Converting columns to datetime

In [None]:
# Parse 'started_at' into datetimes. format='mixed' infers the format of each
# value individually, so the column may contain more than one timestamp layout.
# Truncate to whole seconds, as is done for 'ended_at', so both ends of a ride
# share the same precision and the ride length is not skewed by flooring only
# one side of the subtraction.
df['new_started_at'] = pd.to_datetime(df['started_at'], format='mixed').dt.floor('s')

In [None]:
# Count rows whose parsed start time is missing (NaT)
df['new_started_at'].isna().sum()

In [None]:
# Parse 'ended_at' into datetimes (format inferred per value), then
# truncate each timestamp down to the whole second
ended_at_parsed = pd.to_datetime(df['ended_at'], format='mixed')
df['new_ended_at'] = ended_at_parsed.dt.floor(freq='s')

In [None]:
# Count rows whose parsed end time is missing (NaT)
df['new_ended_at'].isna().sum()

# Calculate the ride length

In [None]:
# Ride duration as a timedelta: end time minus start time
df['ride_length'] = df['new_ended_at'].sub(df['new_started_at'])
# Preview the first five rows to check the new column
df.head(n=5)

In [None]:
# Express each timedelta duration as a plain number of seconds
ride_durations = df['ride_length']
df['ride_length_sec'] = ride_durations.dt.total_seconds()
df

In [None]:
# Number of rides that lasted under one minute (these are filtered out next)
df['ride_length_sec'].lt(60).sum()

In [None]:
# Keep only rides lasting at least 60 seconds. Rows with a missing ride length
# are removed too, because the comparison evaluates to False for NaN.
# .copy() makes the result an independent DataFrame, so the column assignments
# in later cells do not trigger pandas' SettingWithCopyWarning.
df = df[df['ride_length_sec'] >= 60].copy()
df

# Create a new column in the dataframe called 'day_of_week'

In [None]:
# Name of the weekday (e.g. 'Monday') on which each ride started
ride_starts = df['new_started_at']
df['day_of_week'] = ride_starts.dt.day_name()


# Replacing columns

In [None]:
# Overwrite the original text timestamp columns with their parsed datetime versions
for target, source in (('started_at', 'new_started_at'),
                       ('ended_at', 'new_ended_at')):
    df[target] = df[source]

# Dropping unused columns

In [None]:
# Remove the intermediate helper columns in a single in-place call; their
# contents now live in 'started_at', 'ended_at' and 'ride_length_sec'
df.drop(columns=['new_started_at', 'new_ended_at', 'ride_length'], inplace=True)

# Saving to the file

In [None]:
# Write the cleaned, combined data to one CSV file; index=False leaves the
# DataFrame's row index out of the output
output_path = './data/full_data.csv'
df.to_csv(output_path, index=False)