# Preprocessing

## Libraries

In [2]:
import pandas as pd

## Data preprocessing

In [8]:

def parse_date(date_str):
    """Parse a timestamp string using one of two supported layouts.

    The layouts are tried in order: ``month/day/year hour:minute`` first,
    then ``year-month-day hour:minute:second``.

    Parameters
    ----------
    date_str
        The value to parse; passed unchanged to ``pd.to_datetime``.

    Returns
    -------
    The result of ``pd.to_datetime`` for the first layout that matches, or
    ``pd.NaT`` when neither layout matches (a message is printed in that
    case).
    """
    for layout in ("%m/%d/%Y %H:%M", "%Y-%m-%d %H:%M:%S"):
        try:
            return pd.to_datetime(date_str, format=layout)
        except ValueError:
            # This layout did not match; move on to the next candidate
            continue

    # Neither layout matched: report the offending value and signal "missing"
    print(f"Could not parse date: {date_str}")
    return pd.NaT


# Read the raw export; every later step works on this frame
df = pd.read_csv('ColoradoData.csv')

# Header names may carry stray padding, so trim them before any column lookup
df.columns = df.columns.str.strip()

# Build the typed columns, discard the raw ones, and index by session start.
# Keyword order below fixes the order in which the new columns are appended.
df = (
    df.assign(
        # Timestamps: trim padding, then parse value by value with parse_date
        Start_DateTime=df['Start_Date___Time'].str.strip().apply(parse_date),
        End_DateTime=df['End_Date___Time'].str.strip().apply(parse_date),
        # Duration strings -> timedelta
        Total_Duration=pd.to_timedelta(df['Total_Duration__hh_mm_ss_']),
        Charging_Time=pd.to_timedelta(df['Charging_Time__hh_mm_ss_']),
        # errors='coerce' turns non-numeric energy readings into NaN
        Energy_Consumption=pd.to_numeric(df['Energy__kWh_'], errors='coerce'),
    )
    .drop(columns=[
        # raw columns superseded by the typed ones above
        'Start_Date___Time', 'End_Date___Time',
        'Total_Duration__hh_mm_ss_', 'Charging_Time__hh_mm_ss_', 'Energy__kWh_',
        # columns not used further in this notebook
        'ObjectID', 'ObjectId2', 'Start_Time_Zone', 'End_Time_Zone', 'Port_Type',
    ])
    .set_index('Start_DateTime')
)

## More data cleaning
# Remove rows with missing values.
# dropna() only inspects column values, never index labels, so sessions whose
# start time could not be parsed (NaT in the index) are filtered out
# explicitly before the column-wise dropna().
df = df[df.index.notna()].dropna()

# Keep only physically plausible sessions. All conditions are applied as one
# mask; comparisons against the (already removed) missing values are moot.
df = df[
    (df['Energy_Consumption'] > 0)                     # some energy was delivered
    & (df['Charging_Time'] > pd.Timedelta(0))          # charging actually happened
    & (df['Total_Duration'] > pd.Timedelta(0))         # session has a positive length
    & (df['Total_Duration'] >= df['Charging_Time'])    # cannot charge longer than the session
]

# Preview the first rows of the cleaned dataset
print("Optimized dataset:")
print(df.head())



Optimized dataset:
                                        Station_Name             Address  \
Start_DateTime                                                             
2018-01-01 17:49:00  BOULDER / JUNCTION ST1           2280 Junction Pl     
2018-01-02 08:52:00  BOULDER / JUNCTION ST1           2280 Junction Pl     
2018-01-02 21:11:00  BOULDER / JUNCTION ST1           2280 Junction Pl     
2018-01-03 09:19:00  BOULDER / ALPINE ST1             1275 Alpine Ave      
2018-01-03 14:13:00  BOULDER / BASELINE ST1           900 Baseline Rd      

                         City   State_Province  Zip_Postal_Code  \
Start_DateTime                                                    
2018-01-01 17:49:00  Boulder   Colorado                   80301   
2018-01-02 08:52:00  Boulder   Colorado                   80301   
2018-01-02 21:11:00  Boulder   Colorado                   80301   
2018-01-03 09:19:00  Boulder   Colorado                   80304   
2018-01-03 14:13:00  Boulder   Colorado       

## Features 

In [None]:
## 