In [None]:
import matplotlib.pyplot as plt
import pandas as pd
import scipy.stats as stats

In [None]:
# Load the master dataset; the 'holiday' column is read as text
file_path = 'MasterDataset_WithWeatherAndEvent.csv'
data = pd.read_csv(
    file_path,
    dtype={"holiday": str},
)

In [None]:
# Missing Values Analysis
def analyze_missing_values_v2(file_path, chunk_size=1000000):
    """Summarise missing values per column of a CSV file, reading it in chunks.

    Parameters
    ----------
    file_path : str
        Path of the CSV file to inspect.
    chunk_size : int, default 1000000
        Number of rows handed to pandas per chunk.

    Returns
    -------
    pandas.DataFrame
        One row per CSV column with the columns 'Missing Count' and
        'Percentage (%)', sorted by 'Missing Count' in descending order.
        The frame is empty when the reader yields no chunk.
    """
    total_missing_values = None
    total_rows = 0

    # The reader is closed deterministically when the block exits.
    with pd.read_csv(file_path, chunksize=chunk_size) as chunks:
        for chunk in chunks:
            # Count parsed data rows here; counting raw text lines would
            # include the header line and mis-count quoted multi-line fields.
            total_rows += len(chunk)
            missing_values = chunk.isnull().sum()
            if total_missing_values is None:
                total_missing_values = missing_values
            else:
                total_missing_values = total_missing_values + missing_values

    if total_missing_values is None:
        return pd.DataFrame(columns=['Missing Count', 'Percentage (%)'])

    # Calculate the percentage of missing values
    missing_values_summary = total_missing_values.to_frame('Missing Count')
    if total_rows:
        percentage = missing_values_summary['Missing Count'] / total_rows * 100
    else:
        # Header-only file: nothing can be missing, avoid 0/0.
        percentage = 0.0
    missing_values_summary['Percentage (%)'] = percentage

    return missing_values_summary.sort_values(by='Missing Count', ascending=False)

In [None]:
# Summarise the missing values per column of the master dataset
missing_values_info_v2 = analyze_missing_values_v2(
    "MasterDataset_WithWeatherAndEvent.csv"
)
missing_values_info_v2

In [None]:
# Define functions that handle missing values across the entire dataset
def process_missing_values_whole(file_path, output_path):
    """Fill missing values in a ride CSV and write the result to a new CSV.

    Missing 'holiday' entries become the string "Not a holiday". Missing
    entries in the four station columns are replaced by that column's most
    frequent value (its mode). No rows are dropped.

    Parameters
    ----------
    file_path : str
        Path of the CSV file to read.
    output_path : str
        Path the processed CSV is written to (without the index column).

    Returns
    -------
    None
    """
    # Read the entire data set
    data = pd.read_csv(file_path)

    # Assign the filled column back instead of calling fillna(inplace=True)
    # on a column selection, which may operate on a temporary copy.
    data['holiday'] = data['holiday'].fillna("Not a holiday")

    station_columns = (
        'End station number',
        'End station',
        'Start station number',
        'Start station',
    )
    for column in station_columns:
        modes = data[column].mode()
        if modes.empty:
            # Column has no observed value at all, so there is no mode to use.
            continue
        data[column] = data[column].fillna(modes.iloc[0])

    # Save the processed data to the specified output path
    data.to_csv(output_path, index=False)


In [None]:
# Fill the missing values of sample.csv and write the result to output_path_whole
output_path_whole = "processed_sample_whole2.csv"
process_missing_values_whole(
    "sample.csv",
    output_path_whole,
)

In [None]:
# Summarise the missing values per column of sample.csv
# NOTE(review): this inspects the unprocessed input file, not the file written
# to output_path_whole — confirm which of the two is meant to be checked here.
missing_values_info_v2 = analyze_missing_values_v2(
    "sample.csv"
)
missing_values_info_v2

In [None]:
# Bivariate Analysis
# Parse the start date, end date and date columns into datetime values
for date_column in ('Start date', 'End date', 'date'):
    data[date_column] = pd.to_datetime(data[date_column])

In [None]:
# Extract temporal features
# Hour, day of week and month are all taken from the ride's start timestamp
start_parts = data['Start date'].dt
data['hour'] = start_parts.hour
data['w_day'] = start_parts.dayofweek
data['month'] = start_parts.month

In [None]:
# Bivariate analysis
# Mean ride 'Duration' per hour of day, per day of week and per month
hourly_avg_duration = data.groupby(by='hour')['Duration'].agg('mean')
weekly_avg_duration = data.groupby(by='w_day')['Duration'].agg('mean')
monthly_avg_duration = data.groupby(by='month')['Duration'].agg('mean')

In [None]:
# Flag weekend rides (w_day of 5 or above -> 1, otherwise 0), then average the
# ride duration separately for weekend and weekday rides
data['isWeekend'] = (data['w_day'] >= 5).astype('int64')
weekend_weekday_avg_duration = data.groupby(by='isWeekend')['Duration'].agg('mean')

In [None]:
# Average ride duration for holiday vs. non-holiday rides
if 'isHoliday' not in data.columns:
    # No earlier cell creates 'isHoliday', so derive it when the CSV does not
    # already provide it.
    # NOTE(review): assumes 'holiday' is empty or "Not a holiday" on ordinary
    # days and holds a holiday name otherwise — confirm against the dataset.
    data['isHoliday'] = (
        data['holiday'].notna() & data['holiday'].ne("Not a holiday")
    ).astype('int64')
holiday_avg_duration = data.groupby('isHoliday')['Duration'].mean()

In [None]:
# Analysis of the relationship between weather conditions and riding time
def _pearson_with_duration(frame, column):
    """Return ``stats.pearsonr`` of `column` against 'Duration'.

    Rows where either value is missing are dropped first, because a single
    NaN in the inputs makes the coefficient itself NaN.
    """
    complete = frame[[column, 'Duration']].dropna()
    return stats.pearsonr(complete[column], complete['Duration'])


# Calculate the Pearson correlation coefficient between weather variables and riding time
correlation_temp = _pearson_with_duration(data, 'temperature_2m')
correlation_rain = _pearson_with_duration(data, 'precipitation')
correlation_humidity = _pearson_with_duration(data, 'relativehumidity_2m')
correlation_windspeed = _pearson_with_duration(data, 'windspeed_10m')

In [None]:
# Multivariate analysis
# Flag casual riders: 1 where 'Member type' equals 'Casual', 0 for everything else
data['isCasual'] = (data['Member type'] == 'Casual').astype('int64')

In [None]:
# Average riding time per group, restricted to rows that match one condition
def avg_duration_by_group(df, group_by, condition_col, condition):
    """Mean 'Duration' per `group_by` value among rows where `condition_col` equals `condition`.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing 'Duration', `group_by` and `condition_col`.
    group_by : str
        Column whose values define the groups.
    condition_col : str
        Column the rows are filtered on.
    condition : scalar
        Value `condition_col` must equal for a row to be kept.

    Returns
    -------
    pandas.Series
        Mean 'Duration' indexed by the values of `group_by`.
    """
    matches = df[condition_col] == condition
    selected_rows = df[matches]
    per_group = selected_rows.groupby(group_by)
    return per_group['Duration'].mean()

# Average ride time of casual riders vs. members, one filter at a time
# NOTE(review): every filter keeps only rows whose value is exactly equal to
# the given value (e.g. the column median) — confirm this is intended for the
# continuous weather columns.
casual_member_comparison = {
    label: avg_duration_by_group(data, 'isCasual', column, wanted)
    for label, column, wanted in [
        ('precipitation', 'precipitation', 1),
        ('temperature', 'temperature_2m', data['temperature_2m'].median()),
        ('humidity', 'relativehumidity_2m', data['relativehumidity_2m'].median()),
        ('windspeed', 'windspeed_10m', data['windspeed_10m'].median()),
        ('weekdays_weekends', 'isWeekend', 1),
        ('holidays', 'isHoliday', 1),
    ]
}

# Average duration per temperature value, separately for casual riders and members
temperature_comparison = {
    rider: avg_duration_by_group(data, 'temperature_2m', 'isCasual', flag)
    for rider, flag in (('Casual', 1), ('Member', 0))
}

print("\nComparison of Duration Between Casual Riders and Members with Temperature:\n", temperature_comparison)

In [None]:
# Plotting the comparison: one line per rider type, temperature on the x-axis
fig, ax = plt.subplots(figsize=(12, 6))
for rider_type in ('Casual', 'Member'):
    rider_series = temperature_comparison[rider_type]
    # The series index holds the temperature values, the values the mean duration.
    ax.plot(rider_series.index, rider_series.values, label=rider_type, marker='o')

ax.set_title('Comparison of Duration Between Casual Riders and Members with Temperature')
ax.set_xlabel('Temperature')
ax.set_ylabel('Average Duration')
ax.legend()
ax.grid(True)
plt.show()

In [None]:
def multivariate_analysis(data, group_by, conditions):
    """Mean 'Duration' for every combination of `group_by` and each condition column.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing 'Duration', `group_by` and every column in `conditions`.
    group_by : str
        Column whose values become the rows of each result table.
    conditions : iterable of str
        Columns to cross with `group_by`, one result table per column.

    Returns
    -------
    dict
        Maps each condition name to a DataFrame with one row per `group_by`
        value and one column per value of that condition; cells hold the mean
        'Duration' (NaN where a combination does not occur).
    """
    results = {}
    for condition in conditions:
        # Select 'Duration' before aggregating: averaging every column first
        # fails on non-numeric columns and does needless work on the rest.
        grouped_data = data.groupby([group_by, condition])['Duration'].mean()
        results[condition] = grouped_data.unstack()
    return results

# Columns to cross with the member type, one result table per column
conditions = [
    'precipitation',
    'temperature_2m',
    'relativehumidity_2m',
    'windspeed_10m',
    'w_day',
    'isHoliday',
]

multivariate_results = multivariate_analysis(
    data,
    'Member type',
    conditions,
)

In [None]:
def plot_multivariate_analysis(grouped_data, condition, title):
    """Draw one line per column of `grouped_data` against its index.

    Parameters
    ----------
    grouped_data : pandas.DataFrame
        Table of average durations. The index supplies the x values and each
        column becomes one line; the legend is titled 'Member Type'.
    condition : str
        Label for the x-axis.
    title : str
        Figure title.
    """
    fig, ax = plt.subplots(figsize=(15, 6))
    # pandas' own plotting is used so the function needs nothing beyond the
    # matplotlib import this notebook already has.
    grouped_data.plot(ax=ax, marker='o')
    ax.set_title(title)
    ax.set_xlabel(condition)
    ax.set_ylabel('Average Duration (seconds)')
    ax.tick_params(axis='x', labelrotation=45)
    ax.legend(title='Member Type')
    ax.grid(True)
    plt.show()


# multivariate_analysis returns one row per member type and one column per
# precipitation value; transpose so precipitation runs along the x-axis and
# each member type gets its own line, matching the axis label and legend title.
plot_multivariate_analysis(multivariate_results['precipitation'].T, 'Precipitation (mm)', 'Average Duration vs Precipitation for Casual Riders and Members')

# Gather the weather/duration Pearson results computed earlier into one
# mapping so they can be reported together.
correlations = {
    'temperature_2m': correlation_temp,
    'precipitation': correlation_rain,
    'relativehumidity_2m': correlation_humidity,
    'windspeed_10m': correlation_windspeed,
}
print(correlations)