In [1]:
# Import necessary libraries
import pandas as pd
import math
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix, precision_score, recall_score

# load and combine data from CSV files
def load_and_combine_data(files):
    """Read several CSV files and stack their rows into one DataFrame.

    Parameters
    ----------
    files : iterable of str or path-like
        Locations of the CSV files; each is read with ``pd.read_csv``.

    Returns
    -------
    pandas.DataFrame
        The rows of every file, in the order the files were given, with a
        fresh 0..n-1 index.

    Raises
    ------
    ValueError
        If ``files`` is empty.
    FileNotFoundError
        Propagated from ``pd.read_csv`` when a path does not exist.
    """
    frames = [pd.read_csv(path) for path in files]
    if not frames:
        raise ValueError("load_and_combine_data needs at least one file path")
    # ignore_index=True: every file is numbered from 0 on its own, so keeping
    # the per-file index would leave duplicate row labels in the combined frame.
    return pd.concat(frames, ignore_index=True)


# calculate haversine (great-circle) distance in kilometers
def haversine(row):
    """Great-circle distance in kilometers between a trip's start and end point.

    Parameters
    ----------
    row : mapping (e.g. a DataFrame row)
        Must provide "start_lat", "start_lng", "end_lat" and "end_lng",
        all in decimal degrees.

    Returns
    -------
    float
        Distance in kilometers. NaN if any of the four coordinates is NaN.
    """
    r = 6371  # Radius of earth in kilometers. Use 3956 for miles. Determines return value units.

    # convert decimal degrees to radians
    lon1, lat1, lon2, lat2 = map(
        math.radians,
        [row["start_lng"], row["start_lat"], row["end_lng"], row["end_lat"]],
    )

    # haversine formula: both sine terms are SQUARED (**2), not doubled
    dlon = lon2 - lon1
    dlat = lat2 - lat1
    a = math.sin(dlat / 2) ** 2 + math.cos(lat1) * math.cos(lat2) * math.sin(dlon / 2) ** 2

    # Rounding can push `a` a hair above 1 for near-antipodal points, which
    # would take asin out of its domain. A plain comparison (rather than
    # min/max) is used so that a NaN input stays NaN instead of being clamped.
    if a > 1.0:
        a = 1.0

    c = 2 * math.asin(math.sqrt(a))
    return c * r

# calculate trip duration in minutes
def trip_duration(row):
    """Elapsed time of a trip in minutes.

    Parameters
    ----------
    row : mapping (e.g. a DataFrame row)
        Must provide "started_at" and "ended_at" as strings in the format
        "%Y-%m-%d %H:%M:%S".

    Returns
    -------
    float
        Absolute difference between the two timestamps in minutes
        (fractional when the timestamps differ by a number of seconds that
        is not a multiple of 60).

    Raises
    ------
    ValueError
        From ``datetime.strptime`` when a timestamp does not match the format.
    """
    timestamp_format = "%Y-%m-%d %H:%M:%S"
    started = datetime.strptime(row["started_at"], timestamp_format)
    ended = datetime.strptime(row["ended_at"], timestamp_format)

    # Subtract the complete timestamps. Comparing only the hour fields (and
    # falling back to the minute fields when the hours match) rounds every
    # trip that crosses an hour boundary to whole hours and breaks on trips
    # that cross midnight.
    elapsed = ended - started

    # abs() keeps the result non-negative even if the two stamps are swapped.
    return abs(elapsed.total_seconds()) / 60

# preprocess data
def preprocess_data(df):
    """Drop incomplete rows, one-hot encode the categoricals, standardize the numerics.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns "member_casual", "start_station_name",
        "end_station_name", "Haversine" and "Minutes".

    Returns
    -------
    pandas.DataFrame
        A new frame in which
        * every row that had a missing value in any column is removed,
        * the three categorical columns are replaced by indicator columns
          (pandas names them "<column>_<category>"; the first category of
          each column is dropped),
        * "Haversine" and "Minutes" are rescaled by a StandardScaler that is
          fitted on all remaining rows.
    """
    categorical_columns = ["member_casual", "start_station_name", "end_station_name"]
    numeric_columns = ["Haversine", "Minutes"]

    # Only fully populated rows survive.
    complete_rows = df.dropna()

    # Indicator columns replace the categorical ones.
    encoded = pd.get_dummies(complete_rows, columns=categorical_columns, drop_first=True)

    # Standardize the two engineered numeric features.
    standardizer = preprocessing.StandardScaler()
    encoded[numeric_columns] = standardizer.fit_transform(encoded[numeric_columns])

    return encoded

# train and evaluate model
def train_and_evaluate(X, y):
    """Fit a logistic regression on a train/test split and print test-set metrics.

    Parameters
    ----------
    X : feature matrix accepted by scikit-learn (e.g. a DataFrame)
    y : target labels aligned with ``X``

    Prints accuracy, precision and recall for the held-out rows, then builds
    the confusion matrix. Nothing is returned.
    """
    # Hold out 20% of the rows; the fixed seed makes the split repeatable.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, random_state=0, test_size=0.2
    )

    # Fit with scikit-learn's default settings, then score the held-out rows.
    logreg = LogisticRegression().fit(X_train, y_train)
    y_pred = logreg.predict(X_test)

    # Each metric is computed and printed in turn.
    for label, metric in (
        ("Accuracy:", accuracy_score),
        ("Precision:", precision_score),
        ("Recall:", recall_score),
    ):
        print(label, metric(y_test, y_pred))

    # Confusion matrix of true vs. predicted labels (input for the plot).
    cnf_matrix = confusion_matrix(y_test, y_pred)
    # ... (same as original code)



In [2]:
# Load data: three trip-data CSV exports (file names suggest 2021-07 .. 2021-09),
# looked up relative to the notebook's working directory.
files = [
    'JC-202107-citibike-tripdata.csv',
    'JC-202108-citibike-tripdata.csv',
    'JC-202109-citibike-tripdata.csv',
]
df = load_and_combine_data(files)

FileNotFoundError: [Errno 2] No such file or directory: 'JC-202107-citibike-tripdata.csv'

In [None]:
# Load data
files = ['JC-202107-citibike-tripdata.csv', 'JC-202108-citibike-tripdata.csv', 'JC-202109-citibike-tripdata.csv']
df = load_and_combine_data(files)

# Calculate haversine distance and trip duration
df["Haversine"] = df.apply(haversine, axis=1)
df["Minutes"] = df.apply(trip_duration, axis=1)

# Exploratory data analysis
# The plots are drawn BEFORE preprocess_data(): that step one-hot encodes
# "member_casual" (removing the column the box plots group by) and
# standardizes "Minutes" / "Haversine" (so the axes would no longer be in
# minutes / km). dropna() mirrors the first step of preprocess_data() so the
# plots describe the same rows the model is trained on.
df_eda = df.dropna()

# Distribution of trip duration
sns.histplot(data=df_eda, x='Minutes', bins=50)
plt.title('Distribution of Trip Duration')
plt.xlabel('Minutes')
plt.ylabel('Count')
plt.show()

# Distribution of Haversine distance
sns.histplot(data=df_eda, x='Haversine', bins=50)
plt.title('Distribution of Haversine Distance')
plt.xlabel('Haversine Distance (km)')
plt.ylabel('Count')
plt.show()

# Box plot of trip duration by user type
sns.boxplot(data=df_eda, x='member_casual', y='Minutes')
plt.title('Trip Duration by User Type')
plt.xlabel('User Type')
plt.ylabel('Minutes')
plt.show()

# Box plot of Haversine distance by user type
sns.boxplot(data=df_eda, x='member_casual', y='Haversine')
plt.title('Haversine Distance by User Type')
plt.xlabel('User Type')
plt.ylabel('Haversine Distance (km)')
plt.show()

# Preprocess data
df = preprocess_data(df)

# Select features and target
# get_dummies names the encoded target "member_casual_<category>", so the
# column is looked up by prefix instead of assuming the category is "1".
target_columns = [col for col in df.columns if col.startswith("member_casual_")]
if len(target_columns) != 1:
    raise ValueError(
        f"Expected exactly one encoded 'member_casual_*' column, found: {target_columns}"
    )
target_column = target_columns[0]
y = df[target_column].astype(int)

# Keep only numeric / boolean predictors: the timestamp strings parsed by
# trip_duration (and any other text columns still in the frame) cannot be fed
# to LogisticRegression.
X = df.drop(columns=[target_column]).select_dtypes(include=["number", "bool"])

# Train and evaluate model
train_and_evaluate(X, y)