In [1]:
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
import joblib

# Load the dataset
data = pd.read_csv('uber_data.csv')

# Convert 'payment_type' column to numeric, handling errors by coercing to NaN
data['payment_type'] = pd.to_numeric(data['payment_type'], errors='coerce')

# Drop rows with missing values
data.dropna(inplace=True)

# Define features and target variable
X = data.drop(columns=['total_amount'])  # Features
y = data['total_amount']  # Target variable

# Define numerical and categorical features
numeric_features = ['passenger_count', 'trip_distance']
categorical_features = ['pickup_latitude', 'pickup_longitude', 'dropoff_latitude', 'dropoff_longitude']

# Define preprocessing steps for numerical and categorical features
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Combine preprocessing steps for numerical and categorical features
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ])

# Define the model
model = Pipeline(steps=[('preprocessor', preprocessor),
                        ('feature_selection', SelectKBest(score_func=f_regression, k=5)),
                        ('regressor', RandomForestRegressor())])

# Train the model
model.fit(X, y)

# Save the trained model to a file
joblib.dump(model, 'model.pkl')

# Take user input for prediction
passenger_count = int(input("Enter the number of passengers: "))
trip_distance = float(input("Enter the trip distance: "))
pickup_latitude = float(input("Enter the pickup latitude: "))
pickup_longitude = float(input("Enter the pickup longitude: "))
dropoff_latitude = float(input("Enter the dropoff latitude: "))
dropoff_longitude = float(input("Enter the dropoff longitude: "))

# Make predictions on new data
new_data = pd.DataFrame({
    'passenger_count': [passenger_count],
    'trip_distance': [trip_distance],
    'pickup_latitude': [pickup_latitude],
    'pickup_longitude': [pickup_longitude],
    'dropoff_latitude': [dropoff_latitude],
    'dropoff_longitude': [dropoff_longitude]
})

# Predict trip fare
predicted_fare = model.predict(new_data)
print("Predicted Trip Fare:", (predicted_fare)*80)


Predicted Trip Fare: [587.58380151]
