In [None]:
import pandas as pd

In [None]:
# Load the raw Uber fares CSV from the Kaggle input directory
dataframe = pd.read_csv(filepath_or_buffer="/kaggle/input/uber-fares-dataset/uber.csv")

In [None]:
# Work on an independent copy so the cleaning steps below never modify the raw `dataframe`.
# A plain `df = dataframe` only creates a second name for the same object, so in-place
# operations on `df` would silently change `dataframe` as well.
df = dataframe.copy()

In [None]:
# Preview the first five rows
df.head(n=5)

In [None]:
# Print column dtypes and non-null counts to see which columns need cleaning
df.info()

In [None]:
# Count the missing values in each column
df.isnull().sum()

In [None]:
# Only 1 row has missing values, so discarding every incomplete row loses almost nothing
df.dropna(axis=0, how="any", inplace=True)

In [None]:
# Re-count missing values per column to confirm none remain after the drop
df.isnull().sum()

In [None]:
# Discard the "Unnamed: 0" and "key" columns before further processing
df.drop(columns=["Unnamed: 0", "key"], inplace=True)

In [None]:
# Summary statistics of the numeric columns (count, mean, std, min, quartiles, max)
df.describe()

In [None]:
# Parse pickup_datetime into a real datetime dtype.
# errors="coerce" turns any value that cannot be parsed into NaT instead of raising.
parsed_pickup = pd.to_datetime(df["pickup_datetime"], errors="coerce")
df["pickup_datetime"] = parsed_pickup

In [None]:
# Re-check dtypes to confirm pickup_datetime was converted
df.info()

In [None]:
# Box plot of fare_amount to make extreme values visible
import matplotlib.pyplot as plt
fig, ax = plt.subplots()
ax.boxplot(df["fare_amount"])

In [None]:
# Function to find the Outliers
def find_outliers(df, whisker=1.5):
    """Return the entries of ``df`` that fall outside the IQR fences.

    The fences are ``Q1 - whisker * IQR`` and ``Q3 + whisker * IQR``; values
    strictly below the lower fence or strictly above the upper fence are
    reported as outliers. Values lying exactly on a fence are not.

    Parameters
    ----------
    df : pandas.Series
        Numeric values to inspect. (Despite the parameter name, this notebook
        passes a single column, i.e. a Series.)
    whisker : float, default 1.5
        Multiplier applied to the interquartile range when placing the fences.
        The default reproduces the conventional 1.5 * IQR rule.

    Returns
    -------
    pandas.Series
        The outlying values, keeping their original index labels.
    """
    q1 = df.quantile(0.25)  # 1st quartile
    q3 = df.quantile(0.75)  # 3rd quartile
    iqr = q3 - q1  # interquartile range
    lower_fence = q1 - whisker * iqr
    upper_fence = q3 + whisker * iqr
    return df[(df < lower_fence) | (df > upper_fence)]

In [None]:
# Report how many fare_amount outliers exist and the extremes among them
fare_amount_outliers = find_outliers(df["fare_amount"])
print(f"Number of Outlier : {len(fare_amount_outliers)}")
print(f"Max Outlier : {fare_amount_outliers.max()!s}")
print(f"Min Outlier : {fare_amount_outliers.min()!s}")

In [None]:
# Drop the Outliers
q_low = df["fare_amount"].quantile(0.25)
q_hi  = df["fare_amount"].quantile(0.75)
iqr = q_hi - q_low
lower_fence = q_low - 1.5 * iqr
upper_fence = q_hi + 1.5 * iqr
# Keep rows whose fare lies within the fences, *including* the fences themselves.
# find_outliers() only reports values strictly beyond a fence, so an inclusive
# filter removes exactly the rows it counted.
# .copy() yields an independent frame, so later modifications of `df` do not
# raise SettingWithCopyWarning.
df = df[df["fare_amount"].between(lower_fence, upper_fence)].copy()

In [None]:
# Drop Rows having fare_amount as negative
# Filter with a boolean mask instead of an in-place, label-based drop: it avoids
# building a throwaway frame just to read its index, and avoids mutating a frame
# that was itself produced by boolean indexing (a SettingWithCopyWarning source).
# `~(... < 0)` keeps exactly the rows the original drop kept.
df = df[~(df["fare_amount"] < 0)].copy()

In [None]:
# Co-relation Heatmap
import seaborn as sns
# Correlate only the numeric columns. `df` still holds the datetime column
# pickup_datetime, and on pandas >= 2.0 DataFrame.corr() no longer drops
# non-numeric columns silently; it fails instead. Selecting numeric dtypes
# explicitly behaves the same on old and new pandas versions.
sns.heatmap(df.select_dtypes(include="number").corr(), annot = True)

In [None]:
y = df["fare_amount"]  # Target Variable
x = df.drop(columns=["fare_amount"])  # Independent Set: every remaining column

In [None]:
# Must run before fitting: convert the pickup timestamps to plain numbers
pickup_as_datetime = pd.to_datetime(x["pickup_datetime"])
x["pickup_datetime"] = pd.to_numeric(pickup_as_datetime)

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Hold out 20% of the rows for testing; the remaining 80% are used for training.
# The split is random, so random_state is pinned to make it reproducible on every run.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, random_state = 1, test_size = 0.2
)

In [None]:
# Linear Regression
from sklearn.linear_model import LinearRegression

In [None]:
# Build the linear regression estimator, then train it on the training split
lrmodel = LinearRegression()
lrmodel.fit(X=x_train, y=y_train)

In [None]:
# Predict fares for the held-out rows and display the predictions
predict = lrmodel.predict(X=x_test)
predict

In [None]:
# Error metrics of the linear regression predictions on the test split
from sklearn import metrics
import numpy as np
lr_mae = metrics.mean_absolute_error(y_test, predict)
lr_mse = metrics.mean_squared_error(y_test, predict)  # computed once, reused for the RMSE
print(f"Mean absolute error {lr_mae}")
print(f"Mean squared error {lr_mse}")
print(f"Root mean squared error {np.sqrt(lr_mse)}")
# Optional extra metric: metrics.r2_score(y_test, predict)

In [None]:
# Random Forest Regression
from sklearn.ensemble import RandomForestRegressor

In [None]:
# Random forest with 100 decision trees (n_estimators); random_state fixes the seed
rfmodel = RandomForestRegressor(random_state=101, n_estimators=100)

In [None]:
# Train the random forest on the training split
rfmodel.fit(X=x_train, y=y_train)

In [None]:
# Predict fares for the held-out rows with the random forest and display them
rfpredict = rfmodel.predict(X=x_test)
rfpredict

In [None]:
# Analyze the Metric Scores of the random forest predictions on the test split
from sklearn import metrics
import numpy as np
rf_mae = metrics.mean_absolute_error(y_test, rfpredict)
rf_mse = metrics.mean_squared_error(y_test, rfpredict)  # computed once, reused for the RMSE
print(f"Mean absolute error {rf_mae}")
print(f"Mean squared error {rf_mse}")
print(f"Root mean squared error {np.sqrt(rf_mse)}")
# Optional extra metric: metrics.r2_score(y_test, rfpredict)