<a href="https://colab.research.google.com/github/siddharth9238/Machine_Learning_Projects/blob/main/Taxifare%20Prediction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from geopy import distance
import warnings
# Silence every warning for the rest of the session; note that this also hides
# deprecation and FutureWarning notices emitted by the libraries used below.
warnings.filterwarnings('ignore')

In [None]:
# Read the taxi-fare CSV from GitHub into a pandas DataFrame
TAXI_FARE_CSV_URL = 'https://raw.githubusercontent.com/Premalatha-success/Datasets/main/TaxiFare.csv'
df = pd.read_csv(TAXI_FARE_CSV_URL, encoding='utf-8')

In [None]:
# Preview the first rows of the freshly loaded DataFrame
df.head()

In [None]:
# Show a random sample of 10 rows (no seed is passed, so the rows differ between runs)
df.sample(10)

In [None]:
# (rows, columns) of the DataFrame as loaded
df.shape

In [None]:
# Summary statistics per column, to spot implausible values early
df.describe()

In [None]:
# Column names, dtypes and non-null counts
df.info()

In [None]:
# One-hot encode the passenger count so each distinct value gets its own indicator column
df = pd.get_dummies(data=df, columns=["no_of_passenger"])

In [None]:
# Inspect the column dtypes after the one-hot encoding step
df.dtypes

In [None]:
# Count the missing values in every column
missing_per_column = df.isna().sum()
print(missing_per_column)

In [None]:
# Keep only the rows that have no missing value in any column
df = df.dropna()

In [None]:
# Build (latitude, longitude) pairs for the pickup and dropoff points.
# geopy reads a coordinate pair as (latitude, longitude), so latitude has to come
# first; with the columns the other way round every distance is measured between
# the wrong two points.
# geopy also raises ValueError for a latitude outside [-90, 90], so rows whose
# coordinates are physically impossible are dropped before the arrays are built.
coordinates_ok = (
    (df['latitude_of_pickup'] >= -90) & (df['latitude_of_pickup'] <= 90)
    & (df['latitude_of_dropoff'] >= -90) & (df['latitude_of_dropoff'] <= 90)
    & (df['longitude_of_pickup'] >= -180) & (df['longitude_of_pickup'] <= 180)
    & (df['longitude_of_dropoff'] >= -180) & (df['longitude_of_dropoff'] <= 180)
)
# .copy() so that later column assignments act on an independent frame, not a slice
df = df.loc[coordinates_ok].copy()
pickup_coords = df[['latitude_of_pickup', 'longitude_of_pickup']].values
dropoff_coords = df[['latitude_of_dropoff', 'longitude_of_dropoff']].values

In [None]:
# Trip length in kilometres for every row, via geopy's distance function,
# computed pairwise over the pickup and dropoff coordinate arrays
distances = [
    distance.distance(start, end).km
    for start, end in zip(pickup_coords, dropoff_coords)
]
df['distance'] = distances

In [None]:
# Parse the pickup timestamp and derive hour / day-of-month / month features from it.
# Plain column assignment is used instead of `df.loc[:, col] = ...` on purpose:
# pandas >= 2.0 writes a `.loc[:, col]` assignment into the existing column, which
# keeps its original dtype, so the column would never become datetime64 and the
# `.dt` accessor would raise AttributeError.
pickup_time = pd.to_datetime(df['date_time_of_pickup'])
df['date_time_of_pickup'] = pickup_time
df['hour'] = pickup_time.dt.hour
df['day'] = pickup_time.dt.day
df['month'] = pickup_time.dt.month

In [None]:
# Coerce every column except the pickup timestamp to a (downcast) numeric dtype
def _numeric_unless_pickup_time(column):
    """Return `column` converted with pd.to_numeric; the pickup timestamp is passed through unchanged.

    Values that cannot be parsed as numbers become NaN (errors='coerce').
    """
    if column.name == 'date_time_of_pickup':
        return column
    return pd.to_numeric(column, errors='coerce', downcast='float')


df = df.apply(_numeric_unless_pickup_time)

In [None]:
# Confirm the dtypes after the numeric conversion
df.dtypes

In [None]:
# Keep only the rows whose `amount` is strictly positive
has_positive_amount = df['amount'] > 0
df = df.loc[has_positive_amount]

In [None]:
# Plot the distribution of the `amount` column (the regression target used below)
sns.displot(df['amount'])

In [None]:
# Keep only rows with amount > 0 (the same filter is already applied before the
# distribution plot, so this pass should not remove anything further)
df = df[df['amount'].gt(0)]

In [None]:
# Drop the identifier column, then list the remaining dtypes
df = df.drop(columns=['unique_id'])
df.dtypes

In [None]:
# Replace any remaining missing values with 0
df = df.fillna(value=0)

In [None]:
# Split the data into training and testing sets.
# The raw pickup timestamp is left out of the feature matrix: it is a datetime
# column that the regression cannot convert to float, and the hour / day / month
# derived from it are already present as separate columns.
X = df.drop(columns=['amount', 'date_time_of_pickup'])
y = df['amount']
# 70 % train / 30 % test; fixed random_state so the split is reproducible
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [None]:
# Rescale the data to zero mean and unit variance per column
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
# The pickup timestamp is a datetime column and cannot be cast to float, so it is
# excluded before scaling; every other column of `df` is scaled as before.
# NOTE(review): this still scales the target `amount` and is fitted on all rows;
# `df_scaled` is not consumed by the model cells visible in this notebook.
df_scaled = scaler.fit_transform(df.drop(columns=['date_time_of_pickup']))

In [None]:
# Train an ordinary least-squares linear regression on the training split
# (fitted on X_train as-is, i.e. on the unscaled feature values)
model = LinearRegression()
model.fit(X_train, y_train)

In [None]:
# Predict the fare amount for the held-out testing set
y_pred = model.predict(X_test)

In [None]:
# Root mean squared error of the predictions on the testing set
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
print("RMSE:", rmse)

In [None]:
# Scatter the predicted fares against the actual fares of the testing set
fig, ax = plt.subplots()
ax.scatter(y_test, y_pred)
ax.set_xlabel("Actual Fare")
ax.set_ylabel("Predicted Fare")
plt.show()

In [None]:
# Final column dtypes of the cleaned DataFrame
df.dtypes

In [None]:
# Box plot of `amount` to inspect its spread and outliers
df.boxplot(column="amount")
plt.show()

In [None]:
# Box plot of the pickup longitude to inspect its spread and outliers
df.boxplot(column="longitude_of_pickup")
plt.show()

In [None]:
# Box plot of the pickup latitude to inspect its spread and outliers
df.boxplot(column="latitude_of_pickup")
plt.show()

In [None]:
# Box plot of the dropoff longitude to inspect its spread and outliers
df.boxplot(column="longitude_of_dropoff")
plt.show()

In [None]:
# Box plot of the dropoff latitude to inspect its spread and outliers
df.boxplot(column="latitude_of_dropoff")
plt.show()