# Data Initialization

In [3]:
from google.colab import drive
import os

# Mount Google Drive so its files are reachable under /content/drive.
drive.mount('/content/drive')
# Point the Kaggle tooling at the credentials folder kept on Drive.
os.environ['KAGGLE_CONFIG_DIR'] = "/content/drive/MyDrive/kaggle"

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


You're setting up access to your Google Drive and letting your code use your Kaggle API key from a folder inside your Drive.

In [4]:
#!/bin/bash
# Shell command (leading "!"): download the dataset archive with the Kaggle CLI.
! kaggle datasets download elemento/nyc-yellow-taxi-trip-data

Dataset URL: https://www.kaggle.com/datasets/elemento/nyc-yellow-taxi-trip-data
License(s): U.S. Government Works
Downloading nyc-yellow-taxi-trip-data.zip to /content
 99% 1.77G/1.78G [00:10<00:00, 272MB/s]
100% 1.78G/1.78G [00:10<00:00, 189MB/s]


You’re downloading a dataset from Kaggle directly into Colab using the Kaggle API.

In [5]:
! unzip /content/nyc-yellow-taxi-trip-data.zip

Archive:  /content/nyc-yellow-taxi-trip-data.zip
  inflating: yellow_tripdata_2015-01.csv  
  inflating: yellow_tripdata_2016-01.csv  
  inflating: yellow_tripdata_2016-02.csv  
  inflating: yellow_tripdata_2016-03.csv  


Unzips the downloaded file so you can use the .csv dataset inside.

# Data Preparation

In [6]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.preprocessing import LabelEncoder

import warnings
warnings.filterwarnings('ignore')

Imports the libraries needed for:

Data handling (pandas, numpy)

Visualization (seaborn, matplotlib)

Preprocessing labels (LabelEncoder)

Hides warnings to make output cleaner.

In [7]:
# Load the January 2016 yellow-taxi trips extracted from the Kaggle archive.
data = pd.read_csv('/content/yellow_tripdata_2016-01.csv')
# read_csv already returns a DataFrame; this wraps it under the name `df` used from here on.
df = pd.DataFrame(data)
# Bare last expression: renders a truncated preview of the frame.
df

Unnamed: 0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,pickup_longitude,pickup_latitude,RatecodeID,store_and_fwd_flag,dropoff_longitude,dropoff_latitude,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount
0,2,2016-01-01 00:00:00,2016-01-01 00:00:00,2,1.10,-73.990372,40.734695,1,N,-73.981842,40.732407,2,7.5,0.5,0.5,0.00,0.00,0.3,8.80
1,2,2016-01-01 00:00:00,2016-01-01 00:00:00,5,4.90,-73.980782,40.729912,1,N,-73.944473,40.716679,1,18.0,0.5,0.5,0.00,0.00,0.3,19.30
2,2,2016-01-01 00:00:00,2016-01-01 00:00:00,1,10.54,-73.984550,40.679565,1,N,-73.950272,40.788925,1,33.0,0.5,0.5,0.00,0.00,0.3,34.30
3,2,2016-01-01 00:00:00,2016-01-01 00:00:00,1,4.75,-73.993469,40.718990,1,N,-73.962242,40.657333,2,16.5,0.0,0.5,0.00,0.00,0.3,17.30
4,2,2016-01-01 00:00:00,2016-01-01 00:00:00,3,1.76,-73.960625,40.781330,1,N,-73.977264,40.758514,2,8.0,0.0,0.5,0.00,0.00,0.3,8.80
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10906853,2,2016-01-31 23:30:32,2016-01-31 23:38:18,1,2.20,-74.003578,40.751011,1,N,-73.982651,40.767509,2,8.5,0.5,0.5,0.00,0.00,0.3,9.80
10906854,1,2016-01-05 00:15:55,2016-01-05 00:16:06,1,0.00,-73.945488,40.751530,1,N,-73.945457,40.751530,2,2.5,0.5,0.5,0.00,0.00,0.3,3.80
10906855,1,2016-01-05 06:12:46,2016-03-19 20:45:50,3,1.40,-73.994240,40.766586,1,N,-73.984428,40.753922,2,7.5,0.5,0.5,0.00,0.00,0.3,8.80
10906856,1,2016-01-05 06:21:44,2016-03-28 12:54:26,1,2.10,-73.948067,40.776531,1,N,-73.978188,40.777435,1,11.5,0.0,0.5,2.45,0.00,0.3,14.75


Loads the taxi trip data CSV file into a pandas DataFrame named df.

Displays the full DataFrame in output (shows a table of the data).

In [8]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
df.describe()

Unnamed: 0,VendorID,passenger_count,trip_distance,pickup_longitude,pickup_latitude,RatecodeID,dropoff_longitude,dropoff_latitude,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount
count,10906860.0,10906860.0,10906860.0,10906860.0,10906860.0,10906860.0,10906860.0,10906860.0,10906860.0,10906860.0,10906860.0,10906860.0,10906860.0,10906860.0,10906860.0,10906860.0
mean,1.535024,1.670847,4.648197,-72.81869,40.11494,1.03935,-72.88659,40.15315,1.347536,12.48693,0.3130757,0.4976705,1.750663,0.2933453,0.2997245,15.6414
std,0.4987718,1.324891,2981.095,9.168964,5.051022,0.5186309,8.900841,4.903456,0.4910804,35.564,0.4156792,0.05046685,2.623546,1.694572,0.01232553,36.4128
min,1.0,0.0,0.0,-121.9343,0.0,1.0,-121.9335,0.0,1.0,-957.6,-42.61,-0.5,-220.8,-17.4,-0.3,-958.4
25%,1.0,1.0,1.0,-73.99151,40.7363,1.0,-73.99107,40.73481,1.0,6.5,0.0,0.5,0.0,0.0,0.3,8.3
50%,2.0,1.0,1.67,-73.98138,40.75369,1.0,-73.97942,40.75413,1.0,9.0,0.0,0.5,1.26,0.0,0.3,11.62
75%,2.0,2.0,3.08,-73.9661,40.76808,1.0,-73.96196,40.76962,2.0,14.0,0.5,0.5,2.32,0.0,0.3,17.16
max,2.0,9.0,8000010.0,0.0,60.90876,99.0,0.0,60.90876,5.0,111270.9,648.87,89.7,998.14,980.15,0.3,111271.6


Gives summary statistics (like mean, min, max) for numeric columns — useful for spotting outliers.

In [9]:
# Column names, dtypes and memory footprint of the frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10906858 entries, 0 to 10906857
Data columns (total 19 columns):
 #   Column                 Dtype  
---  ------                 -----  
 0   VendorID               int64  
 1   tpep_pickup_datetime   object 
 2   tpep_dropoff_datetime  object 
 3   passenger_count        int64  
 4   trip_distance          float64
 5   pickup_longitude       float64
 6   pickup_latitude        float64
 7   RatecodeID             int64  
 8   store_and_fwd_flag     object 
 9   dropoff_longitude      float64
 10  dropoff_latitude       float64
 11  payment_type           int64  
 12  fare_amount            float64
 13  extra                  float64
 14  mta_tax                float64
 15  tip_amount             float64
 16  tolls_amount           float64
 17  improvement_surcharge  float64
 18  total_amount           float64
dtypes: float64(12), int64(4), object(3)
memory usage: 1.5+ GB


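Gives a summary of the dataset: the number of rows, the column names and data types, and memory usage. For a frame this large pandas omits the per-column non-null counts by default, so missing values are not visible in this output; use df.isna().sum() to count them.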

In [10]:
# Parse both timestamp columns from strings into datetime values.
for ts_col in ('tpep_pickup_datetime', 'tpep_dropoff_datetime'):
    df[ts_col] = pd.to_datetime(df[ts_col])

Converts the pickup and dropoff time columns from string to proper datetime format for later time-based operations.

In [11]:
# Trip duration in minutes: elapsed time between pickup and dropoff.
elapsed = df['tpep_dropoff_datetime'] - df['tpep_pickup_datetime']
df['trip_duration_minutes'] = elapsed.dt.total_seconds() / 60

Creates a new column, trip_duration_minutes, holding the trip duration in minutes, computed by subtracting the pickup time from the dropoff time.

In [12]:
# The raw timestamps are no longer needed once the duration has been derived.
df.drop(
    columns=['tpep_pickup_datetime', 'tpep_dropoff_datetime'],
    inplace=True,
)

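Drops the raw pickup and dropoff timestamp columns, which are no longer needed once trip_duration_minutes has been derived. Note that no rows are filtered here: trips with a zero, negative, or implausibly long duration are still in the data.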

In [13]:
# LabelEncoder assigns integer codes to the sorted distinct values of a column.
encoder = LabelEncoder()

# Replace the text flag with its integer code.
df['store_and_fwd_flag'] = encoder.fit_transform(df['store_and_fwd_flag'])

Converts the text flag Y/N to numbers 0 and 1 using LabelEncoder.

In [14]:
# Preview the frame after adding the duration column and encoding the flag.
df

Unnamed: 0,VendorID,passenger_count,trip_distance,pickup_longitude,pickup_latitude,RatecodeID,store_and_fwd_flag,dropoff_longitude,dropoff_latitude,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,trip_duration_minutes
0,2,2,1.10,-73.990372,40.734695,1,0,-73.981842,40.732407,2,7.5,0.5,0.5,0.00,0.00,0.3,8.80,0.000000
1,2,5,4.90,-73.980782,40.729912,1,0,-73.944473,40.716679,1,18.0,0.5,0.5,0.00,0.00,0.3,19.30,0.000000
2,2,1,10.54,-73.984550,40.679565,1,0,-73.950272,40.788925,1,33.0,0.5,0.5,0.00,0.00,0.3,34.30,0.000000
3,2,1,4.75,-73.993469,40.718990,1,0,-73.962242,40.657333,2,16.5,0.0,0.5,0.00,0.00,0.3,17.30,0.000000
4,2,3,1.76,-73.960625,40.781330,1,0,-73.977264,40.758514,2,8.0,0.0,0.5,0.00,0.00,0.3,8.80,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10906853,2,1,2.20,-74.003578,40.751011,1,0,-73.982651,40.767509,2,8.5,0.5,0.5,0.00,0.00,0.3,9.80,7.766667
10906854,1,1,0.00,-73.945488,40.751530,1,0,-73.945457,40.751530,2,2.5,0.5,0.5,0.00,0.00,0.3,3.80,0.183333
10906855,1,3,1.40,-73.994240,40.766586,1,0,-73.984428,40.753922,2,7.5,0.5,0.5,0.00,0.00,0.3,8.80,107433.066667
10906856,1,1,2.10,-73.948067,40.776531,1,0,-73.978188,40.777435,1,11.5,0.0,0.5,2.45,0.00,0.3,14.75,119912.700000


In [None]:
# Drop every row that has at least one missing value, modifying df in place.
df.dropna(inplace=True)
# Drop rows that exactly repeat an earlier row, keeping the first occurrence.
df.drop_duplicates(inplace=True)

🔍 What it does:

df.dropna(inplace=True) removes all rows in the dataframe df that contain any missing (NaN) values.

df.drop_duplicates(inplace=True) removes all duplicate rows in df so that each row is unique.

inplace=True means these changes happen directly to df without creating a new copy.

📌 Why this is critical:

Cleaning data by removing missing and duplicate entries helps ensure the model trains on accurate and consistent data, preventing bias or errors caused by incomplete or repeated data.

In [None]:
# Draw a reproducible random sample of 500,000 rows to keep the later steps fast.
df_subset = df.sample(n = 500000, random_state=42)

🔍 What it does:

Randomly selects 500,000 rows from the dataframe df and stores them in a new dataframe df_subset.

random_state=42 ensures the same 500,000 rows are chosen every time this runs (reproducibility).

📌 Why this is critical:

Sampling a subset makes it feasible to work with large datasets by reducing size for faster training and testing, while still maintaining a representative sample of the data.

# Visualization

In [None]:
# A pair plot draws one panel per pair of numeric columns, so plotting all
# 500,000 sampled rows is impractically slow. A smaller random draw shows the
# same structure.
pairplot_sample = df_subset.sample(n=min(5000, len(df_subset)), random_state=42)

# The pair plot's hue creates one level (colour + legend entry) per distinct
# value, which is unusable for a continuous amount. Colour by quartile band of
# total_amount instead; duplicates='drop' tolerates repeated quantile edges.
pairplot_sample = pairplot_sample.assign(
    total_amount_band=pd.qcut(pairplot_sample['total_amount'], q=4, duplicates='drop')
)

sns.pairplot(pairplot_sample, hue='total_amount_band')
plt.savefig('subset_pairplot.png')
plt.show()

🔍 What it does:

Creates a pairplot (scatterplot matrix) of the dataframe subset df_subset. Each pair of numeric variables is plotted against each other.

The points are colored by the value of the total_amount column (hue='total_amount'), helping to see how this variable relates to others visually.

Saves the plot image as 'subset_pairplot.png'.

Displays the plot with plt.show().

📌 Why this is critical:

Pairplots help visualize relationships and correlations between multiple numeric features at once. Coloring by a target or important feature like total_amount helps spot trends or clusters relevant to modeling.

In [None]:
# Names of all numeric columns (also reused by later cells).
numeric_columns = df.select_dtypes(include=['number']).columns

# Grid geometry: three panels per row, enough rows to hold every column.
n_cols = 3
n_rows = -(-len(numeric_columns) // n_cols)  # ceiling division

fig, axes = plt.subplots(n_rows, n_cols, figsize=(15, 5 * n_rows))
axes = axes.flatten()

# One histogram with a KDE overlay per numeric column, drawn from the sample.
for panel, column in zip(axes, numeric_columns):
    sns.histplot(df_subset[column], kde=True, ax=panel)
    panel.set_title(f'Distribution of {column}')

# Hide whatever panels are left over at the end of the grid.
for spare_panel in axes[len(numeric_columns):]:
    spare_panel.set_visible(False)


plt.tight_layout()
plt.savefig('numeric_columns_distribution.png')
plt.show()

🔍 What it does:

Finds all numeric columns in the dataframe df.

Sets up a grid of subplots with 3 columns and enough rows to fit all numeric columns.

For each numeric column, plots a histogram with a Kernel Density Estimate (KDE) overlay on its respective subplot, using data from df_subset.

Hides any extra subplot axes if there are fewer numeric columns than grid spots.

Adjusts subplot layout, saves the figure as 'numeric_columns_distribution.png', and displays it.

📌 Why this is critical:

Visualizing the distribution of numeric features helps understand their ranges, skewness, and presence of outliers, guiding feature engineering and model selection.

In [None]:
# Grid geometry: two panels per row, enough rows for every numeric column.
n_cols = 2
n_rows = -(-len(numeric_columns) // n_cols)  # ceiling division
fig, axes = plt.subplots(n_rows, n_cols, figsize=(15, 5 * n_rows))
axes = axes.flatten()

# One box plot per numeric column, drawn from the full frame `df`.
for panel, column in zip(axes, numeric_columns):
    sns.boxplot(x=df[column], ax=panel)
    panel.set_title(f'Box Plot of {column}')

# Hide whatever panels are left over at the end of the grid.
for spare_panel in axes[len(numeric_columns):]:
    spare_panel.set_visible(False)

plt.tight_layout()
plt.savefig('numeric_columns_boxplots.png')
plt.show()

🔍 What it does:

Sets up a grid of subplots with 2 columns and enough rows for all numeric columns.

For each numeric column, creates a box plot in its subplot using the full dataframe df (note: not df_subset).

Hides extra subplot axes if there are fewer columns than grid spaces.

Adjusts layout, saves the figure as 'numeric_columns_boxplots.png', and displays it.

📌 Why this is critical:

Box plots highlight median, spread, and potential outliers of numeric features, crucial for detecting anomalies and understanding data variability before modeling.

In [None]:
# Pairwise correlation (pandas' default method, Pearson) between the columns of df.
corr = df.corr()

# Display the correlation matrix.
corr

🔍 What it does:

Calculates the correlation matrix for all numeric columns in the dataframe df.

The matrix corr contains correlation coefficients (values between -1 and 1) that quantify the linear relationship between each pair of features.

📌 Why this is critical:

Understanding feature correlations helps identify which variables are strongly related or redundant, guiding feature selection and preventing multicollinearity issues in models.

In [None]:
# Boolean matrix shaped like `corr`: True on and above the diagonal, False below.
mask = np.triu(np.ones(corr.shape, dtype=bool))

🔍 What it does:

Creates a boolean mask for the upper triangle of the correlation matrix corr.

np.ones_like(corr, dtype=bool) creates a matrix of True values the same shape as corr.

np.triu(...) keeps only the upper triangle part True and sets the rest to False.

📌 Why this is critical:

When visualizing a symmetric matrix like correlations, masking the upper triangle avoids redundant information and makes the heatmap cleaner and easier to read.

In [None]:
f, ax = plt.subplots(figsize=(11, 9))
cmap = sns.diverging_palette(230, 20, as_cmap=True)

# Cells where `mask` is True are hidden; the colour scale is centred on zero
# and capped at 0.3.
heatmap_options = dict(
    mask=mask,
    cmap=cmap,
    vmax=.3,
    center=0,
    square=True,
    linewidths=.5,
    cbar_kws={"shrink": .5},
)
sns.heatmap(corr, ax=ax, **heatmap_options)

plt.show()

🔍 What it does:

Sets up a figure and axes for plotting a heatmap of the correlation matrix.

Uses a diverging color palette from seaborn for clear visualization of positive vs negative correlations.

Applies the mask to hide the upper triangle of the matrix.

Limits the max value shown on the color scale to 0.3 for contrast.

Adds grid lines (linewidths=.5), centers the color scale at zero, and formats the color bar to be smaller (shrink=.5).

Displays the heatmap.

📌 Why this is critical:

Heatmaps offer an intuitive, visual way to inspect complex correlation structures quickly, helping identify feature relationships relevant for modeling decisions.

# Data Preprocessing

In [None]:
from sklearn.preprocessing import MinMaxScaler, StandardScaler

🔍 What it does:

Imports two feature scaling classes from scikit-learn’s preprocessing module:

MinMaxScaler: Scales features to a specified range, usually 0 to 1, by subtracting the minimum and dividing by the feature range.

StandardScaler: Standardizes features by removing the mean and scaling to unit variance (z-score normalization).

📌 Why this is critical:

Feature scaling ensures numeric variables are on comparable scales, which improves model convergence and performance, especially for algorithms sensitive to feature magnitude (like gradient descent, SVM, or KNN).

In [None]:
# Columns to normalize with Min-Max Scaling.
# Every name must exist in the DataFrame, otherwise selecting these columns for
# scaling raises a KeyError. "congestion_surcharge" is not a column of the 2016
# trip file loaded above, so it is not listed here.
min_max_columns = [
    "passenger_count", "trip_distance", "extra", "mta_tax",
    "tolls_amount", "improvement_surcharge"
]

🔍 What it does:

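Defines min_max_columns, a list of column names that will later be rescaled with Min-Max scaling.

The list only names columns; nothing in the DataFrame is selected, removed, or modified by this cell.

Every name in the list must exist in the DataFrame, otherwise the scaling step will raise a KeyError.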

📌 Why this matters:
Grouping columns by scaling method makes the preprocessing explicit. The columns listed here are:

Numeric

Intended to be rescaled to the [0, 1] range

Handled separately from the columns that are standardized with z-scores

This step prepares feature scaling; it is not feature selection.

In [None]:
# Columns to normalize with Standardization (z-score).
# NOTE(review): total_amount is also the regression target selected later as `y`,
# so the target ends up standardized too — confirm this is intended.
z_score_columns = [
    "fare_amount", "tip_amount", "total_amount", "trip_duration_minutes"
]

🔍 What it does:

Defines a list of column names (z_score_columns) from the dataframe that will be normalized using Z-score standardization. These columns relate to monetary amounts and trip duration.

📌 Why this is critical:

Selecting specific columns ensures you only scale relevant continuous numerical features, avoiding unintended changes to categorical or already scaled data.

In [None]:
# Initialize scalers
min_max_scaler = MinMaxScaler()  # rescales each column to the default [0, 1] range
z_score_scaler = StandardScaler()  # centers each column on 0 and scales to unit variance

🔍 What it does:

Creates instances of two scaler objects:

min_max_scaler for scaling features to a 0-1 range.

z_score_scaler for standardizing features to have mean 0 and standard deviation 1.

📌 Why this is critical:

Initializing scalers prepares them for fitting and transforming your data, enabling consistent scaling procedures.

In [None]:
# Normalize using Min-Max Scaling
# fit_transform learns each column's min/max from df_subset and rescales in one step.
# NOTE(review): the scaler is fitted on the whole sample before the train/test
# split, so held-out rows influence the scaling statistics — consider fitting on
# the training split only.
df_subset[min_max_columns] = min_max_scaler.fit_transform(df_subset[min_max_columns])

🔍 What it does:

Applies Min-Max scaling to the columns listed in min_max_columns of df_subset.

.fit_transform() fits the scaler to the data (calculates min and max) and transforms the data accordingly.

The scaled values replace the original values in df_subset.

📌 Why this is critical:

Min-Max scaling compresses feature values into the [0,1] range, which is important for models sensitive to scale and for preserving the relative distribution shape.



In [None]:
# Normalize using Z-Score Standardization
# fit_transform learns each column's mean/std from df_subset and standardizes in one step.
# NOTE(review): fitted on the whole sample before the train/test split, so
# held-out rows influence the statistics — consider fitting on the training split only.
df_subset[z_score_columns] = z_score_scaler.fit_transform(df_subset[z_score_columns])

🔍 What it does:

Applies Z-score standardization to the specified columns in df_subset.

Each feature is transformed to have a mean of 0 and a standard deviation of 1.

The transformed values overwrite the original columns.

📌 Why this is critical:

Standardization centers features and scales them to unit variance, which helps many ML algorithms converge faster and perform better, especially those assuming normally distributed data.

In [None]:
# Preview the sample after scaling.
df_subset

🔍 What it does:

Displays the contents of the df_subset dataframe in a notebook or interactive environment like Google Colab or Jupyter.

Because the frame is large, pandas shows a truncated preview — the first and last few rows — rather than every row (calling df_subset.head() would show only the first five rows).

📌 Why this is critical:

Quickly checking df_subset allows you to visually confirm that preprocessing steps (like cleaning, sampling, and scaling) were applied correctly. It's a simple yet powerful way to debug and explore your data.

## Train Test Split

In [None]:
from sklearn.model_selection import train_test_split

🔍 What it does:

Imports the function used to split your dataset into training and testing sets.

📌 Why it matters:
You must test your model on new data it hasn’t seen — this avoids overfitting and gives a better measure of how well your model generalizes.

In [None]:
# Select the two model inputs by name rather than by position. These are the
# columns that positions 8 and 9 (the former `iloc[:, 8:10]`) resolve to in the
# frame's column order; naming them keeps the selection stable if columns are
# added, dropped or reordered upstream.
# NOTE(review): dropoff latitude and payment type look like weak predictors of
# total_amount — confirm this is the intended feature set.
feature_columns = ['dropoff_latitude', 'payment_type']
x = df_subset[feature_columns]
x

🔍 What it does:

Selects all rows (:) and only the columns at index positions 8 and 9 (since Python slicing is exclusive of the end index) from df_subset.

Stores this slice of the dataframe in a new variable x.

📌 Why this is critical:

Narrowing down specific feature columns prepares the data for modeling. This step isolates the input features you want to use. With the column order shown earlier, positions 8 and 9 are dropoff_latitude and payment_type.

In [None]:
# Regression target.
y = df_subset['total_amount']

# Hold out 20% of the rows for testing and train on the remaining 80%.
# (`train_size=0.2` would do the opposite: train on 20% and test on 80%.)
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

🔍 What it does:

Splits the dataset into:

80% training data (x_train, y_train)

20% testing data (x_test, y_test)

random_state=42 makes the split reproducible (same split every time you run it).

📌 Why this is critical:
You train your model on the training data, and test it on unseen test data to measure its real-world performance

In [None]:
# Inspect the held-out feature rows.
x_test

🔍 What it does:

Displays the contents of the variable x_test (if it has already been defined) in an interactive environment like Google Colab or Jupyter Notebook.

📌 Why this is critical:

Viewing x_test lets you inspect your test features to ensure that the right data was split or transformed correctly. This step is essential for validating your model input before making predictions.

# Linear Regression

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error, root_mean_squared_error, r2_score

🔍 What it does:

Imports the LinearRegression model from sklearn.linear_model.

Imports common regression evaluation metrics:

mean_squared_error

mean_absolute_error

root_mean_squared_error

r2_score

📌 Why this is critical:

These tools are essential for training a linear regression model and evaluating its performance using standard error and accuracy metrics.

In [None]:
# Linear regression estimator with scikit-learn's default settings.
lr = LinearRegression()

🔍 What it does:

Creates an instance of the LinearRegression model and stores it in the variable lr.

📌 Why this is critical:

Initializing the model object is the first step before training (fitting) it to your data.

In [None]:
# Fit the model on the training split.
lr.fit(x_train, y_train)

# Predict on both splits so train and test errors can be compared.
y_pred_test_lr = lr.predict(x_test)
y_pred_train_lr = lr.predict(x_train)

🔍 What it does:

lr.fit(x_train, y_train) trains the linear regression model on the training data.

y_pred_test_lr stores predictions made on the test data.

y_pred_train_lr stores predictions made on the training data.

📌 Why this is critical:

Model fitting learns the best-fitting line for your data. Generating predictions for both training and test sets allows you to evaluate performance and check for overfitting or underfitting.

In [None]:
# scikit-learn metrics take (y_true, y_pred) in that order. The error metrics are
# symmetric, but R² is not, so the ground truth must be passed first.
print(f"The Mean Squared Error of Test data with Linear regression is {mean_squared_error(y_test, y_pred_test_lr)}")
print(f"The Mean Absolute Error of Test data with Linear regression is {mean_absolute_error(y_test, y_pred_test_lr)}")
print(f"The Root Mean Squared Error of Test data with Linear regression is {root_mean_squared_error(y_test, y_pred_test_lr)}")
print(f"\nThe R2 Score Test data with Linear regression is {r2_score(y_test, y_pred_test_lr)}")

In [None]:
# Keep the linear-regression metrics for later comparison.
# scikit-learn metrics take (y_true, y_pred); the order matters for R².
Mean_Squared_Error_lr_test = mean_squared_error(y_test, y_pred_test_lr)
Mean_Squared_Error_lr_train = mean_squared_error(y_train, y_pred_train_lr)
Mean_Absolute_Error_lr_test = mean_absolute_error(y_test, y_pred_test_lr)
Mean_Absolute_Error_lr_train = mean_absolute_error(y_train, y_pred_train_lr)
Root_Mean_Squared_Error_lr_test = root_mean_squared_error(y_test, y_pred_test_lr)
Root_Mean_Squared_Error_lr_train = root_mean_squared_error(y_train, y_pred_train_lr)
R2_Score_Test_data = r2_score(y_test, y_pred_test_lr)
R2_Score_train_data = r2_score(y_train, y_pred_train_lr)

🔍 What it does:

Prints the following evaluation metrics on the test data:

MSE (Mean Squared Error)

MAE (Mean Absolute Error)

RMSE (Root Mean Squared Error)

R² Score (explained variance)

📌 Why this is critical:

These metrics help quantify how well the model performs on unseen data. Lower errors and higher R² scores indicate better predictive power.

In [None]:
# scikit-learn metrics take (y_true, y_pred) in that order. The error metrics are
# symmetric, but R² is not, so the ground truth must be passed first.
print(f"The Mean Squared Error of Train data with Linear regression is {mean_squared_error(y_train, y_pred_train_lr)}")
print(f"The Mean Absolute Error of Train data with Linear regression is {mean_absolute_error(y_train, y_pred_train_lr)}")
print(f"The Root Mean Squared Error of Train data with Linear regression is {root_mean_squared_error(y_train, y_pred_train_lr)}")
print(f"\nThe R2 Score Train data with Linear regression is {r2_score(y_train, y_pred_train_lr)}")

🔍 What it does:

Prints the same set of evaluation metrics, but for the training data instead of test data.

📌 Why this is critical:

Comparing training and test performance reveals if the model is overfitting (too good on train, bad on test) or underfitting (bad on both). A balanced performance suggests good generalization.

In [None]:
from mpl_toolkits.mplot3d import Axes3D

fig = plt.figure(figsize=(10, 7))
ax = fig.add_subplot(111, projection='3d')

# The two input features give the horizontal axes; the target is the vertical axis.
first_feature = x_test.iloc[:, 0]
second_feature = x_test.iloc[:, 1]

# Draw the true targets first, then the linear-regression predictions for the same points.
for target_values, point_color, series_label in (
    (y_test, 'blue', 'Actual'),
    (y_pred_test_lr, 'red', 'Predicted'),
):
    ax.scatter(first_feature, second_feature, target_values, color=point_color, label=series_label)

ax.set_xlabel('Feature 1')
ax.set_ylabel('Feature 2')
ax.set_zlabel('Total Amount')
ax.legend()
plt.show()

🔍 What it does:

Creates a 3D scatter plot showing the relationship between the two features used in x_test and the actual (y_test) vs predicted values (y_pred_test_lr).

Blue points show actual values; red points show model predictions.

📌 Why this is critical:

3D visualization helps you intuitively see how closely the model’s predictions align with the actual outcomes, especially when working with two input features.

In [None]:
import pandas as pd

# Pair each feature name with its fitted coefficient, largest coefficient first.
coefs = (
    pd.DataFrame({'Feature': x.columns, 'Coefficient': lr.coef_})
    .sort_values(by='Coefficient', ascending=False)
)

# Horizontal bars: one per feature, bar length = signed coefficient.
plt.figure(figsize=(8, 5))
plt.barh(y=coefs['Feature'], width=coefs['Coefficient'], color='green')
plt.xlabel('Coefficient Value')
plt.ylabel('Feature')
plt.title('Feature Importance in Linear Regression')
plt.show()


🔍 What it does:

Extracts the learned coefficients from the linear regression model and pairs them with the feature names.

Sorts them by signed coefficient value (largest positive first) and plots a horizontal bar chart showing each feature’s contribution (positive or negative) to the prediction.

📌 Why this is critical:

Coefficients in linear regression indicate how strongly each feature influences the target. Visualizing them reveals which features are most impactful and in which direction (positive or negative influence).

# Ridge Regression

In [None]:
from sklearn.linear_model import Ridge

🔍 What it does:

Imports the Ridge regression model from sklearn.linear_model.

Ridge regression is a type of regularized linear regression that adds a penalty term to reduce overfitting.

📌 Why this is critical:

Ridge regression improves model generalization by penalizing large coefficients, making it especially useful when features are correlated or when the model overfits the training data.

In [None]:
# Ridge regression estimator with scikit-learn's default parameters.
ridge_reg = Ridge()

🔍 What it does:

Creates an instance of the Ridge regression model using default parameters.

The model is stored in the variable ridge_reg.

📌 Why this is critical:

Initializing the model is the first step before training it. This version of Ridge will use L2 regularization with a default alpha (regularization strength).

In [None]:
# Fit the Ridge model on the training split.
ridge_reg.fit(x_train, y_train)

# Predict on both splits so train and test errors can be compared.
y_pred_train_ridge = ridge_reg.predict(x_train)
y_pred_test_ridge = ridge_reg.predict(x_test)

🔍 What it does:

Fits the Ridge regression model on the training data.

Generates predictions on both the training set (y_pred_train_ridge) and the test set (y_pred_test_ridge).

📌 Why this is critical:

Fitting allows the model to learn patterns from training data.

Predicting on both train and test sets helps evaluate how well the model has learned and whether it's overfitting or generalizing.

In [None]:
# scikit-learn metrics take (y_true, y_pred) in that order. The error metrics are
# symmetric, but R² is not, so the ground truth must be passed first.
print(f"The Mean Squared Error of Test data with Ridge regression is {mean_squared_error(y_test, y_pred_test_ridge)}")
print(f"The Mean Absolute Error of Test data with Ridge regression is {mean_absolute_error(y_test, y_pred_test_ridge)}")
print(f"The Root Mean Squared Error of Test data with Ridge regression is {root_mean_squared_error(y_test, y_pred_test_ridge)}")
print(f"\nThe R2 Score Test data with Ridge regression is {r2_score(y_test, y_pred_test_ridge)}")

In [None]:
# Keep the Ridge test-set metrics for later comparison.
# scikit-learn metrics take (y_true, y_pred); the order matters for R².
Mean_Squared_Error_ridge_test = mean_squared_error(y_test, y_pred_test_ridge)
Mean_Absolute_Error_ridge_test = mean_absolute_error(y_test, y_pred_test_ridge)
Root_Mean_Squared_Error_ridge_test = root_mean_squared_error(y_test, y_pred_test_ridge)
R2_Score_Test_data_ridge = r2_score(y_test, y_pred_test_ridge)

🔍 What it does:

Calculates and prints performance metrics for Ridge regression on the test data:

MSE (Mean Squared Error)

MAE (Mean Absolute Error)

RMSE (Root Mean Squared Error)

R² Score

📌 Why this is critical:

These metrics show how well the Ridge model performs on unseen data, measuring both average error and the percentage of variance explained.

In [None]:
# scikit-learn metrics take (y_true, y_pred) in that order. The error metrics are
# symmetric, but R² is not, so the ground truth must be passed first.
print(f"The Mean Squared Error of Train data with Ridge regression is {mean_squared_error(y_train, y_pred_train_ridge)}")
print(f"The Mean Absolute Error of Train data with Ridge regression is {mean_absolute_error(y_train, y_pred_train_ridge)}")
print(f"The Root Mean Squared Error of Train data with Ridge regression is {root_mean_squared_error(y_train, y_pred_train_ridge)}")
print(f"\nThe R2 Score Train data with Ridge regression is {r2_score(y_train, y_pred_train_ridge)}")

In [None]:
# Keep the Ridge training-set metrics for later comparison.
# scikit-learn metrics take (y_true, y_pred); the order matters for R².
Mean_Squared_Error_ridge_train = mean_squared_error(y_train, y_pred_train_ridge)
Mean_Absolute_Error_ridge_train = mean_absolute_error(y_train, y_pred_train_ridge)
Root_Mean_Squared_Error_ridge_train = root_mean_squared_error(y_train, y_pred_train_ridge)
R2_Score_train_data_ridge = r2_score(y_train, y_pred_train_ridge)

🔍 What it does:

Prints the same evaluation metrics, but for the training data.

📌 Why this is critical:

Comparing training vs. testing scores helps detect overfitting or underfitting. A smaller gap between them usually indicates better generalization, especially with regularization like Ridge.

In [None]:
from mpl_toolkits.mplot3d import Axes3D

fig = plt.figure(figsize=(10, 7))
ax = fig.add_subplot(111, projection='3d')

# The two input features give the horizontal axes; the target is the vertical axis.
first_feature = x_test.iloc[:, 0]
second_feature = x_test.iloc[:, 1]

# Draw the true targets first, then the Ridge predictions for the same points.
for target_values, point_color, series_label in (
    (y_test, 'blue', 'Actual'),
    (y_pred_test_ridge, 'red', 'Predicted'),
):
    ax.scatter(first_feature, second_feature, target_values, color=point_color, label=series_label)

ax.set_xlabel('Feature 1')
ax.set_ylabel('Feature 2')
ax.set_zlabel('Total Amount')
ax.legend()
plt.show()

🔍 What it does:

Creates a 3D scatter plot comparing the actual vs. predicted values for the Ridge model using the two features from x_test.

Blue points are real values, red points are predictions.

📌 Why this is critical:

Visual comparison in 3D makes it easy to see how closely predictions align with actual data, especially helpful when using two input features.

In [None]:
# Pair each feature name with its fitted Ridge coefficient, largest coefficient first.
coefs = pd.DataFrame({'Feature': x.columns, 'Coefficient': ridge_reg.coef_})
coefs = coefs.sort_values(by='Coefficient', ascending=False)

# Horizontal bars: one per feature, bar length = signed coefficient.
plt.figure(figsize=(8, 5))
plt.barh(coefs['Feature'], coefs['Coefficient'], color='green')
plt.xlabel('Coefficient Value')
plt.ylabel('Feature')
# The coefficients plotted here come from the Ridge model, so the title says so.
plt.title('Feature Importance in Ridge Regression')
plt.show()


🔍 What it does:

Extracts and visualizes the feature coefficients from the Ridge regression model.

Displays them as a horizontal bar chart sorted by signed coefficient value, largest first (not by absolute value).

📌 Why this is critical:

Ridge regression shrinks the coefficients to reduce model complexity. Visualizing them helps understand which features are still influential after regularization.

# Lasso Regression

In [None]:
from sklearn.linear_model import Lasso

In [None]:
# Lasso regression estimator with regularization strength alpha=0.1.
lasso_reg = Lasso(alpha=0.1)

In [None]:
# Fit the Lasso model on the training split.
lasso_reg.fit(x_train, y_train)

# Predict on both splits so train and test errors can be compared.
y_pred_train_lasso = lasso_reg.predict(x_train)
y_pred_test_lasso = lasso_reg.predict(x_test)

In [None]:
# scikit-learn metrics take (y_true, y_pred) in that order. The error metrics are
# symmetric, but R² is not, so the ground truth must be passed first.
print(f"The Mean Squared Error of Test data with Lasso regression is {mean_squared_error(y_test, y_pred_test_lasso)}")
print(f"The Mean Absolute Error of Test data with Lasso regression is {mean_absolute_error(y_test, y_pred_test_lasso)}")
print(f"The Root Mean Squared Error of Test data with Lasso regression is {root_mean_squared_error(y_test, y_pred_test_lasso)}")
print(f"\nThe R2 Score Test data with Lasso regression is {r2_score(y_test, y_pred_test_lasso)}")

In [None]:
# Keep the Lasso test-set metrics for later comparison.
# scikit-learn metrics take (y_true, y_pred); the order matters for R².
Mean_Squared_Error_lasso_test = mean_squared_error(y_test, y_pred_test_lasso)
Mean_Absolute_Error_lasso_test = mean_absolute_error(y_test, y_pred_test_lasso)
Root_Mean_Squared_Error_lasso_test = root_mean_squared_error(y_test, y_pred_test_lasso)
R2_Score_Test_data_lasso = r2_score(y_test, y_pred_test_lasso)

In [None]:
# These are the Lasso model's training-set predictions, so the messages name
# Lasso (they previously said "Ridge").
# scikit-learn metrics take (y_true, y_pred); the order matters for R².
print(f"The Mean Squared Error of Train data with Lasso regression is {mean_squared_error(y_train, y_pred_train_lasso)}")
print(f"The Mean Absolute Error of Train data with Lasso regression is {mean_absolute_error(y_train, y_pred_train_lasso)}")
print(f"The Root Mean Squared Error of Train data with Lasso regression is {root_mean_squared_error(y_train, y_pred_train_lasso)}")
print(f"\nThe R2 Score Train data with Lasso regression is {r2_score(y_train, y_pred_train_lasso)}")

In [None]:
# Keep the Lasso training-set metrics for later comparison.
# scikit-learn metrics take (y_true, y_pred); the order matters for R².
Mean_Squared_Error_lasso_train = mean_squared_error(y_train, y_pred_train_lasso)
Mean_Absolute_Error_lasso_train = mean_absolute_error(y_train, y_pred_train_lasso)
Root_Mean_Squared_Error_lasso_train = root_mean_squared_error(y_train, y_pred_train_lasso)
R2_Score_train_data_lasso = r2_score(y_train, y_pred_train_lasso)

In [None]:
# Count how many coefficients Lasso kept versus set exactly to zero.
lasso_kept = lasso_reg.coef_ != 0
print(f"Remaining features: {lasso_kept.sum()}")
print(f"Eliminated features: {(~lasso_kept).sum()}")

🔍 What it does:

Counts and prints how many feature coefficients were retained (non-zero) and how many were set to zero (eliminated) by the Lasso model.

📌 Why this is critical:

Lasso acts as both a model and a feature selector. Knowing which features were removed helps interpret the model and reduce dimensionality for future use.

In [None]:
from mpl_toolkits.mplot3d import Axes3D

fig = plt.figure(figsize=(10, 7))
ax = fig.add_subplot(111, projection='3d')

# The first two columns of x_test span the horizontal plane; the vertical
# axis carries either the true target or the Lasso prediction.
first_feature = x_test.iloc[:, 0]
second_feature = x_test.iloc[:, 1]

# Actual values are drawn first, predictions second (same order as before).
for target_values, colour, series_label in (
    (y_test, 'blue', 'Actual'),
    (y_pred_test_lasso, 'red', 'Predicted'),
):
    ax.scatter(first_feature, second_feature, target_values, color=colour, label=series_label)

ax.set_xlabel('Feature 1')
ax.set_ylabel('Feature 2')
ax.set_zlabel('Total Amount')
ax.legend()
plt.show()

🔍 What it does:

Visualizes the actual vs. predicted values for the Lasso model in 3D using the two features from x_test.

Blue = actual, Red = predicted.

📌 Why this is critical:

This visual comparison helps you see the accuracy of the Lasso model and how predictions follow the true target distribution.

In [None]:
# Pair every feature name with its fitted Lasso coefficient and sort by
# coefficient value (largest first).
coefs = pd.DataFrame({'Feature': x.columns, 'Coefficient': lasso_reg.coef_})
coefs = coefs.sort_values(by='Coefficient', ascending=False)

plt.figure(figsize=(8, 5))
plt.barh(coefs['Feature'], coefs['Coefficient'], color='green')
plt.xlabel('Coefficient Value')
plt.ylabel('Feature')
# The coefficients come from lasso_reg, so the title names the Lasso model
# (it previously said "Linear Regression").
plt.title('Feature Importance in Lasso Regression')
plt.show()


🔍 What it does:

Creates a bar chart of the Lasso coefficients, showing each feature’s weight after L1 regularization.

Sorted by importance.

📌 Why this is critical:

This plot shows how Lasso assigns importance — or zero importance — to features. It's a clear visual representation of which features matter most after regularization.

# Polynomial Regression

In [None]:
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression

🔍 What it does: PolynomialFeatures is a tool to expand your features into higher-degree polynomial terms (e.g., $x$ becomes $x, x^2, x^3, \dots$).

LinearRegression is the model that fits a linear relationship to the data.

📌 Why this is critical:
Polynomial features allow a linear regression model to fit non-linear relationships. Importing these tools is the first step in building a more flexible model that captures curves in the data.

In [None]:
# Expand the features into polynomial terms using the default constructor
# arguments of PolynomialFeatures.
poly = PolynomialFeatures()

# Fit the transformer on the training split only, then reuse that fitted
# transformer for the test split so that nothing is ever learned from test data.
x_train_poly = poly.fit_transform(x_train)
x_test_poly = poly.transform(x_test)

🔍 What it does: Initializes a PolynomialFeatures object with default degree 2.

fit_transform(x_train) learns the polynomial structure and transforms x_train into new features: $x, x^2, x_1 x_2, \dots$.

Applies the same transformation to x_test.

📌 Why this is critical:
Transforms your simple input features into a richer set that includes interaction terms and powers. This helps a linear model handle non-linear relationships more effectively. Without this step, the model would only capture straight-line trends.

In [None]:
# Display (rows, columns) of the expanded training matrix to check how many
# polynomial features were generated.
x_train_poly.shape

🔍 What it does:
Prints the dimensions of the transformed x_train_poly matrix, including how many new features were generated.

📌 Why this is critical:
Understanding the shape confirms how many features were added. It also helps debug if your data transformation went as expected (e.g., from 3 features to 10 after expanding to 2nd degree).

In [None]:
# Linear model that will be fitted on the polynomial-expanded features.
poly_reg = LinearRegression()

In [None]:
# Fit the linear model on the polynomial-expanded training features.
poly_reg.fit(x_train_poly, y_train)

# Predict on the expanded test and train matrices for the evaluation cells below.
y_pred_test_poly = poly_reg.predict(x_test_poly)
y_pred_train_poly = poly_reg.predict(x_train_poly)

🔍 What it does:

Creates a linear regression model.

Fits the model to the polynomial-transformed training data.

Predicts on both transformed test and train sets.

📌 Why this is critical:
The model now learns from the polynomial-expanded features, allowing it to fit curves and complex shapes. Predicting on both train and test sets helps assess performance and overfitting.

In [None]:
# Test-set metrics for the polynomial model.
# scikit-learn metrics expect (y_true, y_pred); the order matters for R².
print(f"The Mean Squared Error of Test data with Polynomial Linear regression is {mean_squared_error(y_test, y_pred_test_poly)}")
print(f"The Mean Absolute Error of Test data with Polynomial Linear regression is {mean_absolute_error(y_test, y_pred_test_poly)}")
print(f"The Root Mean Squared Error of Test data with Polynomial Linear regression is {root_mean_squared_error(y_test, y_pred_test_poly)}")
print(f"\nThe R2 Score Test data with Linear Polynomial regression is {r2_score(y_test, y_pred_test_poly)}")

In [None]:
# Store the polynomial model's test-set metrics for the final comparison table.
# scikit-learn metrics expect (y_true, y_pred); the order matters for R².
Mean_Squared_Error_poly_test = mean_squared_error(y_test, y_pred_test_poly)
Mean_Absolute_Error_poly_test = mean_absolute_error(y_test, y_pred_test_poly)
Root_Mean_Squared_Error_poly_test = root_mean_squared_error(y_test, y_pred_test_poly)
R2_Score_Test_data_poly = r2_score(y_test, y_pred_test_poly)

🔍 What it does:
Prints four evaluation metrics for test data using polynomial regression:

MSE: Average squared prediction error.

MAE: Average absolute error.

RMSE: Square root of MSE, more interpretable.

R² Score: How well the model explains variance in the target.

📌 Why this is critical:
These metrics give you a complete picture of your model’s performance on unseen data. If these scores are high (especially R²) and errors low, your polynomial model generalizes well.

In [None]:
# Train-set metrics for the polynomial model. This cell previously evaluated
# y_pred_train_lr (the plain linear model) under "Linear regression" labels,
# which did not match the polynomial metrics stored in the next cell.
# scikit-learn metrics expect (y_true, y_pred); the order matters for R².
print(f"The Mean Squared Error of Train data with Polynomial Linear regression is {mean_squared_error(y_train, y_pred_train_poly)}")
print(f"The Mean Absolute Error of Train data with Polynomial Linear regression is {mean_absolute_error(y_train, y_pred_train_poly)}")
print(f"The Root Mean Squared Error of Train data with Polynomial Linear regression is {root_mean_squared_error(y_train, y_pred_train_poly)}")
print(f"\nThe R2 Score Train data with Linear Polynomial regression is {r2_score(y_train, y_pred_train_poly)}")

In [None]:
# Store the polynomial model's train-set metrics for the final comparison table.
# scikit-learn metrics expect (y_true, y_pred); the order matters for R².
Mean_Squared_Error_poly_train = mean_squared_error(y_train, y_pred_train_poly)
Mean_Absolute_Error_poly_train = mean_absolute_error(y_train, y_pred_train_poly)
Root_Mean_Squared_Error_poly_train = root_mean_squared_error(y_train, y_pred_train_poly)
R2_Score_train_data_poly = r2_score(y_train, y_pred_train_poly)
# Legacy (misleadingly named) alias, kept because the comparison table reads it.
R2_Score_Test_data_poly_train = R2_Score_train_data_poly

🔍 What it does:
Reports the same metrics as above on the training data, and stores the polynomial model's training metrics for the final comparison table.

📌 Why this is critical:
This allows you to compare the basic linear model against the polynomial version. If the polynomial model performs significantly better, it shows your data has non-linear patterns worth capturing.

# K Nearest Neighbor Regressor

In [None]:
from sklearn.neighbors import KNeighborsRegressor

🔍 What it does:
Imports the KNeighborsRegressor class from Scikit-learn, which implements KNN for regression tasks.

📌 Why this is critical:
KNN is a non-parametric, instance-based method. It predicts the target value of a new point by averaging the values of its $k$ nearest neighbors in the training data.

In [None]:
# Build a KNeighborsRegressor with its default constructor arguments and fit
# it on the training split.
knn_reg = KNeighborsRegressor()
knn_reg.fit(x_train, y_train)

🔍 What it does:

Instantiates a KNN regression model with default parameters (n_neighbors=5).

Trains the model by memorizing the training data (no actual fitting, just storing examples).

📌 Why this is critical:
KNN doesn’t “learn” a function in the traditional sense—it saves the training data and uses it to make predictions at test time by computing distances between points. Simple and effective when features are well-scaled and not too high-dimensional.

In [None]:
# KNN predictions for the test and train splits, used by the metric cells below.
y_pred_test_knn = knn_reg.predict(x_test)
y_pred_train_knn = knn_reg.predict(x_train)

🔍 What it does:

Predicts target values for both test and training datasets.

For each point, finds the $k$ nearest neighbors and averages their target values.

📌 Why this is critical:
Used for evaluating model performance. Predictions on both train and test sets help you measure how well the model memorizes and generalizes.

In [None]:
# Test-set metrics for the KNN model.
# scikit-learn metrics expect (y_true, y_pred); the order matters for R².
print(f"The Mean Squared Error of Test data with KNN Regression is {mean_squared_error(y_test, y_pred_test_knn)}")
print(f"The Mean Absolute Error of Test data with KNN Regression is {mean_absolute_error(y_test, y_pred_test_knn)}")
print(f"The Root Mean Squared Error of Test data with KNN Regression is {root_mean_squared_error(y_test, y_pred_test_knn)}")
print(f"\nThe R2 Score Test data with KNN Regression is {r2_score(y_test, y_pred_test_knn)}")

In [None]:
# Store the KNN test-set metrics for the final comparison table.
# scikit-learn metrics expect (y_true, y_pred); the order matters for R².
Mean_Squared_Error_knn_test = mean_squared_error(y_test, y_pred_test_knn)
Mean_Absolute_Error_knn_test = mean_absolute_error(y_test, y_pred_test_knn)
Root_Mean_Squared_Error_knn_test = root_mean_squared_error(y_test, y_pred_test_knn)
R2_Score_Test_data_knn = r2_score(y_test, y_pred_test_knn)

🔍 What it does:
Prints standard performance metrics on the test dataset:

MSE: Average squared prediction error.

MAE: Average absolute difference from true values.

RMSE: Easier-to-interpret version of MSE.

R²: Proportion of variance explained by the model.

📌 Why this is critical:
These metrics help you understand how accurate your predictions are on unseen data. Low errors and high R² indicate good generalization.

In [None]:
# Train-set metrics for the KNN model.
# scikit-learn metrics expect (y_true, y_pred); the order matters for R².
print(f"The Mean Squared Error of Train data with KNN Regression is {mean_squared_error(y_train, y_pred_train_knn)}")
print(f"The Mean Absolute Error of Train data with KNN Regression is {mean_absolute_error(y_train, y_pred_train_knn)}")
print(f"The Root Mean Squared Error of Train data with KNN Regression is {root_mean_squared_error(y_train, y_pred_train_knn)}")
print(f"\nThe R2 Score Train data with KNN Regression is {r2_score(y_train, y_pred_train_knn)}")

In [None]:
# Store the KNN train-set metrics for the final comparison table.
# scikit-learn metrics expect (y_true, y_pred); the order matters for R².
Mean_Squared_Error_knn_train = mean_squared_error(y_train, y_pred_train_knn)
Mean_Absolute_Error_knn_train = mean_absolute_error(y_train, y_pred_train_knn)
Root_Mean_Squared_Error_knn_train = root_mean_squared_error(y_train, y_pred_train_knn)
R2_Score_train_data_knn = r2_score(y_train, y_pred_train_knn)

🔍 What it does:
Calculates and prints the same evaluation metrics for the training data.

📌 Why this is critical:
Comparison between train and test errors helps you diagnose:

Overfitting: Very low train error, high test error.

Underfitting: High errors on both.

Good generalization: Similar and low errors.

# Decision Tree Regressor

In [None]:
from sklearn.tree import DecisionTreeRegressor

🔍 What it does:
Imports the DecisionTreeRegressor class from Scikit-learn.

📌 Why this is critical:
A Decision Tree Regressor splits your dataset into decision rules based on feature values to predict continuous output. It’s simple yet powerful—especially when your data has clear boundaries or non-linear relationships.

In [None]:
# Build a DecisionTreeRegressor with its default constructor arguments and
# fit it on the training split.
# NOTE(review): no random_state is passed, so results may differ slightly
# between runs — consider fixing a seed.
dt_reg = DecisionTreeRegressor()
dt_reg.fit(x_train, y_train)

🔍 What it does:

Instantiates a Decision Tree with default parameters.

Fits the model by building a tree of decisions that maps features in x_train to target values in y_train.

📌 Why this is critical:
Training constructs the actual decision tree by recursively splitting data to minimize prediction error. This step builds the core logic the model uses to make predictions.

In [None]:
# Decision-tree predictions for the test and train splits, used by the metric cells below.
y_pred_test_dt = dt_reg.predict(x_test)
y_pred_train_dt = dt_reg.predict(x_train)

🔍 What it does:
Generates predicted values using the trained decision tree for both testing and training sets.

📌 Why this is critical:
You need these predictions to evaluate how well the model performs on unseen data (test) and on what it has already learned (train).

In [None]:
# Test-set metrics for the decision tree.
# scikit-learn metrics expect (y_true, y_pred); the order matters for R².
print(f"The Mean Squared Error of Test data with Decision Tree regression is {mean_squared_error(y_test, y_pred_test_dt)}")
print(f"The Mean Absolute Error of Test data with Decision Tree regression is {mean_absolute_error(y_test, y_pred_test_dt)}")
print(f"The Root Mean Squared Error of Test data with Decision Tree regression is {root_mean_squared_error(y_test, y_pred_test_dt)}")
print(f"\nThe R2 Score Test data with Decision Tree regression is {r2_score(y_test, y_pred_test_dt)}")

In [None]:
# Store the decision-tree test-set metrics for the final comparison table.
# scikit-learn metrics expect (y_true, y_pred); the order matters for R².
Mean_Squared_Error_dt_test = mean_squared_error(y_test, y_pred_test_dt)
Mean_Absolute_Error_dt_test = mean_absolute_error(y_test, y_pred_test_dt)
Root_Mean_Squared_Error_dt_test = root_mean_squared_error(y_test, y_pred_test_dt)
R2_Score_Test_data_dt = r2_score(y_test, y_pred_test_dt)

🔍 What it does:
Prints the performance metrics for the test set.

📌 Why this is critical:

Decision Trees can easily overfit: they perform perfectly on training data but poorly on test data.

These metrics reveal whether the model has generalized well or is too tailored to the training set.

In [None]:
# Train-set metrics for the decision tree.
# scikit-learn metrics expect (y_true, y_pred); the order matters for R².
print(f"The Mean Squared Error of Train data with Decision Tree regression is {mean_squared_error(y_train, y_pred_train_dt)}")
print(f"The Mean Absolute Error of Train data with Decision Tree regression is {mean_absolute_error(y_train, y_pred_train_dt)}")
print(f"The Root Mean Squared Error of Train data with Decision Tree regression is {root_mean_squared_error(y_train, y_pred_train_dt)}")
print(f"\nThe R2 Score Train data with Decision Tree regression is {r2_score(y_train, y_pred_train_dt)}")

In [None]:
# Store the decision-tree train-set metrics for the final comparison table.
# scikit-learn metrics expect (y_true, y_pred); the order matters for R².
Mean_Squared_Error_dt_train = mean_squared_error(y_train, y_pred_train_dt)
Mean_Absolute_Error_dt_train = mean_absolute_error(y_train, y_pred_train_dt)
Root_Mean_Squared_Error_dt_train = root_mean_squared_error(y_train, y_pred_train_dt)
R2_Score_train_data_dt = r2_score(y_train, y_pred_train_dt)

🔍 What it does:
Prints the same metrics as above, but on the training set.

📌 Why this is critical:

Very low training error with high test error usually means overfitting.

Comparing these values helps you diagnose whether to apply tree pruning or use ensemble methods like Random Forest.

In [None]:
from sklearn.tree import plot_tree

# dt_reg was fitted on x_train, so split labels must come from x_train's
# columns. df is the raw trip table and is not guaranteed to have the same
# columns in the same order, which would mislabel the splits.
# NOTE(review): an unrestricted tree can be very large; passing max_depth to
# plot_tree would keep the figure readable if rendering becomes too slow.
plt.figure(figsize=(20,10))
plot_tree(dt_reg, filled=True, feature_names=list(x_train.columns))
plt.show()

🔍 What it does:
Plots the full decision tree with feature names and decision values.

📌 Why this is critical:
This gives you insight into how decisions are made—which features and values the tree splits on. It's great for interpretability, especially for small to medium-sized trees.

In [None]:

# One importance score per feature the tree was fitted on.
feature_importance = dt_reg.feature_importances_
# Take the names from x_train (the frame passed to dt_reg.fit) so that names
# and scores are guaranteed to line up one-to-one.
features = x_train.columns

# Ascending order of importance, applied to both scores and names.
sorted_idx = np.argsort(feature_importance)

plt.figure(figsize=(12, 6))
sns.barplot(x=feature_importance[sorted_idx], y=features[sorted_idx], palette="viridis")
plt.xlabel("Feature Importance Score")
plt.ylabel("Features")
plt.title("Feature Importance in Decision Tree")
plt.show()


🔍 What it does:
Plots the relative importance of each feature in the decision tree.

📌 Why this is critical:
This helps you:

Identify which features influence predictions the most.

Remove unimportant features to reduce complexity or overfitting.

Compare models (e.g., how SVM vs. tree treats features).

# SVR(Support Vector Regression)

In [None]:
from sklearn.svm import SVR

🔍 What it does:
Imports the SVR class (Support Vector Regression) from Scikit-learn's support vector machine module.

📌 Why this is critical:
SVR is a powerful algorithm that fits a regression line (or curve) within a margin of tolerance. It’s great for small- to medium-sized datasets, and works well in high-dimensional spaces.

In [None]:
# Support Vector Regressor created with its default constructor arguments.
svr = SVR()

🔍 What it does:
Creates an SVR model with default hyperparameters:

Kernel: 'rbf' (radial basis function)

C (penalty): 1.0

Epsilon (tolerance): 0.1

📌 Why this is critical:
The model is now ready to be trained. Default settings are often a starting point, but performance can be improved significantly by tuning C, epsilon, and kernel.

In [None]:
# Fit the SVR on the training split.
svr.fit(x_train, y_train)

🔍 What it does:
Fits the SVR model to the training data by finding a regression function that keeps errors within a certain margin (epsilon) as much as possible.

📌 Why this is critical:
This is the learning step. SVR doesn’t just minimize the error—it tries to balance the error and model complexity, making it less prone to overfitting.



In [None]:
# SVR predictions on the training split (used to compare train vs. test error).
y_pred_train_svr = svr.predict(x_train)

🔍 What it does:
Predicts the outputs for the same data the model was trained on.

📌 Why this is critical:
Used to assess how well the model fits the training set. Important for diagnosing underfitting or overfitting when compared with test performance.

In [None]:
# SVR predictions on the held-out test split.
y_pred_test_svr= svr.predict(x_test)

🔍 What it does:
Uses the trained SVR model to predict outcomes for unseen test data.

📌 Why this is critical:
Shows how well the model generalizes to new data. Generalization is the goal of all machine learning models.

In [None]:
# Test-set metrics for the SVR model.
# scikit-learn metrics expect (y_true, y_pred); the order matters for R².
print(f"The Mean Squared Error of Test data with svr is {mean_squared_error(y_test, y_pred_test_svr)}")
print(f"The Mean Absolute Error of Test data with svr is {mean_absolute_error(y_test, y_pred_test_svr)}")
print(f"The Root Mean Squared Error of Test data with svr is {root_mean_squared_error(y_test, y_pred_test_svr)}")
print(f"\nThe R2 Score Test data with svr is {r2_score(y_test, y_pred_test_svr)}")

In [None]:
# Store the SVR test-set metrics for the final comparison table.
# scikit-learn metrics expect (y_true, y_pred); the order matters for R².
Mean_Squared_Error_svr_test = mean_squared_error(y_test, y_pred_test_svr)
Mean_Absolute_Error_svr_test = mean_absolute_error(y_test, y_pred_test_svr)
Root_Mean_Squared_Error_svr_test = root_mean_squared_error(y_test, y_pred_test_svr)
R2_Score_Test_data_svr = r2_score(y_test, y_pred_test_svr)

🔍 What it does:
Prints common error metrics and the $R^2$ score for the test set.

📌 Why this is critical:
These values help you measure prediction quality:

MSE/MAE/RMSE tell you how far predictions are from actual values.

R² indicates how much variance in the target variable is explained by the model. A value close to 1 is ideal.

In [None]:
# Train-set metrics for the SVR model.
# scikit-learn metrics expect (y_true, y_pred); the order matters for R².
print(f"The Mean Squared Error of Train data with svr  is {mean_squared_error(y_train, y_pred_train_svr)}")
print(f"The Mean Absolute Error of Train data with svr  is {mean_absolute_error(y_train, y_pred_train_svr)}")
print(f"The Root Mean Squared Error of Train data with svr  is {root_mean_squared_error(y_train, y_pred_train_svr)}")
print(f"\nThe R2 Score Train data with svr  is {r2_score(y_train, y_pred_train_svr)}")

In [None]:
# Store the SVR train-set metrics for the final comparison table.
# scikit-learn metrics expect (y_true, y_pred); the order matters for R².
Mean_Squared_Error_svr_train = mean_squared_error(y_train, y_pred_train_svr)
Mean_Absolute_Error_svr_train = mean_absolute_error(y_train, y_pred_train_svr)
Root_Mean_Squared_Error_svr_train = root_mean_squared_error(y_train, y_pred_train_svr)
R2_Score_Train_data_svr = r2_score(y_train, y_pred_train_svr)

🔍 What it does:
Evaluates the same metrics for the training set.

📌 Why this is critical:
This comparison helps spot overfitting or underfitting:

If train score is very good but test is poor → overfitting.

If both scores are low → underfitting.

If both are high and similar → good model fit.

# Random Forest Regressor

In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV

🔍 What it does:
Imports RandomForestRegressor, an ensemble learning method that builds multiple decision trees and averages their predictions.

📌 Why this is critical:
Random Forests are robust, powerful, and handle non-linearity well. They often outperform single models and are less prone to overfitting than individual decision trees.

In [None]:
# Hyperparameter search space for the Random Forest grid search.
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    # 'auto' is no longer accepted by recent scikit-learn releases; for a
    # regressor it meant "use all features", which is spelled 1.0 today.
    'max_features': [1.0, 'sqrt', 'log2'],
    'bootstrap': [True, False]
}

🔍 What it does:
Creates a dictionary of settings for GridSearchCV to try:

n_estimators: how many trees in the forest.

max_depth: how deep each tree can grow.

min_samples_split & min_samples_leaf: control how trees split, affecting overfitting.

max_features: number of features considered at each split.

bootstrap: whether to use bootstrapping for sampling.

📌 Why this is critical:
Random forests have many hyperparameters, and tuning them is key to balancing bias vs variance and improving generalization.



In [None]:
# Base estimator handed to GridSearchCV in the next cell.
rfr = RandomForestRegressor()

🔍 What it does:
Initializes a random forest model with default settings (not yet trained or tuned).

📌 Why this is critical:
You need a base model to pass into the GridSearchCV. The actual training and tuning happen in the next steps.

In [None]:
# 3-fold grid search over param_grid for the random forest, using all
# available cores (n_jobs=-1) and ranking candidates by negative MSE.
grid_search = GridSearchCV(rfr, param_grid=param_grid, cv=3, n_jobs=-1, scoring='neg_mean_squared_error')

🔍 What it does:

Uses 3-fold cross-validation to try every combination of param_grid.

n_jobs=-1: uses all CPU cores to speed up search.

scoring='neg_mean_squared_error': lower MSE = better performance.

📌 Why this is critical:
This systematically searches for the best configuration of hyperparameters that minimizes error on validation splits.

In [None]:
# Run the grid search on the training split.
grid_search.fit(x_train, y_train)

🔍 What it does:
Trains models on different hyperparameter combinations and evaluates their performance using cross-validation.

📌 Why this is critical:
This step finds the best-performing model configuration based on training data, avoiding the need for manual trial-and-error.

In [None]:
# Report the best hyperparameter combination found by the random-forest search.
print("Best parameters:", grid_search.best_params_)

🔍 What it does:
Outputs the best combination of hyperparameters found during the grid search.

📌 Why this is critical:
These parameters should be used to retrain the final model. Creating the final model with default settings instead would ignore this optimized configuration.

In [None]:
# Build the final forest from the hyperparameters selected by the grid search
# above instead of falling back to the defaults.
best_rfr = RandomForestRegressor(**grid_search.best_params_)

🔍 What it does:
Creates the Random Forest model that is trained and evaluated as the final model in the following cells.

📌 Why this is critical (⚠️ Warning):
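The final model should be constructed from the best parameters found by GridSearchCV (for example `RandomForestRegressor(**grid_search.best_params_)`). A model created with default arguments is trained with default, not optimal, settings.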

In [None]:
# Fit the final random forest on the training split.
best_rfr.fit(x_train,y_train)

🔍 What it does:
Fits the random forest model to the training data.

📌 Why this is critical:
The model now learns patterns from the training set, using multiple decision trees and averaging their predictions.

In [None]:
# Random-forest predictions on the held-out test split.
y_pred_test_rfr= best_rfr.predict(x_test)

🔍 What it does:
Generates predictions on unseen test data.

📌 Why this is critical:
These predictions are used to evaluate how well your trained model performs on unseen data.

In [None]:
# Random-forest predictions on the training split (used to compare train vs. test error).
y_pred_train_rfr= best_rfr.predict(x_train)

🔍 What it does:
Makes predictions on the training data the model was trained on.

📌 Why this is critical:
Used to check for overfitting: if train error is much lower than test error, your model may not generalize well.



In [None]:
# Test-set metrics for the random forest.
# scikit-learn metrics expect (y_true, y_pred); the order matters for R².
print(f"The Mean Squared Error of Test data with rfr is {mean_squared_error(y_test, y_pred_test_rfr)}")
print(f"The Mean Absolute Error of Test data with rfr is {mean_absolute_error(y_test, y_pred_test_rfr)}")
print(f"The Root Mean Squared Error of Test data with rfr is {root_mean_squared_error(y_test, y_pred_test_rfr)}")
print(f"\nThe R2 Score Test data with rfr is {r2_score(y_test, y_pred_test_rfr)}")

In [None]:
# Store the random-forest test-set metrics for the final comparison table.
# scikit-learn metrics expect (y_true, y_pred); the order matters for R².
Mean_Squared_Error_rfr_test = mean_squared_error(y_test, y_pred_test_rfr)
Mean_Absolute_Error_rfr_test = mean_absolute_error(y_test, y_pred_test_rfr)
Root_Mean_Squared_Error_rfr_test = root_mean_squared_error(y_test, y_pred_test_rfr)
R2_Score_Test_data_rfr = r2_score(y_test, y_pred_test_rfr)

🔍 What it does:
Prints error metrics and the $R^2$ score for the test set.

📌 Why this is critical:
These numbers tell you how well your model generalizes. You want low error values and high R², which means more variance in the target is explained.

In [None]:
# Train-set metrics for the random forest.
# scikit-learn metrics expect (y_true, y_pred); the order matters for R².
print(f"The Mean Squared Error of Train data with rfr  is {mean_squared_error(y_train, y_pred_train_rfr)}")
print(f"The Mean Absolute Error of Train data with rfr  is {mean_absolute_error(y_train, y_pred_train_rfr)}")
print(f"The Root Mean Squared Error of Train data with rfr  is {root_mean_squared_error(y_train, y_pred_train_rfr)}")
print(f"\nThe R2 Score Train data with rfr  is {r2_score(y_train, y_pred_train_rfr)}")

In [None]:
# Store the random-forest train-set metrics for the final comparison table.
# scikit-learn metrics expect (y_true, y_pred); the order matters for R².
Mean_Squared_Error_rfr_train = mean_squared_error(y_train, y_pred_train_rfr)
Mean_Absolute_Error_rfr_train = mean_absolute_error(y_train, y_pred_train_rfr)
Root_Mean_Squared_Error_rfr_train = root_mean_squared_error(y_train, y_pred_train_rfr)
R2_Score_Train_data_rfr = r2_score(y_train, y_pred_train_rfr)

🔍 What it does:
Prints same metrics for the training set.

📌 Why this is critical:
Comparing with test scores helps detect overfitting. For Random Forests, it's common to have near-perfect training accuracy—but too much of a gap is a red flag.

# MLP regression

In [None]:
from sklearn.neural_network import MLPRegressor

🔍 What it does:
Imports MLPRegressor, a feedforward neural network model for regression tasks from Scikit-learn.

📌 Why this is critical:
This model can learn complex non-linear relationships by adjusting multiple layers of neurons—more powerful than linear models, especially with enough data.

In [None]:
# Hyperparameter search space for the MLP grid search. This rebinds the
# param_grid name that the Random Forest section used earlier.
# The grid has 4 * 3 * 3 * 3 * 3 = 324 combinations.
param_grid = {
    'hidden_layer_sizes': [(50,), (100,), (100, 50), (100, 100)],  # one or two hidden layers
    'activation': ['relu', 'tanh', 'logistic'],
    'solver': ['adam', 'lbfgs', 'sgd'],
    'alpha': [0.0001, 0.001, 0.01],
    'learning_rate': ['constant', 'invscaling', 'adaptive'],
}

🔍 What it does:
Creates a dictionary of hyperparameters to try for tuning the MLPRegressor:

hidden_layer_sizes: different architectures (1 or 2 layers with different neuron counts).

activation: function applied to neuron outputs (e.g., ReLU, tanh, logistic).

solver: optimization algorithm (e.g., Adam, L-BFGS, SGD).

alpha: regularization strength to prevent overfitting.

learning_rate: strategy for updating weights.

📌 Why this is critical:
Hyperparameter tuning is essential for neural networks. The right combination can dramatically improve performance. Without it, the model might underfit or overfit.

In [None]:
# Base estimator handed to GridSearchCV in the next cell.
mlp = MLPRegressor()

🔍 What it does:
Instantiates a base MLPRegressor model using default parameters.

📌 Why this is critical:
You need a base model to pass into the GridSearchCV for tuning. This step prepares the model structure but doesn't train it yet.

In [None]:
# 3-fold grid search over param_grid for the MLP, using all available cores
# (n_jobs=-1) and ranking candidates by negative MSE.
grid_search = GridSearchCV(mlp, param_grid=param_grid, cv=3, n_jobs=-1, scoring='neg_mean_squared_error')
# The search must be fitted before best_params_ exists; without this call the
# following cell fails because nothing has been searched yet.
grid_search.fit(x_train, y_train)

🔍 What it does:

Runs grid search over all combinations in param_grid.

Uses 3-fold cross-validation to test performance.

n_jobs=-1: uses all CPU cores to speed up search.

scoring='neg_mean_squared_error': the lower the MSE, the better the model.

📌 Why this is critical:
This step automates model tuning. It finds the best combination of hyperparameters based on cross-validated performance on the training data—essential for reliable and accurate predictions.

In [None]:
# Report the best hyperparameter combination found by the MLP grid search
# (the search must have been fitted first).
print("Best parameters:", grid_search.best_params_)

🔍 What it does:
Prints the best combination of hyperparameters found by GridSearchCV.

📌 Why this is critical:
You now know which architecture and settings give the lowest error on validation data. This helps guide how you build the final model.

In [None]:
# Build the final MLP from the hyperparameters selected by the grid search
# instead of falling back to the defaults (requires a fitted grid_search).
best_mlp = MLPRegressor(**grid_search.best_params_)

🔍 What it does:
Creates a new MLPRegressor. Note: You should ideally initialize it with the best_params_ found above.

📌 Why this is critical:
This step sets up the model that you’ll train and evaluate.

In [None]:
# Fit the final MLP on the training split.
best_mlp.fit(x_train,y_train)

🔍 What it does:
Trains the neural network on the training dataset using backpropagation.

📌 Why this is critical:
This is where the model learns the mapping from input features to the target variable by adjusting internal weights over multiple iterations (epochs).

In [None]:
# MLP predictions on the held-out test split.
y_pred_test_mlp= best_mlp.predict(x_test)

🔍 What it does:
Generates predictions on the test dataset using the trained neural network.

📌 Why this is critical:
Test predictions are used to evaluate how well the model generalizes to unseen data.

In [None]:
# MLP predictions on the training split (used to compare train vs. test error).
y_pred_train_mlp= best_mlp.predict(x_train)

🔍 What it does:
Generates predictions on the same data the model was trained on.

📌 Why this is critical:
Used to evaluate overfitting: If training error is low but test error is high, the model is too tailored to the training data.



In [None]:
# Test-set metrics for the MLP model.
# scikit-learn metrics expect (y_true, y_pred); the order matters for R².
print(f"The Mean Squared Error of Test data with mlp is {mean_squared_error(y_test, y_pred_test_mlp)}")
print(f"The Mean Absolute Error of Test data with mlp is {mean_absolute_error(y_test, y_pred_test_mlp)}")
print(f"The Root Mean Squared Error of Test data with mlp is {root_mean_squared_error(y_test, y_pred_test_mlp)}")
print(f"\nThe R2 Score Test data with mlp is {r2_score(y_test, y_pred_test_mlp)}")

🔍 What it does:
Prints standard error metrics and the $R^2$ score for the test data.

📌 Why this is critical:
These values tell you how well your neural network performs on unseen data. A good model should have:

Low MSE, MAE, RMSE.

High $R^2$, ideally close to 1.

In [None]:
# Train-set metrics for the MLP model.
# scikit-learn metrics expect (y_true, y_pred); the order matters for R².
print(f"The Mean Squared Error of Train data with mlp  is {mean_squared_error(y_train, y_pred_train_mlp)}")
print(f"The Mean Absolute Error of Train data with mlp  is {mean_absolute_error(y_train, y_pred_train_mlp)}")
print(f"The Root Mean Squared Error of Train data with mlp  is {root_mean_squared_error(y_train, y_pred_train_mlp)}")
print(f"\nThe R2 Score Train data with mlp  is {r2_score(y_train, y_pred_train_mlp)}")

🔍 What it does:
Prints the same metrics for training data.

📌 Why this is critical:
By comparing train vs. test performance, you can identify:

Overfitting: Low train error, high test error.

Underfitting: High errors on both train and test.

Well-fit model: Similar and low errors on both.

In [None]:
import pandas as pd

# Column order of the comparison table; every row tuple below follows it.
metric_columns = [
    "Model",
    "MSE Test", "MSE Train",
    "MAE Test", "MAE Train",
    "RMSE Test", "RMSE Train",
    "R2 Test", "R2 Train",
]

# One tuple per model, built from the metric variables stored in the sections above.
# NOTE(review): the MLP model is not listed because its metrics are only
# printed in the cells above, never stored in variables.
model_rows = [
    ("Linear",
     Mean_Squared_Error_lr_test, Mean_Squared_Error_lr_train,
     Mean_Absolute_Error_lr_test, Mean_Absolute_Error_lr_train,
     Root_Mean_Squared_Error_lr_test, Root_Mean_Squared_Error_lr_train,
     R2_Score_Test_data, R2_Score_train_data),
    ("Ridge",
     Mean_Squared_Error_ridge_test, Mean_Squared_Error_ridge_train,
     Mean_Absolute_Error_ridge_test, Mean_Absolute_Error_ridge_train,
     Root_Mean_Squared_Error_ridge_test, Root_Mean_Squared_Error_ridge_train,
     R2_Score_Test_data_ridge, R2_Score_train_data_ridge),
    ("Lasso",
     Mean_Squared_Error_lasso_test, Mean_Squared_Error_lasso_train,
     Mean_Absolute_Error_lasso_test, Mean_Absolute_Error_lasso_train,
     Root_Mean_Squared_Error_lasso_test, Root_Mean_Squared_Error_lasso_train,
     R2_Score_Test_data_lasso, R2_Score_train_data_lasso),
    ("Polynomial",
     Mean_Squared_Error_poly_test, Mean_Squared_Error_poly_train,
     Mean_Absolute_Error_poly_test, Mean_Absolute_Error_poly_train,
     Root_Mean_Squared_Error_poly_test, Root_Mean_Squared_Error_poly_train,
     # Despite "Test" in its name, this variable holds the *train* R².
     R2_Score_Test_data_poly, R2_Score_Test_data_poly_train),
    ("KNN",
     Mean_Squared_Error_knn_test, Mean_Squared_Error_knn_train,
     Mean_Absolute_Error_knn_test, Mean_Absolute_Error_knn_train,
     Root_Mean_Squared_Error_knn_test, Root_Mean_Squared_Error_knn_train,
     R2_Score_Test_data_knn, R2_Score_train_data_knn),
    ("Decision Tree",
     Mean_Squared_Error_dt_test, Mean_Squared_Error_dt_train,
     Mean_Absolute_Error_dt_test, Mean_Absolute_Error_dt_train,
     Root_Mean_Squared_Error_dt_test, Root_Mean_Squared_Error_dt_train,
     R2_Score_Test_data_dt, R2_Score_train_data_dt),
    ("SVR",
     Mean_Squared_Error_svr_test, Mean_Squared_Error_svr_train,
     Mean_Absolute_Error_svr_test, Mean_Absolute_Error_svr_train,
     Root_Mean_Squared_Error_svr_test, Root_Mean_Squared_Error_svr_train,
     R2_Score_Test_data_svr, R2_Score_Train_data_svr),
    ("Random Forest",
     Mean_Squared_Error_rfr_test, Mean_Squared_Error_rfr_train,
     Mean_Absolute_Error_rfr_test, Mean_Absolute_Error_rfr_train,
     Root_Mean_Squared_Error_rfr_test, Root_Mean_Squared_Error_rfr_train,
     R2_Score_Test_data_rfr, R2_Score_Train_data_rfr),
]

# Pivot the rows into the column-oriented dict the DataFrame is built from.
metrics = {
    column: [row[position] for row in model_rows]
    for position, column in enumerate(metric_columns)
}

df_metrics = pd.DataFrame(metrics)
display(df_metrics)



In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Set the style
sns.set(style="whitegrid")

# Reshape to long form (one row per model and split) so that train and test
# bars are drawn side by side. Drawing two barplots on the same axes puts the
# second series on top of the first and hides whichever bar is shorter.
r2_long = (
    df_metrics[["Model", "R2 Test", "R2 Train"]]
    .rename(columns={"R2 Test": "Test", "R2 Train": "Train"})
    .melt(id_vars="Model", var_name="Split", value_name="R2")
)

# Plot R2 scores as grouped bars
fig, ax = plt.subplots(figsize=(12, 6))
sns.barplot(
    data=r2_long,
    x="Model",
    y="R2",
    hue="Split",
    palette={"Test": "salmon", "Train": "lightblue"},
    ax=ax,
)
ax.set_title("R² Score Comparison")
ax.set_ylabel("R² Score")
plt.xticks(rotation=45)
fig.tight_layout()
plt.show()