<a href="https://colab.research.google.com/github/sarab421/ML-Regression-Algorithms/blob/main/regressionalgorithmscomparison.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Loading and Preprocessing the Data

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer

# Read the raw house-price table.
# NOTE(review): absolute path — presumably a mounted Google Drive folder in Colab;
# it will not resolve in other environments.
df = pd.read_csv(
    '/content/drive/MyDrive/Colab Notebooks/Assignment1 8th semester/House_Price.csv'
)

# Sanity check: show the first five rows.
print(df.head())

# 'price' is the regression target; every remaining column is a feature.
y = df['price']
X = df.drop(columns='price')

# The three string-valued columns are one-hot encoded; all other feature
# columns are routed through the numeric branch.
categorical_columns = ['airport', 'waterbody', 'bus_ter']
numerical_columns = [name for name in X.columns if name not in categorical_columns]

# Numeric branch: fill gaps with the column mean, then standardize.
numerical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler()),
])

# Categorical branch: fill gaps with the literal 'missing', then one-hot encode.
# handle_unknown='ignore' keeps transform from failing on categories that were
# not present when the encoder was fitted.
categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

# Route each column group through its own branch.
preprocessor = ColumnTransformer([
    ('num', numerical_transformer, numerical_columns),
    ('cat', categorical_transformer, categorical_columns),
])

# Hold out 20% of the rows for evaluation; the seed fixes the split.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)


   price  crime_rate  resid_area  air_qual  room_num   age  dist1  dist2  \
0   24.0     0.00632       32.31     0.538     6.575  65.2   4.35   3.81   
1   21.6     0.02731       37.07     0.469     6.421  78.9   4.99   4.70   
2   34.7     0.02729       37.07     0.469     7.185  61.1   5.03   4.86   
3   33.4     0.03237       32.18     0.458     6.998  45.8   6.21   5.93   
4   36.2     0.06905       32.18     0.458     7.147  54.2   6.16   5.86   

   dist3  dist4  teachers  poor_prop airport  n_hos_beds  n_hot_rooms  \
0   4.18   4.01      24.7       4.98     YES       5.480      11.1920   
1   5.12   5.06      22.2       9.14      NO       7.332      12.1728   
2   5.01   4.97      22.2       4.03      NO       7.394     101.1200   
3   6.16   5.96      21.3       2.94     YES       9.268      11.2672   
4   6.37   5.86      21.3       5.33      NO       8.824      11.2896   

  waterbody  rainfall bus_ter     parks  
0     River        23     YES  0.049347  
1      Lake        4

# Training with a Decision Tree

In [None]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error

# Initialize the Decision Tree regressor.
# random_state is pinned (same seed as the train/test split) because the tree
# randomly permutes features when searching for splits; without a fixed seed
# the fitted tree, and therefore the reported MSE, can change between runs.
dt_reg = DecisionTreeRegressor(random_state=42)

# Chain the preprocessing step and the regressor so both are fitted on the
# training split only.
pipeline_dt = Pipeline(steps=[('preprocessor', preprocessor),
                              ('regressor', dt_reg)])

# Train the model
pipeline_dt.fit(X_train, y_train)

# Predict on the held-out test set
y_pred_dt = pipeline_dt.predict(X_test)

# Evaluate the model: mean squared error on the test set (lower is better)
mse_dt = mean_squared_error(y_test, y_pred_dt)
print(f'Decision Tree Regression Mean Squared Error: {mse_dt}')


Decision Tree Regression Mean Squared Error: 15.636372549019608


# Training with Linear Regression

In [None]:
from sklearn.linear_model import LinearRegression

# Linear model, fitted on the output of the preprocessing step.
lr = LinearRegression()
pipeline_lr = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', lr),
])

# Fit preprocessing + regressor on the training split.
pipeline_lr.fit(X_train, y_train)

# Score on the held-out split: predictions first, then mean squared error.
y_pred_lr = pipeline_lr.predict(X_test)
mse_lr = mean_squared_error(y_test, y_pred_lr)
print(f'Linear Regression Mean Squared Error: {mse_lr}')


Linear Regression Mean Squared Error: 26.040081169598626


# Training with K-Nearest Neighbors (KNN)

In [None]:
from sklearn.neighbors import KNeighborsRegressor

# K-nearest-neighbours regressor using the 6 closest training samples.
knn_reg = KNeighborsRegressor(n_neighbors=6)
pipeline_knn = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', knn_reg),
])

# Fit preprocessing + regressor on the training split.
pipeline_knn.fit(X_train, y_train)

# Score on the held-out split: predictions first, then mean squared error.
y_pred_knn = pipeline_knn.predict(X_test)
mse_knn = mean_squared_error(y_test, y_pred_knn)
print(f'K-Nearest Neighbors Regression Mean Squared Error: {mse_knn}')


K-Nearest Neighbors Regression Mean Squared Error: 22.180514705882352


# Final Comparison

In [None]:
# Show the three test-set MSE values together, one per line (lower is better).
print(
    f'Decision Tree Regression Mean Squared Error: {mse_dt}',
    f'Linear Regression Mean Squared Error: {mse_lr}',
    f'K-Nearest Neighbors Regression Mean Squared Error: {mse_knn}',
    sep='\n',
)


Decision Tree Regression Mean Squared Error: 15.636372549019608
Linear Regression Mean Squared Error: 26.040081169598626
K-Nearest Neighbors Regression Mean Squared Error: 22.180514705882352
