In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# importing the models
from sklearn.linear_model import LinearRegression,Ridge,Lasso
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor,GradientBoostingRegressor,AdaBoostRegressor,ExtraTreesRegressor
from sklearn.neighbors import KNeighborsRegressor

# pipeline library
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer

# importing the metrics
import sklearn.metrics as metrics
from sklearn.metrics import mean_absolute_error, mean_squared_error


In [None]:
# Read the training data from the CSV file into a DataFrame
train_csv = 'train.csv'
df = pd.read_csv(train_csv)

In [None]:
# Show the first five rows of the raw data
df.head(n=5)

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


In [None]:
# Print each column's dtype and non-null count
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 81 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Id             1460 non-null   int64  
 1   MSSubClass     1460 non-null   int64  
 2   MSZoning       1460 non-null   object 
 3   LotFrontage    1201 non-null   float64
 4   LotArea        1460 non-null   int64  
 5   Street         1460 non-null   object 
 6   Alley          91 non-null     object 
 7   LotShape       1460 non-null   object 
 8   LandContour    1460 non-null   object 
 9   Utilities      1460 non-null   object 
 10  LotConfig      1460 non-null   object 
 11  LandSlope      1460 non-null   object 
 12  Neighborhood   1460 non-null   object 
 13  Condition1     1460 non-null   object 
 14  Condition2     1460 non-null   object 
 15  BldgType       1460 non-null   object 
 16  HouseStyle     1460 non-null   object 
 17  OverallQual    1460 non-null   int64  
 18  OverallC

In [None]:
# (rows, columns) of the raw data
df.shape

(1460, 81)

*Many columns in this dataset contain a large share of NULL values, so we drop the columns that are mostly missing.*

In [None]:
# Keep only the columns whose fraction of missing values is below the threshold

missing_thresold = 0.5
null_fraction = df.isnull().mean()
clean_df = df.loc[:, null_fraction < missing_thresold]

clean_df.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,LotShape,LandContour,Utilities,LotConfig,...,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,Reg,Lvl,AllPub,Inside,...,0,0,0,0,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,Reg,Lvl,AllPub,FR2,...,0,0,0,0,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,IR1,Lvl,AllPub,Inside,...,0,0,0,0,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,IR1,Lvl,AllPub,Corner,...,272,0,0,0,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,IR1,Lvl,AllPub,FR2,...,0,0,0,0,0,12,2008,WD,Normal,250000


In [None]:
# Keep only the columns that hold more than one distinct value
distinct_counts = clean_df.nunique()
new_df = clean_df.loc[:, distinct_counts > 1]

In [None]:
# Show the first five rows of the cleaned data
new_df.head(n=5)

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,LotShape,LandContour,Utilities,LotConfig,...,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,Reg,Lvl,AllPub,Inside,...,0,0,0,0,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,Reg,Lvl,AllPub,FR2,...,0,0,0,0,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,IR1,Lvl,AllPub,Inside,...,0,0,0,0,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,IR1,Lvl,AllPub,Corner,...,272,0,0,0,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,IR1,Lvl,AllPub,FR2,...,0,0,0,0,0,12,2008,WD,Normal,250000


In [None]:
# Count the missing values remaining in each column
new_df.isna().sum()

Id                 0
MSSubClass         0
MSZoning           0
LotFrontage      259
LotArea            0
                ... 
MoSold             0
YrSold             0
SaleType           0
SaleCondition      0
SalePrice          0
Length: 76, dtype: int64

In [None]:
# Split the cleaned data into the feature matrix X and the target y.
# Both are built from `new_df` (mostly-missing and constant columns already
# dropped) rather than the raw `df`, so the cleaning steps reach the models.

X = new_df.drop(columns=['SalePrice', 'Id'])
y = new_df['SalePrice']

In [None]:
# Separate the feature names into numeric and categorical (object-dtype) groups

numeric_dtypes = ['int64', 'float64']
numerical_col = X.select_dtypes(include=numeric_dtypes).columns
categorical_col = X.select_dtypes(include='object').columns

In [None]:
# Numerical pipeline: fill missing values with the column median, then standardise
numerical_steps = [
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
]
numerical_transformer = Pipeline(steps=numerical_steps)

In [None]:
# Categorical pipeline: fill missing values with the most frequent category,
# then one-hot encode (the step keeps its existing 'labelencode' name)
categorical_steps = [
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('labelencode', OneHotEncoder(handle_unknown='ignore')),
]
categorical_transformer = Pipeline(steps=categorical_steps)

In [None]:
# Route each group of columns through its own preprocessing pipeline
column_routes = [
    ('numerical', numerical_transformer, numerical_col),
    ('categorical', categorical_transformer, categorical_col),
]
preprocesing = ColumnTransformer(transformers=column_routes)

In [None]:
# Candidate models for the comparison, as (display name, estimator) pairs.
# The tree, forest, gradient-boosting and AdaBoost estimators get a fixed
# random_state so that re-running the notebook reproduces the same scores.
regressors = [
    ('Linear Regression', LinearRegression()),
    ('Ridge Regression', Ridge()),
    ('Lasso Regression', Lasso()),
    ('Support Vector Regressor', SVR()),
    ('Decision Tree Regressor', DecisionTreeRegressor(random_state=42)),
    ('Random Forest Regressor', RandomForestRegressor(n_estimators=100, random_state=42)),
    ('Gradient Boosting Regressor', GradientBoostingRegressor(random_state=42)),
    ('AdaBoost Regressor', AdaBoostRegressor(random_state=42)),
    ('KNeighbors Regressor', KNeighborsRegressor())
]


In [None]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable
split_parts = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts

In [None]:
# Accumulator for one (name, MAE, MSE, RMSE) tuple per trained model
results = list()



In [None]:
# Start from an empty list so re-running this cell does not append duplicate rows
results = []

for name, model in regressors:
    # Chain the preprocessing with the current regressor
    pipeline = Pipeline(steps=[
        ('preprocessor', preprocesing),
        ('regressor', model)
    ])

    # Train on the training split only
    pipeline.fit(X_train, y_train)

    # Predict on the held-out test split
    y_pred = pipeline.predict(X_test)

    # Error metrics; RMSE is the square root of MSE
    mae = mean_absolute_error(y_test, y_pred)
    mse = mean_squared_error(y_test, y_pred)
    rmse = np.sqrt(mse)

    results.append((name, mae, mse, rmse))

In [None]:
# Raw list of (name, MAE, MSE, RMSE) tuples, one per model
results

[('Linear Regression',
  18287.88167363158,
  868713884.8146876,
  29473.952650004165),
 ('Ridge Regression', 19003.84885712347, 890544762.370021, 29841.99662170782),
 ('Lasso Regression',
  17921.055465325546,
  797537164.4762504,
  28240.70049549498),
 ('Support Vector Regressor',
  59514.653063875405,
  7854451117.644461,
  88625.34128365577),
 ('Decision Tree Regressor',
  28655.11301369863,
  1892541084.7979453,
  43503.34567361395),
 ('Random Forest Regressor',
  17408.137191780825,
  808384987.248317,
  28432.11190271164),
 ('Gradient Boosting Regressor',
  16622.38124999856,
  688705646.9840558,
  26243.20191943155),
 ('AdaBoost Regressor',
  24349.642758728638,
  1170444247.1792407,
  34211.75597918413),
 ('KNeighbors Regressor',
  22604.31095890411,
  1499188293.1435616,
  38719.35295357557)]

In [None]:
# Tabulate the per-model metrics; the 'RSME' label is kept as-is because later cells sort by it
metric_columns = ['Model', 'MAE', 'MSE', 'RSME']
results_df = pd.DataFrame(results, columns=metric_columns)

In [None]:
# Metrics table in the order the models were evaluated
results_df

Unnamed: 0,Model,MAE,MSE,RSME
0,Linear Regression,18287.881674,868713900.0,29473.95265
1,Ridge Regression,19003.848857,890544800.0,29841.996622
2,Lasso Regression,17921.055465,797537200.0,28240.700495
3,Support Vector Regressor,59514.653064,7854451000.0,88625.341284
4,Decision Tree Regressor,28655.113014,1892541000.0,43503.345674
5,Random Forest Regressor,17408.137192,808385000.0,28432.111903
6,Gradient Boosting Regressor,16622.38125,688705600.0,26243.201919
7,AdaBoost Regressor,24349.642759,1170444000.0,34211.755979
8,KNeighbors Regressor,22604.310959,1499188000.0,38719.352954


In [None]:
# Rank the models from lowest to highest RMSE (the column is labelled 'RSME')
# and renumber the rows. The frame is rebound rather than mutated with inplace=True.
results_df = results_df.sort_values(by='RSME').reset_index(drop=True)

In [None]:
# Metrics table ranked by the 'RSME' column, best model first
results_df

Unnamed: 0,Model,MAE,MSE,RSME
0,Gradient Boosting Regressor,16622.38125,688705600.0,26243.201919
1,Lasso Regression,17921.055465,797537200.0,28240.700495
2,Random Forest Regressor,17408.137192,808385000.0,28432.111903
3,Linear Regression,18287.881674,868713900.0,29473.95265
4,Ridge Regression,19003.848857,890544800.0,29841.996622
5,AdaBoost Regressor,24349.642759,1170444000.0,34211.755979
6,KNeighbors Regressor,22604.310959,1499188000.0,38719.352954
7,Decision Tree Regressor,28655.113014,1892541000.0,43503.345674
8,Support Vector Regressor,59514.653064,7854451000.0,88625.341284


*Gradient Boosting Regressor has the lowest error (MAE, MSE and RMSE) of all the models compared, so we select it for the final prediction.*

In [None]:
# True sale prices of the held-out test rows, indexed by original row label
y_test

892     154500
1105    325000
413     115000
522     159000
1036    315500
         ...  
479      89471
1361    260000
802     189000
651     108000
722     124500
Name: SalePrice, Length: 292, dtype: int64

In [None]:
# Refit the selected model on its own; the fixed seed makes the fit repeatable
model = GradientBoostingRegressor(random_state=42)

pipeline = Pipeline(steps=[
    ('preprocessor', preprocesing),
    ('regressor', model)
])

# Train the model
pipeline.fit(X_train, y_train)

# Make predictions
y_pred = pipeline.predict(X_test)

In [None]:
# Preview the first few predictions instead of dumping the whole array
y_pred[:10]

array([145103.18075161, 340632.99655754, 120814.71254643, 150775.19405581,
       339124.90846162,  82869.69172067, 220202.10207098, 138662.13871738,
        82869.69172067, 135697.04788611, 158770.46827569, 122845.42250347,
       108834.65881095, 199412.48110379, 171024.42645538, 135700.32229078,
       193014.16716174, 142148.58670623, 117591.22063084, 208783.0519867 ,
       165054.27759555, 222168.4186358 , 169592.71839091, 125013.09727515,
       197398.11267153, 162498.52697567, 194762.05970023, 110856.09498119,
       176535.20245211, 200515.49462887, 115856.99532662, 245576.54986728,
       246843.03076488, 115700.71645282, 249430.53802887, 149600.80041011,
       130713.53375339, 207894.21462217, 321681.40804986, 107859.35577736,
       125197.91202731, 239680.06232473, 117341.47329963, 380728.22094846,
       126393.55983597, 132445.39981636, 113787.92159146, 128081.2674556 ,
       425338.04602831, 130420.91533306, 120119.3269361 , 202092.57316242,
       115165.042775  , 3

In [None]:
# True sale price of the test row whose index label is 802
y_test.loc[802]

189000

In [None]:
# Prediction for the same row (label 802): y_pred is positional, so look up
# the row's position in y_test instead of hard-coding an offset
y_pred[y_test.index.get_loc(802)]

183866.79656596162