In [7]:
import math

import numpy as np
import pandas as pd
import seaborn as sns

from collections import Counter

# Linear Models
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.linear_model import ElasticNet
from sklearn.linear_model import BayesianRidge
from sklearn.linear_model import Lars
from sklearn.linear_model import LassoLars
from sklearn.linear_model import OrthogonalMatchingPursuit
from sklearn.linear_model import SGDRegressor
from sklearn.linear_model import PassiveAggressiveRegressor
from sklearn.linear_model import RANSACRegressor
from sklearn.linear_model import TheilSenRegressor

# Support Vector Machine
from sklearn.svm import SVR

# Decision Trees and Ensemble Methods
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.ensemble import BaggingRegressor
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.ensemble import HistGradientBoostingRegressor

# Nearest Neighbors
from sklearn.neighbors import KNeighborsRegressor

# Gaussian Processes
from sklearn.gaussian_process import GaussianProcessRegressor

# Neural Networks
from sklearn.neural_network import MLPRegressor

# Kernel Ridge Regression
from sklearn.kernel_ridge import KernelRidge

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score

from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder

data_path = r'C:\Users\ShashankPC\Downloads\HousePricesART\train.csv'
dataDF = pd.read_csv(data_path)

In [11]:
# sns.histplot(data=dataDF['MSZoning'])
# sns.histplot(data=dataDF['MSSubClass'])
# sns.histplot(data=dataDF['LotFrontage'])
# sns.histplot(data=dataDF['LotArea'])
# sns.histplot(data=dataDF['Street'])
# sns.histplot(data=dataDF['Alley'])
# sns.histplot(data=dataDF['LotShape'])
# sns.histplot(data=dataDF['LandContour'])
# sns.histplot(data=dataDF['Utilities'])
# sns.histplot(data=dataDF['LotConfig'])

In [58]:
dataDF.select_dtypes(include=np.number).shape, dataDF.select_dtypes(include=object).shape

dataDF_dtnumber = dataDF.select_dtypes(include=np.number)
dataDF_dtobject = dataDF.select_dtypes(include=object)
dataDF_dtnumber.shape, dataDF_dtobject.shape

dataDF_dtobject.isna().sum()

dataDF_dtobject.LotShape.value_counts()

label_encoder_LotShape = LabelEncoder()

LotShape_labelencoded = label_encoder_LotShape.fit(dataDF_dtobject.LotShape).transform(dataDF_dtobject.LotShape)
pd.Series(LotShape_labelencoded).value_counts()

Counter(LotShape_labelencoded)

dataDF_dtobject_labelencoded = dataDF_dtobject.apply(lambda x: LabelEncoder().fit_transform(x))
dataDF_dtobject_labelencoded.shape

((1460, 38), (1460, 43))

In [59]:
models = {
    "Linear Regression": LinearRegression(),
    "Ridge Regression": Ridge(),
    "Lasso Regression": Lasso(),
    "Decision Tree": DecisionTreeRegressor(),
    "Random Forest": RandomForestRegressor(),
    "Support Vector Regressor": SVR(),
    "K-Neighbors Regressor": KNeighborsRegressor(),
    "Gradient Boosting Regressor" : GradientBoostingRegressor()
}

# Function to evaluate the model
def evaluate_model(name, model, X_train, X_test, y_train, y_test):
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    mse = mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    print(f"{name}:\n\tMean Squared Error: {mse}\n\tR^2 Score: {r2}\n")


In [205]:
# pd.set_option('display.max_rows', None)
# dataDF.is(0).sum()
# Counter(dataDF['LotArea']).is(0)
dataDF1 = dataDF[[
    'LotArea'
    ,'LotFrontage'
    , 'MoSold'
    , 'YrSold'
    ,'TotalBsmtSF'
    , 'MasVnrArea'
    ,'GrLivArea'
    ,'BsmtFullBath'
    ,'BsmtHalfBath'
    ,'FullBath'
    ,'HalfBath'
    ,'BedroomAbvGr'
    ,'KitchenAbvGr'
    ,'TotRmsAbvGrd'
    ,'Fireplaces'
    ,'GarageArea'
    ,'GarageCars'
    ,'GarageYrBlt'
    ,'ScreenPorch'
    ,'3SsnPorch'
    ,'EnclosedPorch'
    ,'OpenPorchSF'
    ,'WoodDeckSF'
    # ,'PoolArea'  decrese accuresy
    # ,'MiscVal'  
    ,'SalePrice']].copy()
dataDF1 = dataDF1.dropna( axis=0)
dataDF1.shape

(1121, 24)

In [218]:
X = dataDF1.drop('SalePrice', axis=1)
y = dataDF1['SalePrice']
y = y.apply(lambda x: (math.log(x)))
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [261]:
dataDF_dtnumber = dataDF.select_dtypes(include=np.number)
dataDF_dtobject = dataDF.select_dtypes(include=object)
label_encoder_LotShape = LabelEncoder()
dataDF_dtobject_labelencoded = dataDF_dtobject.apply(lambda x: LabelEncoder().fit_transform(x))
dataDF_dtobject_labelencoded.shape


X = dataDF_dtobject_labelencoded
y = dataDF_dtnumber['SalePrice']
y = y.apply(lambda x: (math.log(x)))
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

for name, model in models.items():
    evaluate_model(name, model, X_train, X_test, y_train, y_test)

Linear Regression:
	Mean Squared Error: 0.05739605903337286
	R^2 Score: 0.6924335408695917

Ridge Regression:
	Mean Squared Error: 0.05742689747575356
	R^2 Score: 0.6922682878768277

Lasso Regression:
	Mean Squared Error: 0.18770284240303775
	R^2 Score: -0.005837327142055315

Decision Tree:
	Mean Squared Error: 0.07168351639520938
	R^2 Score: 0.6158717917745596

Random Forest:
	Mean Squared Error: 0.044440310681539676
	R^2 Score: 0.7618591027124533

Support Vector Regressor:
	Mean Squared Error: 0.05111066175152945
	R^2 Score: 0.7261149019031155

K-Neighbors Regressor:
	Mean Squared Error: 0.0573196411298784
	R^2 Score: 0.6928430390892917

Gradient Boosting Regressor:
	Mean Squared Error: 0.03939839177854272
	R^2 Score: 0.7888770752062751



In [266]:
dataDF_dtnumber = dataDF.select_dtypes(include=np.number)
dataDF_dtobject = dataDF.select_dtypes(include=object)

label_encoder_LotShape = LabelEncoder()
dataDF_dtobject_labelencoded = dataDF_dtobject.apply(lambda x: LabelEncoder().fit_transform(x))
dataDF_dtobject_labelencoded.shape

(1460, 43)

In [269]:
from sklearn.preprocessing import OneHotEncoder

dataDF_dtobject_ohc_encoded = OneHotEncoder().fit_transform(dataDF_dtobject_labelencoded)
dataDF_dtobject_ohc_encoded.shape

(1460, 267)

In [271]:
dataDF_dtobject_ohc_encoded_dense = dataDF_dtobject_ohc_encoded.todense()
dataDF_dtobject_ohc_encoded_dense

In [272]:
X = dataDF_dtobject_ohc_encoded_dense
y = dataDF_dtnumber['SalePrice']
y = y.apply(lambda x: (math.log(x)))
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

for name, model in models.items():
    evaluate_model(name, model, X_train, X_test, y_train, y_test)

TypeError: np.matrix is not supported. Please convert to a numpy array with np.asarray. For more information see: https://numpy.org/doc/stable/reference/generated/numpy.matrix.html

In [219]:
for name, model in models.items():
    evaluate_model(name, model, X_train, X_test, y_train, y_test)

Linear Regression:
	Mean Squared Error: 0.024147876979174258
	R^2 Score: 0.8301641508417135

Ridge Regression:
	Mean Squared Error: 0.024045794174516168
	R^2 Score: 0.8308821153993646

Lasso Regression:
	Mean Squared Error: 0.031086169858088788
	R^2 Score: 0.7813660364643875

Decision Tree:
	Mean Squared Error: 0.05546806820966775
	R^2 Score: 0.6098842778732421

Random Forest:
	Mean Squared Error: 0.023365632589751408
	R^2 Score: 0.8356657997130206

Support Vector Regressor:
	Mean Squared Error: 0.03878719552659932
	R^2 Score: 0.727203501392289

K-Neighbors Regressor:
	Mean Squared Error: 0.048015833802997905
	R^2 Score: 0.6622970245372617

Gradient Boosting Regressor:
	Mean Squared Error: 0.019657263707398388
	R^2 Score: 0.8617473462883057



In [265]:
dataDF2 = dataDF[[
    'LotArea'
]]
X = dataDF2
y = y.apply(lambda x: (math.log(x)))
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
for name, model in models.items():
    evaluate_model(name, model, X_train, X_test, y_train, y_test)

Linear Regression:
	Mean Squared Error: 0.0012221668189180404
	R^2 Score: 0.05659750539301056

Ridge Regression:
	Mean Squared Error: 0.0012221668189184903
	R^2 Score: 0.056597505392663394

Lasso Regression:
	Mean Squared Error: 0.0012228486282792486
	R^2 Score: 0.05607121009333982

Decision Tree:
	Mean Squared Error: 0.0014007378468962675
	R^2 Score: -0.08124321377193278

Random Forest:
	Mean Squared Error: 0.001154721100600548
	R^2 Score: 0.10865951356273962

Support Vector Regressor:
	Mean Squared Error: 0.0014536608697003654
	R^2 Score: -0.12209501154838409

K-Neighbors Regressor:
	Mean Squared Error: 0.0010905767951463812
	R^2 Score: 0.15817312892489233

Gradient Boosting Regressor:
	Mean Squared Error: 0.0010149478019312167
	R^2 Score: 0.21655188684843396



In [263]:
dataDF2 = dataDF[[
    'LotArea'
]]
X = dataDF2
y = y.apply(lambda x: (math.log(x)))
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
for name, model in models.items():
    evaluate_model(name, model, X_train, X_test, y_train, y_test)

0       12.247694
1       12.109011
2       12.317167
3       11.849398
4       12.429216
5       11.870600
6       12.634603
7       12.206073
8       11.774520
9       11.678440
10      11.771436
11      12.751300
12      11.877569
13      12.540758
14      11.964001
15      11.790557
16      11.911702
17      11.407565
18      11.976659
19      11.842229
20      12.692503
21      11.845103
22      12.345835
23      11.774520
24      11.944708
25      12.454104
26      11.811547
27      12.631340
28      12.242887
29      11.134589
30      10.596635
31      11.914048
32      12.100156
33      12.016726
34      12.533576
35      12.641097
36      11.884489
37      11.938193
38      11.599103
39      11.314475
40      11.982929
41      12.043554
42      11.877569
43      11.777211
44      11.856515
45      12.675764
46      12.387085
47      12.428015
48      11.635143
49      11.751942
50      12.083905
51      11.648330
52      11.608236
53      12.860999
54      11.775290
55      12