In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split

# 1. Load the data
data = pd.read_csv("HousePricePrediction.csv")
prediction_data = data.copy()

# 2. Map categorical columns in prediction_data BEFORE defining X
categorical_cols = prediction_data.select_dtypes(include=['object']).columns

for col in categorical_cols:
    unique_vals = prediction_data[col].dropna().unique()
    mapping = {val: i + 1 for i, val in enumerate(unique_vals)}
    # This line updates the column with numbers
    prediction_data[col] = prediction_data[col].map(mapping)
    # Fill missing values with 0 so the model doesn't error
    prediction_data[col] = prediction_data[col].fillna(0)

# 3. Handle numeric NaNs (like in SalePrice or BsmtFinSF2)
# Regression models cannot handle NaN values
prediction_data = prediction_data.fillna(0)

# 4. NOW define X and y from the updated prediction_data
X = prediction_data.drop(columns=["SalePrice"])
y = prediction_data["SalePrice"]

# 5. Split the data
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42)

# Verify
print(X_train.head())

        Id  MSSubClass  MSZoning  LotArea  LotConfig  BldgType  OverallCond  \
2650  2650         160       1.0     2179          1         5            5   
1709  1709          20       4.0     7500          1         1            5   
869    869          60       1.0     9938          1         1            5   
2355  2355          20       1.0    12450          1         1            5   
2914  2914         160       2.0     1936          1         5            7   

      YearBuilt  YearRemodAdd  Exterior1st  BsmtFinSF2  TotalBsmtSF  
2650       1976          1976          8.0         0.0        855.0  
1709       2006          2007          1.0         0.0       1372.0  
869        1993          1994          2.0         0.0       1050.0  
2355       2003          2004          1.0         0.0       1094.0  
2914       1970          1970          7.0         0.0        546.0  


In [2]:
from sklearn.linear_model import LinearRegression
model=LinearRegression()
model.fit(X_train, y_train)
y_pred=model.predict(X_test)
y_pred

array([-7.72626748e+03,  2.15863009e+03, -9.72548758e+03,  1.65991989e+05,
        8.43342137e+04,  2.48579495e+05,  3.97892005e+04,  1.40301054e+05,
       -5.67482249e+04,  4.76633945e+04,  7.61966204e+04,  4.11773300e+03,
       -4.20962840e+04,  7.78554615e+04,  1.47060940e+05,  2.07463784e+05,
        1.42567431e+05,  1.28679755e+05,  1.34191892e+05,  4.04442829e+04,
        1.96008914e+05,  1.29036629e+05,  3.81757972e+04,  1.14764318e+05,
       -8.89416063e+03,  2.29683547e+04,  2.07954946e+05,  3.56238059e+04,
        6.29058812e+04,  1.07976264e+05,  9.51168321e+04, -3.05774451e+04,
        1.02814934e+05,  1.28807834e+05,  7.10399082e+04,  1.01280147e+05,
        1.00732927e+05,  3.71246327e+04,  3.99036450e+04,  1.89093907e+05,
       -7.76399871e+04, -2.61046572e+04,  2.50346908e+05, -1.91927349e+03,
        1.82546562e+05,  1.21726675e+04,  1.08480820e+05,  1.57397200e+05,
        1.96512065e+05,  2.40186985e+05,  3.29831729e+04,  1.92847907e+05,
        1.86444108e+05,  

In [3]:
from sklearn.metrics import r2_score
r2_score=r2_score(y_test, y_pred)
r2_score

0.6415634825204557