<a href="https://colab.research.google.com/github/ramyars466/prodigy-ml-task1-house-prices/blob/main/house_price_prediction_task1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score


In [None]:
# Read both Kaggle House Prices CSVs from the working directory.
# `train` carries the SalePrice target; `test` is the unlabeled set to score.
train, test = pd.read_csv("train.csv"), pd.read_csv("test.csv")

# Quick sanity check on the number of rows/columns that were loaded.
print("Train shape:", train.shape)
print("Test shape :", test.shape)

# Preview the first rows of the training frame as the cell output.
train.head()


Train shape: (1460, 81)
Test shape : (1459, 80)


Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


In [None]:
# Choose the model inputs and the prediction target.
#
# Inputs (names as they appear in train.csv):
#   GrLivArea    - above-ground living area (square feet)
#   BedroomAbvGr - number of bedrooms above ground
#   FullBath     - number of full bathrooms above ground
# Target:
#   SalePrice    - house sale price
feature_cols = ["GrLivArea", "BedroomAbvGr", "FullBath"]
target_col = "SalePrice"

# Work on an independent copy restricted to the modelling columns so that
# later cleaning steps never touch the raw `train` frame.
df = train.loc[:, [*feature_cols, target_col]].copy()

# Per-column count of missing values.
print(df.isna().sum())

# Preview the reduced frame as the cell output.
df.head()


GrLivArea       0
BedroomAbvGr    0
FullBath        0
SalePrice       0
dtype: int64


Unnamed: 0,GrLivArea,BedroomAbvGr,FullBath,SalePrice
0,1710,3,2,208500
1,1262,3,2,181500
2,1786,3,2,223500
3,1717,3,1,140000
4,2198,4,2,250000


In [None]:
# Impute gaps in the feature columns with each column's own median.
# Passing a Series to DataFrame.fillna fills column-by-column, matching on
# the column labels, so every feature gets its own median value.
feature_medians = df[feature_cols].median()
df[feature_cols] = df[feature_cols].fillna(feature_medians)

# Re-print the missing-value counts to confirm nothing is left unfilled.
print(df.isnull().sum())


GrLivArea       0
BedroomAbvGr    0
FullBath        0
SalePrice       0
dtype: int64


In [None]:
# Feature matrix and target vector taken from the cleaned frame.
X, y = df[feature_cols], df[target_col]

# Hold out 20% of the rows for validation; the fixed random_state keeps the
# split identical on every run.
split_parts = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_valid, y_train, y_valid = split_parts

# Display the shapes of the two feature partitions as the cell output.
X_train.shape, X_valid.shape


((1168, 3), (292, 3))

In [None]:
# Ordinary least-squares model fitted on the training partition only, so the
# validation rows stay unseen until evaluation.
lin_reg = LinearRegression()
lin_reg.fit(X=X_train, y=y_train)


In [None]:
# Score the held-out validation rows with the model fitted on X_train.
y_pred = lin_reg.predict(X_valid)

# Goodness of fit (R^2) and error metrics; RMSE is the square root of MSE,
# which puts the error back in the same units as SalePrice.
r2 = r2_score(y_valid, y_pred)
mse = mean_squared_error(y_valid, y_pred)
rmse = np.sqrt(mse)

# Report the metrics.
print(f"Mean Squared Error  : {mse:.2f}")
print(f"Root MSE            : {rmse:.2f}")
print(f"R-squared (R2 score): {r2:.4f}")


Mean Squared Error  : 2806426667.25
Root MSE            : 52975.72
R-squared (R2 score): 0.6341


In [None]:
# Show the fitted linear equation: the intercept first, then one slope per
# feature. Coefficients are paired with feature names by position, in the
# same order the columns were passed to fit().
print("Intercept:", lin_reg.intercept_)
coef_by_feature = dict(zip(feature_cols, lin_reg.coef_))
for name, coef in coef_by_feature.items():
    print(f"Coefficient for {name}: {coef:.2f}")


Intercept: 52261.74862694461
Coefficient for GrLivArea: 104.03
Coefficient for BedroomAbvGr: -26655.17
Coefficient for FullBath: 30014.32


In [None]:
# Refit on every labelled row (training + validation partitions together)
# before scoring test.csv. `df` has already had its missing feature values
# filled in an earlier cell.
X_full, y_full = df[feature_cols], df[target_col]

final_model = LinearRegression()
final_model.fit(X=X_full, y=y_full)


In [None]:
# Build the test-set feature matrix with the same columns the model was
# fitted on. Missing values are filled with medians computed from the
# training frame `df` (not from the test set), one median per column.
train_medians = df[feature_cols].median()
test_features = test[feature_cols].fillna(train_medians)

# Preview the prepared test features as the cell output.
test_features.head()


Unnamed: 0,GrLivArea,BedroomAbvGr,FullBath
0,896,2,1
1,1329,3,1
2,1629,3,2
3,1604,3,2
4,1280,2,2


In [None]:
# Score every row of test.csv with the model fitted on all labelled data.
test_preds = final_model.predict(test_features)

# Sanity check shown as cell output: the number of predictions should equal
# the number of test rows.
test_preds.shape[0], len(test)


(1459, 1459)

In [None]:
# Assemble the Kaggle submission table: one row per test house, pairing the
# house `Id` from test.csv with its predicted `SalePrice`.
# `test` and `test_preds` come from the earlier load and predict cells, and
# pandas is already imported in the notebook's first code cell, so nothing
# is re-imported here.
submission = pd.DataFrame({
    "Id": test["Id"],
    "SalePrice": test_preds,
})

# Preview the first rows as the cell output.
submission.head()


Unnamed: 0,Id,SalePrice
0,1461,120100.812977
1,1462,139898.208279
2,1463,202611.414586
3,1464,199859.871426
4,1465,192059.2043


In [None]:
# Write the submission file to the working directory. index=False keeps the
# DataFrame's row index out of the CSV so it contains only Id and SalePrice.
submission.to_csv(path_or_buf="submission.csv", index=False)
