In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score


In [2]:
# Read the labelled training set from the CSV in the working directory
train_df = pd.read_csv(filepath_or_buffer='train.csv')


In [3]:
# Regression target: the sale price column
y = train_df['SalePrice']

# Predictors: the three columns used as model inputs
X = train_df[
    [
        'GrLivArea',
        'BedroomAbvGr',
        'FullBath',
    ]
]

In [5]:
# Display every column label in the training frame to see which features exist
train_df.columns

Index(['Id', 'MSSubClass', 'MSZoning', 'LotFrontage', 'LotArea', 'Street',
       'Alley', 'LotShape', 'LandContour', 'Utilities', 'LotConfig',
       'LandSlope', 'Neighborhood', 'Condition1', 'Condition2', 'BldgType',
       'HouseStyle', 'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd',
       'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType',
       'MasVnrArea', 'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual',
       'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinSF1',
       'BsmtFinType2', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', 'Heating',
       'HeatingQC', 'CentralAir', 'Electrical', '1stFlrSF', '2ndFlrSF',
       'LowQualFinSF', 'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath',
       'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'KitchenQual',
       'TotRmsAbvGrd', 'Functional', 'Fireplaces', 'FireplaceQu', 'GarageType',
       'GarageYrBlt', 'GarageFinish', 'GarageCars', 'GarageArea', 'GarageQual',
       'GarageCond', 'PavedDrive

In [6]:
# Hold out 20% of the rows for validation; the fixed seed keeps the split repeatable
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [7]:
# Instantiate an ordinary linear regression and fit it on the training split
model = LinearRegression()
model.fit(X=X_train, y=y_train)

In [8]:
# Predict sale prices for the held-out validation rows
y_pred_val = model.predict(X=X_val)

In [9]:
# Score the validation predictions: squared error and explained variance
r2_val = r2_score(y_true=y_val, y_pred=y_pred_val)
mse_val = mean_squared_error(y_true=y_val, y_pred=y_pred_val)

# One print call, one metric per line (same text as two separate prints)
print(
    f'Validation Mean Squared Error: {mse_val:.2f}',
    f'Validation R-squared: {r2_val:.2f}',
    sep='\n',
)


Validation Mean Squared Error: 2806426667.25
Validation R-squared: 0.63


In [10]:
# Load the testing data
test_df = pd.read_csv('test.csv')

# Select the test features by reusing the training matrix's column labels
# instead of repeating the list by hand. This keeps the test features in the
# same names and order as the columns the model was fitted on, and raises a
# KeyError right here if test.csv is missing any of them.
X_test = test_df[X.columns]


In [11]:
# Predict sale prices for every row of the test feature matrix
y_pred_test = model.predict(X=X_test)


In [12]:
# Pair each test Id with its predicted price and write the result to disk
submission_df = pd.DataFrame(
    data={
        'Id': test_df['Id'],
        'SalePrice': y_pred_test,
    }
)
# index=False keeps the row index out of the CSV so only Id and SalePrice are written
submission_df.to_csv(path_or_buf='submission.csv', index=False)

In [13]:
# Helper for pricing a single, hand-specified house with the fitted model
def predict_house_price(sqft, bedrooms, bathrooms):
    """Return the model's predicted sale price for one house.

    Builds a one-row DataFrame whose columns carry the same names as the
    training features and passes it to the notebook-level ``model``.

    Args:
        sqft: Value placed in the 'GrLivArea' column.
        bedrooms: Value placed in the 'BedroomAbvGr' column.
        bathrooms: Value placed in the 'FullBath' column.

    Returns:
        The first element of ``model.predict`` for the one-row frame.

    Note:
        Depends on the global ``model`` having been fitted beforehand.
    """
    single_house = pd.DataFrame(
        {
            'GrLivArea': [sqft],
            'BedroomAbvGr': [bedrooms],
            'FullBath': [bathrooms],
        }
    )
    predictions = model.predict(single_house)
    return predictions[0]


In [14]:
# Example: predicted price for sqft=2000 with 3 bedrooms and 2 bathrooms
print(predict_house_price(sqft=2000, bedrooms=3, bathrooms=2))

240377.51479736282
