In [1]:
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score

# --- Data --------------------------------------------------------------
# Read the dataset from a CSV file located in the working directory.
df = pd.read_csv('kc_house_data.csv')

# Predictor columns. The order is kept exactly as-is because it determines
# the column order of the design matrix (and of the polynomial terms
# generated from it further down).
features = [
    "floors", "waterfront", "lat", "bedrooms",
    "sqft_basement", "view", "bathrooms",
    "sqft_living15", "sqft_above", "grade", "sqft_living",
]
X, y = df[features], df['price']  # predictors, regression target

# --- Hold-out split ----------------------------------------------------
# 20% of the rows are set aside for evaluation; the fixed random_state
# keeps the partition identical from one run to the next.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# --- Model -------------------------------------------------------------
# Three chained steps: standardize the inputs, expand them into degree-2
# polynomial terms, then fit a linear regression on the expanded features.
pipeline = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('poly_features', PolynomialFeatures(degree=2)),
    ('model', LinearRegression()),
])

# Fit on the training rows only, then predict the held-out rows.
# `fit` returns the pipeline itself, so chaining is equivalent to calling
# `fit` and `predict` as two separate statements.
y_pred = pipeline.fit(X_train, y_train).predict(X_test)

# --- Evaluation --------------------------------------------------------
# Coefficient of determination on the held-out set, reported to 4 decimals.
r2 = r2_score(y_true=y_test, y_pred=y_pred)
print(f"R² value: {r2:.4f}")


R² value: 0.7118
