In [27]:
# Modern approach using ColumnTransformer for proper preprocessing
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler

# Define which columns are categorical and numerical
categorical_features = ['town']
numerical_features = ['area']

# Create preprocessor: standardize the numerical column(s) and one-hot encode
# the categorical column(s). Each transformer is applied only to the columns
# listed for it.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_features),
        # drop='first' to avoid multicollinearity among the dummy columns
        ('cat', OneHotEncoder(drop='first'), categorical_features),
    ])

# Prepare the data - use original dataframe before label encoding
df_original = pd.read_csv('homeprices.csv')

# Define the feature frame and the target in this cell so that the later cells
# which use `X_original` and `y` work after Restart Kernel -> Run All, rather
# than depending on variables left behind by other (possibly deleted) cells.
X_original = df_original[numerical_features + categorical_features]
y = df_original['price']

df_original


Unnamed: 0,town,area,price
0,monroe township,2600,550000
1,monroe township,3000,565000
2,monroe township,3200,610000
3,monroe township,3600,680000
4,monroe township,4000,725000
5,west windsor,2600,585000
6,west windsor,2800,615000
7,west windsor,3300,650000
8,west windsor,3600,710000
9,robinsville,2600,575000


In [28]:
# Fit the preprocessor on the raw features and apply it in one step
X_transformed = preprocessor.fit_transform(X_original)
print("Shape after transformation:", X_transformed.shape)
print("\nTransformed features (first 5 rows):", X_transformed[:5], sep="\n")

# Collect the output column names from each fitted transformer, listing the
# 'num' names first and the 'cat' names second
feature_names = [
    name
    for step, columns in (('num', numerical_features), ('cat', categorical_features))
    for name in preprocessor.named_transformers_[step].get_feature_names_out(columns).tolist()
]
print("\nFeature names after transformation:", feature_names, sep="\n")

In [32]:
# Fit an ordinary least-squares regression on the preprocessed features
from sklearn.linear_model import LinearRegression

# `fit` returns the fitted estimator itself, so construction and training
# can be chained into a single assignment
model = LinearRegression().fit(X_transformed, y)

# Show the target values the model was trained against
y

0     550000
1     565000
2     610000
3     680000
4     725000
5     585000
6     615000
7     650000
8     710000
9     575000
10    600000
11    620000
12    695000
Name: price, dtype: int64