In [1]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

# Read the home-price table from disk.
df = pd.read_csv("homeprice.csv")

# One-hot encode the categorical 'town' column; any other column handed to the
# transformer is passed through unchanged.
town_step = ('town', OneHotEncoder(), ['town'])
column_transformer = ColumnTransformer(
    transformers=[town_step],
    remainder='passthrough',
)

# Regression target, and the encoded feature matrix built from town + area.
y = df['price']
feature_frame = df[['town', 'area']]
X = column_transformer.fit_transform(feature_frame)

# Fit a linear regression on the encoded features (fit() returns the estimator).
model = LinearRegression().fit(X, y)

# Predict
# Describe the homes to price by town name and let the already-fitted
# column_transformer build the encoded rows. This keeps the column layout in
# sync with whatever the encoder learned, instead of relying on hand-written
# one-hot vectors whose meaning depends on the encoder's category order.
# These two rows are the same queries as the previous hard-coded vectors:
# a 2800 sq ft home in robinsville and a 3400 sq ft home in west windsor.
new_homes = pd.DataFrame({
    'town': ['robinsville', 'west windsor'],
    'area': [2800, 3400],
})
predictions = model.predict(column_transformer.transform(new_homes))
print(predictions)

# Check the model score
# NOTE: this is R^2 on the same rows the model was fitted on, so it measures
# goodness of fit on the training data, not performance on unseen homes.
score = model.score(X, y)
print(score)

[590775.63964739 681241.6684584 ]
0.9573929037221871


In [4]:
# Display the loaded table (town, area, price) for inspection.
# NOTE(review): the 'Unnamed: 0' column in the output is presumably a row index
# that was saved into the CSV -- confirm. It is not selected as a model feature.
df

Unnamed: 0,town,area,price
0,monroe township,2600,550000
1,monroe township,3000,565000
2,monroe township,3200,610000
3,monroe township,3600,680000
4,monroe township,4000,725000
5,west windsor,2600,585000
6,west windsor,2800,615000
7,west windsor,3300,650000
8,west windsor,3600,710000
9,robinsville,2600,575000


In [6]:
# Recover human-readable names for the columns of the encoded feature matrix.
# First, the names the fitted 'town' one-hot encoder generated for its columns.
town_encoder = column_transformer.named_transformers_['town']
town_columns = town_encoder.get_feature_names_out(['town'])
# The passthrough 'area' column follows the encoded town columns.
feature_names = np.append(town_columns, 'area')
feature_names


array(['town_monroe township', 'town_robinsville', 'town_west windsor',
       'area'], dtype=object)