In [5]:
import pandas as pd 
df = pd.read_csv("homeprices3.csv")
df

Unnamed: 0,town,area,price
0,monroe township,2600,550000
1,monroe township,3000,565000
2,monroe township,3200,610000
3,monroe township,3600,680000
4,monroe township,4000,725000
5,west windsor,2600,585000
6,west windsor,2800,615000
7,west windsor,3300,650000
8,west windsor,3600,710000
9,robinsville,2600,575000


In [9]:
# Create Dummy Var Columns
# Dummy variables are used to represent categorical data as numerical values

dummies = pd.get_dummies(df.town)
dummies

Unnamed: 0,monroe township,robinsville,west windsor
0,True,False,False
1,True,False,False
2,True,False,False
3,True,False,False
4,True,False,False
5,False,False,True
6,False,False,True
7,False,False,True
8,False,False,True
9,False,True,False


In [15]:
# Merge two dataFrame using concat
merged = pd.concat([df,dummies],axis="columns")
merged

Unnamed: 0,town,area,price,monroe township,robinsville,west windsor
0,monroe township,2600,550000,True,False,False
1,monroe township,3000,565000,True,False,False
2,monroe township,3200,610000,True,False,False
3,monroe township,3600,680000,True,False,False
4,monroe township,4000,725000,True,False,False
5,west windsor,2600,585000,False,False,True
6,west windsor,2800,615000,False,False,True
7,west windsor,3300,650000,False,False,True
8,west windsor,3600,710000,False,False,True
9,robinsville,2600,575000,False,True,False


In [21]:
# Dropping dummy variables helps to avoid multicollinearity, reduce redundancy, improve model interpretability, and avoid potential issues such as the dummy variable trap

# To avoid the dummy variable trap, you drop one dummy variable from each category to prevent over-parameterization and inaccurate results
final = merged.drop(["town","west windsor"],axis="columns")
final

Unnamed: 0,area,price,monroe township,robinsville
0,2600,550000,True,False
1,3000,565000,True,False
2,3200,610000,True,False
3,3600,680000,True,False
4,4000,725000,True,False
5,2600,585000,False,False
6,2800,615000,False,False
7,3300,650000,False,False
8,3600,710000,False,False
9,2600,575000,False,True


In [23]:
from sklearn.linear_model import LinearRegression
model = LinearRegression()

In [25]:
X = final.drop("price",axis="columns")
X

Unnamed: 0,area,monroe township,robinsville
0,2600,True,False
1,3000,True,False
2,3200,True,False
3,3600,True,False
4,4000,True,False
5,2600,False,False
6,2800,False,False
7,3300,False,False
8,3600,False,False
9,2600,False,True


In [29]:
Y = final.price
Y

0     550000
1     565000
2     610000
3     680000
4     725000
5     585000
6     615000
7     650000
8     710000
9     575000
10    600000
11    620000
12    695000
Name: price, dtype: int64

In [51]:
model.fit(X.values,Y.values)

In [53]:
# model.predict([[Sqft, manroe township, robinsville]])
model.predict([[2800,0,1]])

array([590775.63964739])

In [55]:
model.predict([[3400,0,0]])

array([681241.66845839])

In [59]:
# To calculate how our model is acuurate
model.score(X.values,Y.values)


0.9573929037221871

In [61]:
# Label encoding : convert the names into index 
from sklearn.preprocessing import LabelEncoder
# Create object of it 
le = LabelEncoder()


In [77]:
dfle = df 
dfle.town = le.fit_transform(dfle.town)
dfle 

Unnamed: 0,town,area,price
0,0,2600,550000
1,0,3000,565000
2,0,3200,610000
3,0,3600,680000
4,0,4000,725000
5,2,2600,585000
6,2,2800,615000
7,2,3300,650000
8,2,3600,710000
9,1,2600,575000


In [79]:
X = dfle[["town","area"]].values
X

array([[   0, 2600],
       [   0, 3000],
       [   0, 3200],
       [   0, 3600],
       [   0, 4000],
       [   2, 2600],
       [   2, 2800],
       [   2, 3300],
       [   2, 3600],
       [   1, 2600],
       [   1, 2900],
       [   1, 3100],
       [   1, 3600]], dtype=int64)

In [81]:
Y = dfle.price
Y

0     550000
1     565000
2     610000
3     680000
4     725000
5     585000
6     615000
7     650000
8     710000
9     575000
10    600000
11    620000
12    695000
Name: price, dtype: int64

In [101]:
# One Hot Encoder : One hot encoding (OHE) is a machine learning technique that encodes categorical data to numerical ones
from sklearn.preprocessing import OneHotEncoder
ohe = OneHotEncoder().categorical_features =[0]

In [107]:
X = ohe.

AttributeError: 'list' object has no attribute 'fit_transform'

In [105]:
X = X[:,1:]
X

array([[2600],
       [3000],
       [3200],
       [3600],
       [4000],
       [2600],
       [2800],
       [3300],
       [3600],
       [2600],
       [2900],
       [3100],
       [3600]], dtype=int64)