In [1]:
import pandas as pd
# Load the home-price dataset; the displayed frame has town, area and price columns.
df=pd.read_csv('Sample_Home_Prices.csv')
df

Unnamed: 0,town,area,price
0,Pune,2600,550000
1,Pune,3000,565000
2,Pune,3200,610000
3,Pune,3600,680000
4,Pune,4000,725000
5,Mumbai,2600,585000
6,Mumbai,2800,615000
7,Mumbai,3300,650000
8,Mumbai,3600,710000
9,Banglore,2600,575000


In [2]:
# get_dummies from the pandas library converts a categorical variable into
# dummy (indicator) variables, one column per distinct town.
# Passing drop_first=True is an alternative way to avoid the dummy variable trap.
# dtype=int keeps the indicators as 0/1 integers on every pandas version
# (pandas 2.x would otherwise return boolean columns).
dummies = pd.get_dummies(df['town'], dtype=int)
dummies

Unnamed: 0,Banglore,Mumbai,Pune
0,0,0,1
1,0,0,1
2,0,0,1
3,0,0,1
4,0,0,1
5,0,1,0
6,0,1,0
7,0,1,0
8,0,1,0
9,1,0,0


In [3]:
## Place the dummy columns side by side with the original columns.
merged = pd.concat([df, dummies], axis='columns')
merged

Unnamed: 0,town,area,price,Banglore,Mumbai,Pune
0,Pune,2600,550000,0,0,1
1,Pune,3000,565000,0,0,1
2,Pune,3200,610000,0,0,1
3,Pune,3600,680000,0,0,1
4,Pune,4000,725000,0,0,1
5,Mumbai,2600,585000,0,1,0
6,Mumbai,2800,615000,0,1,0
7,Mumbai,3300,650000,0,1,0
8,Mumbai,3600,710000,0,1,0
9,Banglore,2600,575000,1,0,0


In [4]:
## The text town column is now redundant with the dummy columns, so remove it.
final = merged.drop(columns=['town'])
final

Unnamed: 0,area,price,Banglore,Mumbai,Pune
0,2600,550000,0,0,1
1,3000,565000,0,0,1
2,3200,610000,0,0,1
3,3600,680000,0,0,1
4,4000,725000,0,0,1
5,2600,585000,0,1,0
6,2800,615000,0,1,0
7,3300,650000,0,1,0
8,3600,710000,0,1,0
9,2600,575000,1,0,0


In [5]:
## Any one of the Banglore, Pune or Mumbai columns can be dropped to avoid the
## dummy variable trap (drop_first=True in get_dummies is an alternative).
## Deriving the result from `merged` rather than from `final` itself keeps this
## cell safe to re-run: dropping 'Mumbai' from an already-reduced frame would raise KeyError.
final = merged.drop(columns=['town', 'Mumbai'])
final

Unnamed: 0,area,price,Banglore,Pune
0,2600,550000,0,1
1,3000,565000,0,1
2,3200,610000,0,1
3,3600,680000,0,1
4,4000,725000,0,1
5,2600,585000,0,0
6,2800,615000,0,0
7,3300,650000,0,0
8,3600,710000,0,0
9,2600,575000,1,0


In [6]:
## Input features: everything except the price column.
X = final.drop(columns=['price'])
X

Unnamed: 0,area,Banglore,Pune
0,2600,0,1
1,3000,0,1
2,3200,0,1
3,3600,0,1
4,4000,0,1
5,2600,0,0
6,2800,0,0
7,3300,0,0
8,3600,0,0
9,2600,1,0


In [7]:
## Target variable: the price column as a Series.
y = final['price']
y

0     550000
1     565000
2     610000
3     680000
4     725000
5     585000
6     615000
7     650000
8     710000
9     575000
10    600000
11    620000
12    695000
Name: price, dtype: int64

In [8]:
from sklearn.linear_model import LinearRegression
# Create the linear regression estimator and fit it in one step;
# fit() returns the estimator itself, so `model` is the fitted model.
model = LinearRegression().fit(X, y)
model

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [9]:
# 3400 sqft home in Mumbai city.
# Mumbai is the dropped baseline town, so both remaining town flags are 0.
# The row is built with the training column names (area, Banglore, Pune) so the
# model receives features in the layout it was fitted on, not a bare list.
home_mumbai = pd.DataFrame([[3400, 0, 0]], columns=X.columns)
model.predict(home_mumbai)

array([681241.66845839])

In [10]:
# 2800 sqft home in Pune city.
# Columns follow the training layout (area, Banglore, Pune): Pune flag set to 1.
# A named one-row DataFrame is used so the model receives features in the
# layout it was fitted on, not a bare list.
home_pune = pd.DataFrame([[2800, 0, 1]], columns=X.columns)
model.predict(home_pune)

array([565089.22812299])

In [11]:
# R^2 of the fitted model, evaluated on the same X and y it was trained on
# (there is no held-out test set in this notebook).
model.score(X,y)

0.9573929037221873

# Another Method

In [12]:
## Using label encoder
## It encodes labels as integers between 0 and n_classes-1.
## NOTE(review): scikit-learn describes LabelEncoder as a target-label encoder;
## in this notebook it is applied to the 'town' feature column.
from sklearn.preprocessing import LabelEncoder
le=LabelEncoder()

In [13]:
## Create an independent copy of the dataframe.
## Plain assignment (dfle = df) would only bind a second name to the same
## DataFrame, so encoding dfle.town would also overwrite the town names in df.
dfle = df.copy()
dfle

Unnamed: 0,town,area,price
0,Pune,2600,550000
1,Pune,3000,565000
2,Pune,3200,610000
3,Pune,3600,680000
4,Pune,4000,725000
5,Mumbai,2600,585000
6,Mumbai,2800,615000
7,Mumbai,3300,650000
8,Mumbai,3600,710000
9,Banglore,2600,575000


In [14]:
## Replace the town names with integer codes: fit_transform learns the set of
## labels and returns the encoded values in a single call.
dfle['town'] = le.fit_transform(dfle['town'])
dfle

Unnamed: 0,town,area,price
0,2,2600,550000
1,2,3000,565000
2,2,3200,610000
3,2,3600,680000
4,2,4000,725000
5,1,2600,585000
6,1,2800,615000
7,1,3300,650000
8,1,3600,710000
9,0,2600,575000


In [15]:
# Feature frame: every column except price. Column order is (town, area),
# with town holding the label-encoded integer codes.
X = dfle.drop(columns=['price'])
X

Unnamed: 0,town,area
0,2,2600
1,2,3000
2,2,3200
3,2,3600
4,2,4000
5,1,2600
6,1,2800
7,1,3300
8,1,3600
9,0,2600


In [16]:
# Target: the price column, kept as a pandas Series.
y = dfle['price']

In [17]:
# Display the target Series.
y

0     550000
1     565000
2     610000
3     680000
4     725000
5     585000
6     615000
7     650000
8     710000
9     575000
10    600000
11    620000
12    695000
Name: price, dtype: int64

In [18]:
## One-hot encode the integer town codes produced by the label encoder.
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

ohe = OneHotEncoder()
# Column 0 (town) goes through the one-hot encoder; the remaining column (area)
# is passed through unchanged and placed after the encoded columns.
ct = ColumnTransformer(
    transformers=[('town', ohe, [0])],
    remainder="passthrough",
)

In [19]:
# Fit the column transformer and build the encoded feature matrix
# (one column per town code, followed by area).
# The input is rebuilt from dfle rather than read from X, because X is
# overwritten here: re-running `X = ct.fit_transform(X)` would one-hot encode
# the already transformed array a second time.
X = ct.fit_transform(dfle.drop(columns=['price']))
X

In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.


array([[0.0e+00, 0.0e+00, 1.0e+00, 2.6e+03],
       [0.0e+00, 0.0e+00, 1.0e+00, 3.0e+03],
       [0.0e+00, 0.0e+00, 1.0e+00, 3.2e+03],
       [0.0e+00, 0.0e+00, 1.0e+00, 3.6e+03],
       [0.0e+00, 0.0e+00, 1.0e+00, 4.0e+03],
       [0.0e+00, 1.0e+00, 0.0e+00, 2.6e+03],
       [0.0e+00, 1.0e+00, 0.0e+00, 2.8e+03],
       [0.0e+00, 1.0e+00, 0.0e+00, 3.3e+03],
       [0.0e+00, 1.0e+00, 0.0e+00, 3.6e+03],
       [1.0e+00, 0.0e+00, 0.0e+00, 2.6e+03],
       [1.0e+00, 0.0e+00, 0.0e+00, 2.9e+03],
       [1.0e+00, 0.0e+00, 0.0e+00, 3.1e+03],
       [1.0e+00, 0.0e+00, 0.0e+00, 3.6e+03]])

In [20]:
# Refit the same LinearRegression instance on the one-hot encoded matrix
# (three town indicator columns followed by area).
model.fit(X,y)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [21]:
# 3400 sqft home in Mumbai city.
# Feature layout: [Banglore, Mumbai, Pune, area].
mumbai_onehot = [[0, 1, 0, 3400]]
model.predict(mumbai_onehot)

array([681241.6684584])

In [22]:
# 2800 sqft home in Pune city.
# Feature layout: [Banglore, Mumbai, Pune, area].
pune_onehot = [[0, 0, 1, 2800]]
model.predict(pune_onehot)

array([565089.22812299])

In [23]:
# R^2 on the training data for the model fitted on the full one-hot matrix.
model.score(X,y)

0.9573929037221875

In [24]:
# Drop the first one-hot column (Banglore) to avoid the dummy variable trap,
# leaving the layout [Mumbai, Pune, area].
# The matrix is rebuilt from the already-fitted transformer instead of slicing X
# in place: `X = X[:, 1:]` removes one more column every time the cell is re-run.
X = ct.transform(dfle.drop(columns=['price']))[:, 1:]
X

array([[0.0e+00, 1.0e+00, 2.6e+03],
       [0.0e+00, 1.0e+00, 3.0e+03],
       [0.0e+00, 1.0e+00, 3.2e+03],
       [0.0e+00, 1.0e+00, 3.6e+03],
       [0.0e+00, 1.0e+00, 4.0e+03],
       [1.0e+00, 0.0e+00, 2.6e+03],
       [1.0e+00, 0.0e+00, 2.8e+03],
       [1.0e+00, 0.0e+00, 3.3e+03],
       [1.0e+00, 0.0e+00, 3.6e+03],
       [0.0e+00, 0.0e+00, 2.6e+03],
       [0.0e+00, 0.0e+00, 2.9e+03],
       [0.0e+00, 0.0e+00, 3.1e+03],
       [0.0e+00, 0.0e+00, 3.6e+03]])

In [25]:
# Refit the model on the reduced matrix (first one-hot column removed).
model.fit(X,y)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [26]:
# 3400 sqft home in Mumbai city.
# Feature layout after dropping the first one-hot column: [Mumbai, Pune, area].
mumbai_reduced = [[1, 0, 3400]]
model.predict(mumbai_reduced)

array([681241.66845839])

In [27]:
# R^2 on the training data for the model fitted on the reduced matrix.
model.score(X,y)

0.9573929037221871

## Exploring more

In [28]:
import pandas as pd
# Ids and countries are paired one-to-one, so both lists must have the same length.
ids = [11, 22, 33, 44, 55]
countries = ['Spain', 'France', 'Spain', 'Germany', 'France']
# Building the frame from a dict makes pandas raise ValueError on a length
# mismatch, whereas zip() would silently discard the unmatched entries.
df = pd.DataFrame({'Ids': ids, 'Countries': countries})
df

Unnamed: 0,Ids,Countries
0,11,Spain
1,22,France
2,33,Spain
3,44,Germany
4,55,France


In [29]:
from sklearn.preprocessing import LabelBinarizer
# Binarize the country labels: the result has one indicator column per distinct country.
binarizer = LabelBinarizer()
y = binarizer.fit_transform(df['Countries'])
y

array([[0, 0, 1],
       [1, 0, 0],
       [0, 0, 1],
       [0, 1, 0],
       [1, 0, 0]])