# Linear regression with multiple variables

In [1]:
# Prerequisite (run once if the package is missing): %pip install word2number

In [2]:
import pandas as pd
import numpy as np
from sklearn import linear_model
import math
from word2number import w2n

In [3]:
# Load the house-price data and keep only the first five rows
# (DataFrame.head() defaults to n=5); everything below trains on this subset.
df = pd.read_csv('homeprices.csv').head()
df

Unnamed: 0,area,bedrooms,age,price
0,2600,3.0,20,550000
1,3000,4.0,15,565000
2,3200,,18,610000
3,3600,3.0,30,595000
4,4000,5.0,8,760000


In [4]:
# Data preprocessing: choose the fill value for missing bedroom counts.
# The column median is floored so the imputed value is a whole number.
median_bedrooms = math.floor(df['bedrooms'].median())
median_bedrooms

3

In [5]:
# Replace the missing bedroom counts with the floored median computed above.
df['bedrooms'] = df['bedrooms'].fillna(median_bedrooms)
df

Unnamed: 0,area,bedrooms,age,price
0,2600,3.0,20,550000
1,3000,4.0,15,565000
2,3200,3.0,18,610000
3,3600,3.0,30,595000
4,4000,5.0,8,760000


In [6]:
# Fit a linear model of price on the three features: area, bedrooms, age.
reg = linear_model.LinearRegression()
reg.fit(X=df[['area', 'bedrooms', 'age']], y=df['price'])

In [7]:
# Fitted coefficient for each feature, in the order passed to fit():
# area, bedrooms, age.
reg.coef_

array([   137.25, -26025.  ,  -6825.  ])

In [8]:
# Fitted intercept (constant term) of the linear model.
reg.intercept_

383725.0

In [9]:
# Using the fitted model, predict the price of a home with:
# 1. 3000 sq ft area, 3 bedrooms, 40 years old
# 2. 2500 sq ft area, 4 bedrooms, 5 years old

In [10]:
# Predict the price of home 1 (3000 sq ft, 3 bedrooms, 40 years old).
# The model was fitted on a DataFrame, so the query is passed as a DataFrame
# with the same column names; a bare nested list makes scikit-learn warn that
# X lacks the feature names seen during fit.
reg.predict(pd.DataFrame([[3000, 3, 40]], columns=['area', 'bedrooms', 'age']))



array([444400.])

In [11]:
# Reproduce the first prediction by hand: sum(coefficient * feature) + intercept.
res1 = (
    137.25 * 3000        # area coefficient * area
    + -26025.0 * 3       # bedrooms coefficient * bedrooms
    + -6825.0 * 40       # age coefficient * age
    + 383725.0           # intercept
)
res1

444400.0

In [12]:
# Predict the price of home 2 (2500 sq ft, 4 bedrooms, 5 years old).
# The model was fitted on a DataFrame, so the query is passed as a DataFrame
# with the same column names; a bare nested list makes scikit-learn warn that
# X lacks the feature names seen during fit.
reg.predict(pd.DataFrame([[2500, 4, 5]], columns=['area', 'bedrooms', 'age']))



array([588625.])

In [13]:
# Reproduce the second prediction by hand: sum(coefficient * feature) + intercept.
res2 = (
    137.25 * 2500        # area coefficient * area
    + -26025.0 * 4       # bedrooms coefficient * bedrooms
    + -6825.0 * 5        # age coefficient * age
    + 383725.0           # intercept
)
res2

588625.0

## Taking another CSV file

In [14]:
# Load the hiring dataset. As the output shows, `experience` is spelled out in
# words and both `experience` and the test score contain missing values.
df1 = pd.read_csv('hiring.csv')
df1

Unnamed: 0,experience,test_score(out of 10),interview_score(out of 10),salary($)
0,,8.0,9,50000
1,,8.0,6,45000
2,five,6.0,7,60000
3,two,10.0,10,65000
4,seven,9.0,6,70000
5,three,7.0,10,62000
6,ten,,7,72000
7,eleven,7.0,8,80000


In [15]:
# Data preprocessing
# A missing experience entry is filled with the word 'zero' so that every
# value in the column is a number word.
df1.experience = df1.experience.fillna('zero')

# Impute missing test scores with the floored column median.
# The column is selected as a Series (single brackets) so .median() returns a
# scalar; flooring a one-element Series relies on implicit float() conversion,
# which pandas has deprecated.
median_test_score = math.floor(df1['test_score(out of 10)'].median())
df1['test_score(out of 10)'] = df1['test_score(out of 10)'].fillna(median_test_score)

In [16]:
# Display the frame to confirm that no missing values remain.
df1

Unnamed: 0,experience,test_score(out of 10),interview_score(out of 10),salary($)
0,zero,8.0,9,50000
1,zero,8.0,6,45000
2,five,6.0,7,60000
3,two,10.0,10,65000
4,seven,9.0,6,70000
5,three,7.0,10,62000
6,ten,8.0,7,72000
7,eleven,7.0,8,80000


In [17]:
# Convert the spelled-out experience values ('five', 'eleven', ...) to integers
# so the column can be used as a numeric feature.
df1['experience'] = df1['experience'].apply(w2n.word_to_num)
df1

Unnamed: 0,experience,test_score(out of 10),interview_score(out of 10),salary($)
0,0,8.0,9,50000
1,0,8.0,6,45000
2,5,6.0,7,60000
3,2,10.0,10,65000
4,7,9.0,6,70000
5,3,7.0,10,62000
6,10,8.0,7,72000
7,11,7.0,8,80000


In [18]:
# Fit a new linear model of salary on experience and the two scores.
# `reg` is rebound here, replacing the house-price model fitted earlier.
# The target is passed as a one-column DataFrame, which is why coef_ and the
# predictions below are 2-D arrays.
reg = linear_model.LinearRegression()
reg.fit(
    X=df1[['experience', 'test_score(out of 10)', 'interview_score(out of 10)']],
    y=df1[['salary($)']],
)

In [19]:
# Fitted coefficients, in the order passed to fit():
# experience, test score, interview score.
reg.coef_

array([[2812.95487627, 1845.70596798, 2205.24017467]])

In [20]:
# Fitted intercept (constant term) of the salary model.
reg.intercept_

array([17737.26346434])

In [21]:
# Predict the salary for candidates with:
# 1. 2 years of experience, test score 9, interview score 6
# 2. 12 years of experience, test score 10, interview score 10

In [22]:
# Predict the salary for 2 years of experience, test score 9, interview score 6.
# The model was fitted on a DataFrame, so the query is passed as a DataFrame
# with the same column names; a bare nested list makes scikit-learn warn that
# X lacks the feature names seen during fit.
reg.predict(pd.DataFrame(
    [[2, 9, 6]],
    columns=['experience', 'test_score(out of 10)', 'interview_score(out of 10)'],
))



array([[53205.96797671]])

In [23]:
# Manual check of the prediction above: sum(coefficient * feature) + intercept.
# The constants are the rounded values printed for reg.coef_ and reg.intercept_,
# so the result can differ from reg.predict in the last few digits.
2812.95487627*2 + 1845.70596798*9 + 2205.24017467*6 + 17737.26346434

53205.96797672001

In [24]:
# Predict the salary for 12 years of experience, test score 10, interview score 10.
# The model was fitted on a DataFrame, so the query is passed as a DataFrame
# with the same column names; a bare nested list makes scikit-learn warn that
# X lacks the feature names seen during fit.
reg.predict(pd.DataFrame(
    [[12, 10, 10]],
    columns=['experience', 'test_score(out of 10)', 'interview_score(out of 10)'],
))



array([[92002.18340611]])

In [25]:
# Manual check of the prediction above: sum(coefficient * feature) + intercept.
# The constants are the rounded values printed for reg.coef_ and reg.intercept_,
# so the result can differ from reg.predict in the last few digits.
2812.95487627*12 + 1845.70596798*10 + 2205.24017467*10 + 17737.26346434

92002.18340608