In [1]:
import pandas as pd
from sklearn.linear_model import LinearRegression
from word2number import w2n

In [2]:
df = pd.read_csv('hiring.csv')
df

Unnamed: 0,experience,test_score(out of 10),interview_score(out of 10),salary($)
0,,8.0,9,50000
1,,8.0,6,45000
2,five,6.0,7,60000
3,two,10.0,10,65000
4,seven,9.0,6,70000
5,three,7.0,10,62000
6,ten,,7,72000
7,eleven,7.0,8,80000


## Data Preprocessing

### Filling expericience as zero for NaN

In [3]:
df.experience = df.experience.fillna('zero')
df

Unnamed: 0,experience,test_score(out of 10),interview_score(out of 10),salary($)
0,zero,8.0,9,50000
1,zero,8.0,6,45000
2,five,6.0,7,60000
3,two,10.0,10,65000
4,seven,9.0,6,70000
5,three,7.0,10,62000
6,ten,,7,72000
7,eleven,7.0,8,80000


### Converting experience from word to numeric form

In [4]:
exp = df.experience.values
for i in range(len(exp)):
    exp[i] = w2n.word_to_num(exp[i])

In [5]:
exp

array([0, 0, 5, 2, 7, 3, 10, 11], dtype=object)

In [6]:
df.experience = exp

In [7]:
df

Unnamed: 0,experience,test_score(out of 10),interview_score(out of 10),salary($)
0,0,8.0,9,50000
1,0,8.0,6,45000
2,5,6.0,7,60000
3,2,10.0,10,65000
4,7,9.0,6,70000
5,3,7.0,10,62000
6,10,,7,72000
7,11,7.0,8,80000


### Filling out missing entry for test_score via median 

In [8]:
import math
test_score = df['test_score(out of 10)']
test_score_mean = math.floor(test_score.mean())
test_score_mean

7

In [9]:
df['test_score(out of 10)'] = df['test_score(out of 10)'].fillna(test_score_mean)
df

Unnamed: 0,experience,test_score(out of 10),interview_score(out of 10),salary($)
0,0,8.0,9,50000
1,0,8.0,6,45000
2,5,6.0,7,60000
3,2,10.0,10,65000
4,7,9.0,6,70000
5,3,7.0,10,62000
6,10,7.0,7,72000
7,11,7.0,8,80000


### Fit the data in Linear Regression Model

x includes input data - experience, test_score, interview_score

y includes output data - salary

In [10]:
x = df.iloc[:, 0:-1].values
y = df.iloc[:, -1].values
x, y

(array([[0, 8.0, 9],
        [0, 8.0, 6],
        [5, 6.0, 7],
        [2, 10.0, 10],
        [7, 9.0, 6],
        [3, 7.0, 10],
        [10, 7.0, 7],
        [11, 7.0, 8]], dtype=object),
 array([50000, 45000, 60000, 65000, 70000, 62000, 72000, 80000],
       dtype=int64))

In [11]:
reg = LinearRegression()
reg.fit(x, y)

LinearRegression()

In [12]:
reg.predict([[2, 9, 6]])

array([53713.86677124])

In [13]:
reg.predict([[12, 10, 10]])

array([93747.79628651])

In [14]:
reg.coef_

array([2922.26901502, 2221.30909959, 2147.48256637])

In [15]:
reg.intercept_

14992.651446693126