In [2]:
import pandas as pd
from sklearn import linear_model
import numpy as np
import math

In [3]:
# Read the hiring dataset into a DataFrame and display it.
df = pd.read_csv(filepath_or_buffer='/content/sample_data/hiring.csv')
df

Unnamed: 0,experience,test_score(out of 10),interview_score(out of 10),salary($)
0,,8.0,9,50000
1,,8.0,6,45000
2,five,6.0,7,60000
3,two,10.0,10,65000
4,seven,9.0,6,70000
5,three,7.0,10,62000
6,ten,,7,72000
7,eleven,7.0,8,80000


In [4]:
# Median of the observed test scores, rounded down to a whole number;
# used as the replacement for the missing test score.
missing_test_score = math.floor(
    df['test_score(out of 10)'].median()
)
missing_test_score

8

In [5]:
# Fill the missing test score with the value held in missing_test_score.
df['test_score(out of 10)'] = (
    df['test_score(out of 10)']
    .fillna(value=missing_test_score)
)
df

Unnamed: 0,experience,test_score(out of 10),interview_score(out of 10),salary($)
0,,8.0,9,50000
1,,8.0,6,45000
2,five,6.0,7,60000
3,two,10.0,10,65000
4,seven,9.0,6,70000
5,three,7.0,10,62000
6,ten,8.0,7,72000
7,eleven,7.0,8,80000


In [6]:
# Treat a missing experience entry as the word "zero" so that every row
# holds a number word.
df['experience'] = df['experience'].fillna(value="zero")
df

Unnamed: 0,experience,test_score(out of 10),interview_score(out of 10),salary($)
0,zero,8.0,9,50000
1,zero,8.0,6,45000
2,five,6.0,7,60000
3,two,10.0,10,65000
4,seven,9.0,6,70000
5,three,7.0,10,62000
6,ten,8.0,7,72000
7,eleven,7.0,8,80000


In [7]:
pip install word2number

Collecting word2number
  Downloading https://files.pythonhosted.org/packages/4a/29/a31940c848521f0725f0df6b25dca8917f13a2025b0e8fcbe5d0457e45e6/word2number-1.1.zip
Building wheels for collected packages: word2number
  Building wheel for word2number (setup.py) ... [?25l[?25hdone
  Created wheel for word2number: filename=word2number-1.1-cp36-none-any.whl size=5588 sha256=06dfd6d116a815b4cdca0a2b204bdcda8b2987125d725834700f13ed9b90292c
  Stored in directory: /root/.cache/pip/wheels/46/2f/53/5f5c1d275492f2fce1cdab9a9bb12d49286dead829a4078e0e
Successfully built word2number
Installing collected packages: word2number
Successfully installed word2number-1.1


Now I will use the word2number library to convert the experience column into numeric values so that it can be used in the linear regression model.

In [8]:
from word2number import w2n

In [9]:
# Convert spelled-out experience ("five", "zero", ...) into integers for the model.
# w2n.word_to_num raises ValueError on non-string input, so values that are
# already numeric are passed through unchanged; this keeps the cell safe to re-run.
df['experience'] = df['experience'].apply(
    lambda value: w2n.word_to_num(value) if isinstance(value, str) else value
)
df

Unnamed: 0,experience,test_score(out of 10),interview_score(out of 10),salary($)
0,0,8.0,9,50000
1,0,8.0,6,45000
2,5,6.0,7,60000
3,2,10.0,10,65000
4,7,9.0,6,70000
5,3,7.0,10,62000
6,10,8.0,7,72000
7,11,7.0,8,80000


In [10]:
# Fit a linear regression that predicts salary from experience,
# test score and interview score (in that column order).
reg = linear_model.LinearRegression()
reg.fit(
    X=df[['experience', 'test_score(out of 10)', 'interview_score(out of 10)']],
    y=df['salary($)'],
)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [11]:
# Predict the salary for experience=2, test score=9, interview score=6
# (same feature order as the columns passed to fit).
reg.predict(X=[[2, 9, 6]])

array([53205.96797671])

Pickle is a library used to save a model. A model is often trained on a large set of data (gigabytes in size), so it is wise to train the model once and store it. When you need to make predictions, just load the file.

In [12]:
import pickle

In [13]:
# Serialize the fitted regressor to disk in binary mode.
with open('/content/model', 'wb') as model_file:
    pickle.dump(reg, model_file)

In [16]:
# Load the pickled regressor back from disk.
# Only unpickle files you trust: pickle.load can run arbitrary code from the file.
with open('/content/model', 'rb') as model_file:
    model = pickle.load(model_file)

In [17]:
# The model restored from pickle is queried with the same candidate as before
# (experience=2, test score=9, interview score=6).
model.predict(X=[[2, 9, 6]])

array([53205.96797671])

Joblib serves the same purpose as pickle: it saves a model to disk and loads it back. Joblib should be preferred when the object being saved holds large NumPy arrays, because it stores them more efficiently. Otherwise, joblib and pickle work the same way.

In [19]:
# sklearn.externals.joblib was deprecated in scikit-learn 0.21 and removed in 0.23;
# joblib is a standalone package. Fall back to the bundled copy only on old installs.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib



In [22]:
# Persist the fitted regressor with joblib; the call returns the list of files written.
joblib.dump(reg, filename='/content/model_joblib')

['/content/model_joblib']

In [23]:
# Restore the regressor from the joblib file.
model_from_joblib = joblib.load(filename='/content/model_joblib')

In [25]:
# The model restored from joblib is queried with the same candidate
# (experience=2, test score=9, interview score=6).
model_from_joblib.predict(X=[[2, 9, 6]])

array([53205.96797671])

In [26]:
# Fitted coefficients, one per feature in the order used for fit:
# experience, test score, interview score.
model_from_joblib.coef_

array([2812.95487627, 1845.70596798, 2205.24017467])