In [3]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [4]:
# Toy housing data, written one house per row: (area, bedrooms, age, price).
# None marks a bedroom count that was not recorded.
_housing_rows = [
    (2600, 3, 20, 550),
    (3000, 4, 15, 565),
    (3200, None, 18, 610),
    (3600, 3, 30, 595),
    (4000, 5, 8, 760),
]

# Transpose the rows into the four parallel column lists used below.
area, bedrooms, age, price = (list(column) for column in zip(*_housing_rows))

In [17]:
# Combine the parallel lists into one table: one row per house, one column
# per list, in the order area, bedrooms, age, price.
df = pd.DataFrame(dict(area=area, bedrooms=bedrooms, age=age, price=price))

In [18]:
# Show the assembled frame; the None in `bedrooms` appears as a missing value.
df

Unnamed: 0,area,bedrooms,age,price
0,2600,3.0,20,550
1,3000,4.0,15,565
2,3200,,18,610
3,3600,3.0,30,595
4,4000,5.0,8,760


In [22]:
from sklearn import linear_model
import math

In [25]:
# Impute missing bedroom counts with the column median, a reasonable first
# choice whenever a data point is absent.
#
# Median:
#   - odd number of values: the middle value
#   - even number of values: the average of the two middle values
#
# Series.median() skips missing entries. The result is floored because a
# bedroom count must be a whole number (here 3.5 -> 3).
med = df.bedrooms.median()

# fillna fills every missing row rather than one hard-coded index, so the cell
# stays correct if the data changes or more values turn out to be missing.
df["bedrooms"] = df["bedrooms"].fillna(math.floor(med))

In [26]:
# Re-display the frame to confirm the missing bedrooms entry (row 2) was filled.
df

Unnamed: 0,area,bedrooms,age,price
0,2600,3.0,20,550
1,3000,4.0,15,565
2,3200,3.0,18,610
3,3600,3.0,30,595
4,4000,5.0,8,760


In [28]:
# Ordinary least squares fit of price on area, bedrooms and age.
housing_features = df[['area', 'bedrooms', 'age']]
housing_target = df['price']

reg = linear_model.LinearRegression()
reg.fit(housing_features, housing_target)

In [29]:
# Fitted slopes, in the same order as the feature columns passed to fit():
# area, bedrooms, age.
reg.coef_

array([  0.13725, -26.025  ,  -6.825  ])

In [31]:
# Fitted intercept: the model's predicted price when every feature is zero.
reg.intercept_

383.7249999999998

In [33]:
# Unseen houses to price, one row each; the columns mirror the features the
# model was trained on.
new_df = pd.DataFrame(
    [
        [2500, 4, 5],
        [3000, 3, 40],
    ],
    columns=['area', 'bedrooms', 'age'],
)

In [34]:
# Predict from the feature columns only. Selecting them explicitly keeps this
# cell re-runnable: once 'predicted_price' has been added, passing the whole
# frame would hand the model a fourth column it was not trained on.
new_df['predicted_price'] = reg.predict(new_df[['area', 'bedrooms', 'age']])

In [35]:
# Inspect the new houses alongside their predicted prices.
new_df

Unnamed: 0,area,bedrooms,age,predicted_price
0,2500,4,5,588.625
1,3000,3,40,444.4


# Multivariate Linear Regression Assignment
### Salary Predictor

In [60]:
# Candidate records, one per row:
# (experience, test_score, interview_score, salary).
# Experience is spelled out in words; '' and None mark values that are missing.
_candidate_rows = [
    ('', 8, 9, 50000),
    ('', 8, 6, 45000),
    ('five', 6, 7, 60000),
    ('two', 10, 10, 65000),
    ('seven', 9, 6, 70000),
    ('three', 7, 10, 62000),
    ('ten', None, 7, 72000),
    ('eleven', 7, 8, 80000),
]

# Transpose the rows into the four parallel column lists used below.
experience, test_score, interview_score, salary = (
    list(column) for column in zip(*_candidate_rows)
)

In [61]:
# Combine the parallel lists into one table: one row per candidate, columns in
# the order experience, test_score, interview_score, salary.
cand_df = pd.DataFrame(dict(
    experience=experience,
    test_score=test_score,
    interview_score=interview_score,
    salary=salary,
))

In [62]:
import math

# Fill the missing test score with the column median, rounded down to a whole
# score. Bracket access makes it explicit that a column is being assigned.
test_score_med = math.floor(cand_df['test_score'].median())
cand_df['test_score'] = cand_df['test_score'].fillna(test_score_med)

In [63]:
!pip install word2number
from word2number import w2n



In [64]:
def _experience_to_years(value):
    """Return years of experience as a number.

    Spelled-out numbers are parsed with w2n.word_to_num (e.g. 'five' -> 5).
    Blank strings become 0, because word_to_num raises an error on them.
    Values that are not strings are returned unchanged, so re-running this
    cell on the already-converted column is a no-op instead of an error.
    """
    if not isinstance(value, str):
        return value
    text = value.strip()
    return w2n.word_to_num(text) if text else 0


cand_df['experience'] = cand_df['experience'].apply(_experience_to_years)

In [65]:
# Check the cleaned frame: experience is now numeric and test_score has no gaps.
cand_df

Unnamed: 0,experience,test_score,interview_score,salary
0,0,8.0,9,50000
1,0,8.0,6,45000
2,5,6.0,7,60000
3,2,10.0,10,65000
4,7,9.0,6,70000
5,3,7.0,10,62000
6,10,8.0,7,72000
7,11,7.0,8,80000


In [66]:
# Ordinary least squares fit of salary on experience and the two scores.
candidate_features = cand_df[['experience', 'test_score', 'interview_score']]
candidate_target = cand_df['salary']

cand_reg = linear_model.LinearRegression()
cand_reg.fit(candidate_features, candidate_target)

In [67]:
# Unseen candidates to score, one row each. The last row uses a deliberately
# out-of-range test score (-10) to probe how the model extrapolates.
new_candidates_df = pd.DataFrame(
    [
        [2, 9, 6],
        [12, 10, 10],
        [0, -10, 10],
    ],
    columns=['experience', 'test_score', 'interview_score'],
)

In [68]:
# Predict from the feature columns only. Selecting them explicitly keeps this
# cell re-runnable: once 'predicted_salary' has been added, passing the whole
# frame would hand the model a fourth column it was not trained on.
new_candidates_df['predicted_salary'] = cand_reg.predict(
    new_candidates_df[['experience', 'test_score', 'interview_score']]
)

In [69]:
# Inspect the new candidates alongside their predicted salaries.
new_candidates_df

Unnamed: 0,experience,test_score,interview_score,predicted_salary
0,2,9,6,53205.967977
1,12,10,10,92002.183406
2,0,-10,10,21332.605531


In [70]:
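# Even with an impossible test score of -10, the model still predicts a
# positive salary (about 21,333): a linear model extrapolates without bounds.
# Validate input ranges, or refine the model, to handle such edge cases.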

In [71]:
# Fitted parameters, one per line: first the slopes (in the order experience,
# test_score, interview_score), then the intercept.
print(cand_reg.coef_, cand_reg.intercept_, sep='\n')

[2812.95487627 1845.70596798 2205.24017467]
17737.263464337688


In [72]:
# Plain-text rendering of the predictions table (inputs plus predicted_salary).
print(new_candidates_df)

   experience  test_score  interview_score  predicted_salary
0           2           9                6      53205.967977
1          12          10               10      92002.183406
2           0         -10               10      21332.605531
