In [1]:
import pandas as pd
import numpy as np
from sklearn import linear_model


In [15]:
# Toy housing data: one list per column; the third home has an unknown
# bedroom count (NaN).
homes = dict(
    Area=[2600, 3000, 3200, 3600, 4000],
    bedrooms=[3, 4, np.nan, 3, 5],
    age=[20, 15, 18, 30, 8],
    prices=[550000, 565000, 610000, 595000, 760000],
)
homes_df = pd.DataFrame.from_dict(homes)
homes_df

Unnamed: 0,Area,bedrooms,age,prices
0,2600,3.0,20,550000
1,3000,4.0,15,565000
2,3200,,18,610000
3,3600,3.0,30,595000
4,4000,5.0,8,760000


#### Data Preprocessing: Handling NaN Values

In [24]:
import math

# Median of the known bedroom counts, rounded down with math.floor so the
# replacement for the missing entry is a whole number of bedrooms.
bedrooms_median = homes_df['bedrooms'].median()
median_bedrooms = math.floor(bedrooms_median)
median_bedrooms

3

In [32]:
# Impute only the 'bedrooms' column with the median bedroom count.
# A frame-wide fillna would also write the bedroom median into any missing
# Area / age / prices value, so the replacement is scoped to the one column
# it is meant for. Assigning the result back (instead of inplace=True) keeps
# the cell safe to re-run.
homes_df['bedrooms'] = homes_df['bedrooms'].fillna(median_bedrooms)
homes_df

Unnamed: 0,Area,bedrooms,age,prices
0,2600,3.0,20,550000
1,3000,4.0,15,565000
2,3200,3.0,18,610000
3,3600,3.0,30,595000
4,4000,5.0,8,760000


### Data Modelling for Linear Regression

In [37]:
# Fit price ~ Area + bedrooms + age.
# General form: reg.fit(dataframe[[independent_variables]], dataframe[dependent_variable])
reg = linear_model.LinearRegression()
reg.fit(
    homes_df[['Area', 'bedrooms', 'age']],
    homes_df['prices'],
)

In [45]:
# Show the fitted slopes (one per feature) and the intercept, each label on
# its own line followed by the value.
print('The coefficient or (m1, m2, m3)', reg.coef_, sep='\n')
print('The intercept or b', reg.intercept_, sep='\n')

The coefficient or (m1, m2, m3)
[   137.25 -26025.    -6825.  ]
The intercept or b
383724.9999999998


#### Predicting the prices

In [54]:
# Predict the price of one home: Area=3000, bedrooms=3, age=40.
# price = m1*area + m2*bedrooms + m3*age + intercept
# The model was fitted on a DataFrame with named columns, so the query is
# passed as a DataFrame with the same column names. This makes the feature
# order explicit and avoids the feature-name mismatch warning that newer
# scikit-learn versions emit for a bare list.
reg.predict(pd.DataFrame([[3000, 3, 40]], columns=['Area', 'bedrooms', 'age']))




array([444400.])

In [56]:
# Predict for Area=3200, bedrooms=3, age=18. The query is a DataFrame with
# the training column names so the feature order is explicit and matches
# how the model was fitted.
reg.predict(pd.DataFrame([[3200, 3, 18]], columns=['Area', 'bedrooms', 'age']))



array([622000.])

#### Question

In [88]:
# Homes to price, given as area / rooms / age, e.g.
#   3000 3 40
#   2500 4 5
predict_data = {
    'area': [2500, 3000, 3500, 4400],
    'rooms': [3, 4, 6, 7],
    'age': [40, 5, 3, 6],
}
# Rename the columns to the names the model was trained with.
predict_dfs = pd.DataFrame(predict_data).rename(
    columns={'area': 'Area', 'rooms': 'bedrooms'}
)
predict_dfs

Unnamed: 0,Area,bedrooms,age
0,2500,3,40
1,3000,4,5
2,3500,6,3
3,4400,7,6


In [90]:
# Predict a price for every row of predict_dfs.
# The feature columns are selected explicitly so this cell still works when
# re-run after a price column has been added to predict_dfs; passing the whole
# frame would then hand the model four columns instead of three.
price_predict = reg.predict(predict_dfs[['Area', 'bedrooms', 'age']])

In [92]:
# Attach the model output as a new 'predicted_price' column.
predict_dfs = predict_dfs.assign(predicted_price=price_predict)

In [100]:
# Use the same column name as homes_df ('prices') for the predicted values.
predict_dfs = predict_dfs.rename(columns={'predicted_price': 'prices'})
predict_dfs

Unnamed: 0,Area,bedrooms,age,prices
0,2500,3,40,375775.0
1,3000,4,5,657250.0
2,3500,6,3,687475.0
3,4400,7,6,764500.0


In [108]:
# Stack the original homes and the predicted homes into one table, ordered
# from cheapest to most expensive, with a fresh 0..n-1 index.
result_df = (
    pd.concat([homes_df, predict_dfs], ignore_index=True)
    .sort_values(by='prices', ascending=True, ignore_index=True)
)
result_df

Unnamed: 0,Area,bedrooms,age,prices
0,2500,3.0,40,375775.0
1,2600,3.0,20,550000.0
2,3000,4.0,15,565000.0
3,3600,3.0,30,595000.0
4,3200,3.0,18,610000.0
5,3000,4.0,5,657250.0
6,3500,6.0,3,687475.0
7,4000,5.0,8,760000.0
8,4400,7.0,6,764500.0


### **EXERCISE**
- In the exercise folder (same level as this notebook on GitHub) there is **hiring.csv**. This file contains hiring statistics for a firm, such as the candidate's experience, written test score and personal interview score. Based on these 3 factors, HR will decide the salary. Given this data, you need to build a machine learning model for the HR department that can help them decide salaries for future candidates. Using this model, predict salaries for the following candidates:
-  **2 yr experience, 9 test score, 6 interview score**
-  **12 yr experience, 10 test score, 10 interview score**
#### **Answer**
- 53713.86 and 93747.79

### **Data PreProcessing**
***Importing the Hiring.csv file***


In [139]:
# NOTE(review): this is an absolute, machine-specific path; the notebook only
# runs where the file exists at exactly this location.
hiring_csv_path = r'D:\Works\Artificial Intellegence\Machine Learning\hiring.csv'
hiring_csv = pd.read_csv(hiring_csv_path)
print(type(hiring_csv))
hiring_csv

<class 'pandas.core.frame.DataFrame'>


Unnamed: 0,experience,test_score(out of 10),interview_score(out of 10),salary($)
0,,8.0,9,50000
1,,8.0,6,45000
2,five,6.0,7,60000
3,two,10.0,10,65000
4,seven,9.0,6,70000
5,three,7.0,10,62000
6,ten,,7,72000
7,eleven,7.0,8,80000


In [145]:
# Missing experience is recorded as the word 'zero', matching the spelled-out
# style of the other entries in the column.
hiring_csv['experience'] = hiring_csv.experience.fillna(value='zero')
hiring_csv



Unnamed: 0,experience,test_score(out of 10),interview_score(out of 10),salary($)
0,zero,8.0,9,50000
1,zero,8.0,6,45000
2,five,6.0,7,60000
3,two,10.0,10,65000
4,seven,9.0,6,70000
5,three,7.0,10,62000
6,ten,,7,72000
7,eleven,7.0,8,80000


In [150]:
# Shorten the verbose column headers (old name -> new name).
short_column_names = {
    'test_score(out of 10)': 'test_score(10)',
    'interview_score(out of 10)': 'interview_score',
    'salary($)': 'salary',
}
hiring_csv = hiring_csv.rename(columns=short_column_names)
hiring_csv

Unnamed: 0,experience,test_score(10),interview_score,salary
0,zero,8.0,9,50000
1,zero,8.0,6,45000
2,five,6.0,7,60000
3,two,10.0,10,65000
4,seven,9.0,6,70000
5,three,7.0,10,62000
6,ten,,7,72000
7,eleven,7.0,8,80000


In [162]:
# Replace the missing test score with the median of the known scores.
test_scores = hiring_csv['test_score(10)']
test_score_median = test_scores.median()
print('median of the test score ', test_score_median)
hiring_csv['test_score(10)'] = test_scores.fillna(test_score_median)
hiring_csv

median of the test score  8.0


Unnamed: 0,experience,test_score(10),interview_score,salary
0,zero,8.0,9,50000
1,zero,8.0,6,45000
2,five,6.0,7,60000
3,two,10.0,10,65000
4,seven,9.0,6,70000
5,three,7.0,10,62000
6,ten,8.0,7,72000
7,eleven,7.0,8,80000


In [164]:
#converting word numbers into integers
!pip install word2number

Collecting word2number
  Downloading word2number-1.1.zip (9.7 kB)
  Preparing metadata (setup.py): started
  Preparing metadata (setup.py): finished with status 'done'
Building wheels for collected packages: word2number
  Building wheel for word2number (setup.py): started
  Building wheel for word2number (setup.py): finished with status 'done'
  Created wheel for word2number: filename=word2number-1.1-py3-none-any.whl size=5589 sha256=39beeb1dc65f4aa2c4ee20553f00e4f68b3c10b277e9e5efd7ad6919732fb90d
  Stored in directory: c:\users\workstation\appdata\local\pip\cache\wheels\5b\79\fb\d25928e599c7e11fe4e00d32048cd74933f34a74c633d2aea6
Successfully built word2number
Installing collected packages: word2number
Successfully installed word2number-1.1


In [170]:
from word2number import w2n
# Quick check of the converter on a two-word number before applying it to the data.
word_number = "twenty five"
numeric_value = w2n.word_to_num(word_number)
print(numeric_value)  # Output: 25

25


In [174]:
# Turn spelled-out experience values into integers; anything that is not a
# string is passed through unchanged.
hiring_csv['experience'] = hiring_csv['experience'].apply(
    lambda value: value if not isinstance(value, str) else w2n.word_to_num(value)
)
hiring_csv

Unnamed: 0,experience,test_score(10),interview_score,salary
0,0,8.0,9,50000
1,0,8.0,6,45000
2,5,6.0,7,60000
3,2,10.0,10,65000
4,7,9.0,6,70000
5,3,7.0,10,62000
6,10,8.0,7,72000
7,11,7.0,8,80000


### Linear Regression Modelling

In [189]:
# Independent variables (features): experience, test_score, interview_score
# Dependent variable (target): salary
# General form: hiring_reg.fit(df[[independent_variables]], df[dependent_variable])
hiring_reg = linear_model.LinearRegression()
hiring_reg.fit(
    hiring_csv[['experience', 'test_score(10)', 'interview_score']],
    hiring_csv['salary'],
)

In [199]:
# Back to the question. The first two rows are the exercise's candidates:
#   2 yr experience, 9 test score, 6 interview score
#   12 yr experience, 10 test score, 10 interview score
# The remaining two rows are extra candidates.
predicting_salary = {
    'experience': [2, 12, 20, 4],
    'test_score(10)': [9, 10, 10, 10],
    'interview_score': [6, 10, 8, 10],
}
predicting_dfs = pd.DataFrame.from_dict(predicting_salary)
predicting_dfs

Unnamed: 0,experience,test_score(10),interview_score
0,2,9,6
1,12,10,10
2,20,10,8
3,4,10,10


In [201]:
# Predict a salary for each candidate and store it in a new 'salary' column.
# The three feature columns are selected explicitly: once 'salary' exists,
# passing the whole frame would give the model four columns instead of the
# three it was fitted on, so the cell would fail when re-run.
predicting_dfs['salary'] = hiring_reg.predict(
    predicting_dfs[['experience', 'test_score(10)', 'interview_score']]
)


In [203]:
# Display the candidate table.
predicting_dfs

Unnamed: 0,experience,test_score(10),interview_score,salary
0,2,9,6,53205.967977
1,12,10,10,92002.183406
2,20,10,8,110095.342067
3,4,10,10,69498.544396


In [213]:
# Stack the predicted candidates on top of the original hiring data and
# order the combined table from highest to lowest salary, with a fresh index.
final_csv = (
    pd.concat([predicting_dfs, hiring_csv], ignore_index=True)
    .sort_values(by='salary', ascending=False, ignore_index=True)
)
final_csv

Unnamed: 0,experience,test_score(10),interview_score,salary
0,20,10.0,8,110095.342067
1,12,10.0,10,92002.183406
2,11,7.0,8,80000.0
3,10,8.0,7,72000.0
4,7,9.0,6,70000.0
5,4,10.0,10,69498.544396
6,2,10.0,10,65000.0
7,3,7.0,10,62000.0
8,5,6.0,7,60000.0
9,2,9.0,6,53205.967977


In [215]:
# Write final_csv to a CSV file in the current working directory;
# index=False leaves the row index out of the file.
final_csv.to_csv('Teacher_salary.csv',index=False)

### **Conclusion** 
- Experience matters the most.
- A fresher who scores 10 on both the test and the interview but has little experience still ranks only as an average teacher by predicted salary.