In [132]:
import pandas as pd
import numpy as np
import re
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score,mean_squared_error,mean_absolute_error
from sklearn.model_selection import cross_val_score,KFold

In [133]:
df=pd.read_csv("hiring.csv")
df

Unnamed: 0,experience,test_score(out of 10),interview_score(out of 10),salary($)
0,,8.0,9,50000
1,,8.0,6,45000
2,five,6.0,7,60000
3,two,10.0,10,65000
4,seven,9.0,6,70000
5,three,7.0,10,62000
6,ten,,7,72000
7,eleven,7.0,8,80000


In [134]:
df.count()

experience                    6
test_score(out of 10)         7
interview_score(out of 10)    8
salary($)                     8
dtype: int64

In [135]:
df["experience"].fillna("ZERO",inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df["experience"].fillna("ZERO",inplace=True)


In [136]:
word_to_num={"zero":0,"one":1,"two":2,"three":3,"four":4,"five":5,"six":6,"seven":7,"eight":8,"nine":9,"None":0,"unknown":0}

In [137]:
def clean(value):
    if pd.isna(value):#handle Nan
        return 0
    value=str(value).lower().strip()
    if value in word_to_num:
        return word_to_num[value]
    match=re.search(r'\d+',value)
    if match:
        return int(match.group)
    return 0
        
    

In [138]:
df["experience"]=df["experience"].apply(clean)

In [139]:
df

Unnamed: 0,experience,test_score(out of 10),interview_score(out of 10),salary($)
0,0,8.0,9,50000
1,0,8.0,6,45000
2,5,6.0,7,60000
3,2,10.0,10,65000
4,7,9.0,6,70000
5,3,7.0,10,62000
6,0,,7,72000
7,0,7.0,8,80000


In [140]:
df["test_score(out of 10)"].fillna(df["test_score(out of 10)"].median(),inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df["test_score(out of 10)"].fillna(df["test_score(out of 10)"].median(),inplace=True)


In [141]:
df


Unnamed: 0,experience,test_score(out of 10),interview_score(out of 10),salary($)
0,0,8.0,9,50000
1,0,8.0,6,45000
2,5,6.0,7,60000
3,2,10.0,10,65000
4,7,9.0,6,70000
5,3,7.0,10,62000
6,0,8.0,7,72000
7,0,7.0,8,80000


In [142]:
df.count()

experience                    8
test_score(out of 10)         8
interview_score(out of 10)    8
salary($)                     8
dtype: int64

In [143]:
df.isnull()

Unnamed: 0,experience,test_score(out of 10),interview_score(out of 10),salary($)
0,False,False,False,False
1,False,False,False,False
2,False,False,False,False
3,False,False,False,False
4,False,False,False,False
5,False,False,False,False
6,False,False,False,False
7,False,False,False,False


In [144]:
X=df.drop('salary($)',axis="columns")

In [145]:
X

Unnamed: 0,experience,test_score(out of 10),interview_score(out of 10)
0,0,8.0,9
1,0,8.0,6
2,5,6.0,7
3,2,10.0,10
4,7,9.0,6
5,3,7.0,10
6,0,8.0,7
7,0,7.0,8


In [146]:
y=df.iloc[:,-1]

In [147]:
y

0    50000
1    45000
2    60000
3    65000
4    70000
5    62000
6    72000
7    80000
Name: salary($), dtype: int64

In [148]:
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.3,random_state=2)

In [149]:
lr=LinearRegression()
lr.fit(X_train,y_train)

In [150]:
y_pred=lr.predict(X_test)

In [151]:
print(r2_score(y_test,y_pred))
print(mean_squared_error(y_test,y_pred))
print(mean_absolute_error(y_test,y_pred))

-0.6981922134340424
256238336.20482552
12408.277404921697


In [152]:
l=LinearRegression()

In [153]:
kf=KFold(n_splits=5, shuffle=True, random_state=None)
cross_val_score(l,X,y)



array([-131.53097667, -257.586938  ,  -27.05537258,           nan,
                 nan])

In [154]:
lr.predict([[2,9,6]])



array([68442.11409396])

In [156]:
model=LinearRegression()
model.fit(X,y)

In [157]:
model.predict([[2,9,6]])



array([61874.41645719])

In [158]:
model.predict([[12,10,10]])



array([71651.05422459])