Question #3: Using the Placement_Data_Full_Class.csv dataset, predict the salary of a candidate, given that the candidate is offered a job.

In [1]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression
from csv import reader
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score, classification_report, r2_score, mean_squared_error
from sklearn.preprocessing import LabelEncoder

In [2]:
# Read the placement CSV into a DataFrame; row 0 of the file supplies the column names.
filename = 'data/Placement_Data_Full_Class.csv'
dataset = pd.read_csv(filepath_or_buffer=filename, header=0)
# Preview the first rows to sanity-check the load.
dataset.head()

Unnamed: 0,sl_no,gender,ssc_p,ssc_b,hsc_p,hsc_b,hsc_s,degree_p,degree_t,workex,etest_p,specialisation,mba_p,status,salary
0,1,M,67.0,Others,91.0,Others,Commerce,58.0,Sci&Tech,No,55.0,Mkt&HR,58.8,Placed,270000.0
1,2,M,79.33,Central,78.33,Others,Science,77.48,Sci&Tech,Yes,86.5,Mkt&Fin,66.28,Placed,200000.0
2,3,M,65.0,Central,68.0,Central,Arts,64.0,Comm&Mgmt,No,75.0,Mkt&Fin,57.8,Placed,250000.0
3,4,M,56.0,Central,52.0,Central,Science,52.0,Sci&Tech,No,66.0,Mkt&HR,59.43,Not Placed,
4,5,M,85.8,Central,73.6,Central,Commerce,73.3,Comm&Mgmt,No,96.8,Mkt&Fin,55.5,Placed,425000.0


In [3]:
# Drop the 'sl_no' column; it is not used as a feature for the salary prediction.
# 'salary' is kept because it is the regression target.
# Re-binding with errors='ignore' (rather than inplace=True) keeps this cell re-runnable:
# a second execution no longer raises KeyError once 'sl_no' is already gone.
dataset = dataset.drop(columns=['sl_no'], errors='ignore')

In [4]:
# Count the missing entries in every column.
dataset.isna().sum()

gender             0
ssc_p              0
ssc_b              0
hsc_p              0
hsc_b              0
hsc_s              0
degree_p           0
degree_t           0
workex             0
etest_p            0
specialisation     0
mba_p              0
status             0
salary            67
dtype: int64

In [7]:
# One-hot encode the 'hsc_s' and 'degree_t' columns into indicator columns.
# dtype=int pins the indicators to 0/1 integers: pandas >= 2.0 would otherwise emit
# booleans, which no longer matches the 0/1 values this notebook displays.
encoded_dataset = pd.get_dummies(dataset, columns=['hsc_s', 'degree_t'], dtype=int)
encoded_dataset

Unnamed: 0,gender,ssc_p,ssc_b,hsc_p,hsc_b,degree_p,workex,etest_p,specialisation,mba_p,status,salary,hsc_s_Arts,hsc_s_Commerce,hsc_s_Science,degree_t_Comm&Mgmt,degree_t_Others,degree_t_Sci&Tech
0,M,67.00,Others,91.00,Others,58.00,No,55.0,Mkt&HR,58.80,Placed,270000.0,0,1,0,0,0,1
1,M,79.33,Central,78.33,Others,77.48,Yes,86.5,Mkt&Fin,66.28,Placed,200000.0,0,0,1,0,0,1
2,M,65.00,Central,68.00,Central,64.00,No,75.0,Mkt&Fin,57.80,Placed,250000.0,1,0,0,1,0,0
3,M,56.00,Central,52.00,Central,52.00,No,66.0,Mkt&HR,59.43,Not Placed,,0,0,1,0,0,1
4,M,85.80,Central,73.60,Central,73.30,No,96.8,Mkt&Fin,55.50,Placed,425000.0,0,1,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
210,M,80.60,Others,82.00,Others,77.60,No,91.0,Mkt&Fin,74.49,Placed,400000.0,0,1,0,1,0,0
211,M,58.00,Others,60.00,Others,72.00,No,74.0,Mkt&Fin,53.62,Placed,275000.0,0,0,1,0,0,1
212,M,67.00,Others,67.00,Others,73.00,Yes,59.0,Mkt&Fin,69.72,Placed,295000.0,0,1,0,1,0,0
213,F,74.00,Others,66.00,Others,58.00,No,70.0,Mkt&HR,60.23,Placed,204000.0,0,1,0,1,0,0


In [8]:
# Replace the remaining string columns with integer codes (0, 1, 2, ...).
encoder = LabelEncoder()
columns_to_encode = [
    'gender',
    'ssc_b',
    'hsc_b',
    'workex',
    'specialisation',
    'status',
]
for column in columns_to_encode:
    # fit_transform re-fits the shared encoder, so every column gets its own mapping.
    integer_codes = encoder.fit_transform(encoded_dataset[column])
    encoded_dataset[column] = integer_codes
encoded_dataset.head()

Unnamed: 0,gender,ssc_p,ssc_b,hsc_p,hsc_b,degree_p,workex,etest_p,specialisation,mba_p,status,salary,hsc_s_Arts,hsc_s_Commerce,hsc_s_Science,degree_t_Comm&Mgmt,degree_t_Others,degree_t_Sci&Tech
0,1,67.0,1,91.0,1,58.0,0,55.0,1,58.8,1,270000.0,0,1,0,0,0,1
1,1,79.33,0,78.33,1,77.48,1,86.5,0,66.28,1,200000.0,0,0,1,0,0,1
2,1,65.0,0,68.0,0,64.0,0,75.0,0,57.8,1,250000.0,1,0,0,1,0,0
3,1,56.0,0,52.0,0,52.0,0,66.0,1,59.43,0,,0,0,1,0,0,1
4,1,85.8,0,73.6,0,73.3,0,96.8,0,55.5,1,425000.0,0,1,0,1,0,0


In [13]:
# Keep only the rows that have a salary value; rows with a missing target cannot be
# used to fit or score the regression. The frame is modified in place.
encoded_dataset.dropna(axis=0, how='any', subset=['salary'], inplace=True)
encoded_dataset.head()

Unnamed: 0,gender,ssc_p,ssc_b,hsc_p,hsc_b,degree_p,workex,etest_p,specialisation,mba_p,status,salary,hsc_s_Arts,hsc_s_Commerce,hsc_s_Science,degree_t_Comm&Mgmt,degree_t_Others,degree_t_Sci&Tech
0,1,67.0,1,91.0,1,58.0,0,55.0,1,58.8,1,270000.0,0,1,0,0,0,1
1,1,79.33,0,78.33,1,77.48,1,86.5,0,66.28,1,200000.0,0,0,1,0,0,1
2,1,65.0,0,68.0,0,64.0,0,75.0,0,57.8,1,250000.0,1,0,0,1,0,0
4,1,85.8,0,73.6,0,73.3,0,96.8,0,55.5,1,425000.0,0,1,0,1,0,0
7,1,82.0,0,64.0,0,66.0,1,67.0,0,62.14,1,252000.0,0,0,1,0,0,1


In [15]:
# Features: every column except the target. Target: the 'salary' column.
# NOTE(review): 'status' is presumably constant once rows without a salary are removed,
# so it likely adds no information as a feature - confirm and consider excluding it.
X = encoded_dataset.drop(columns='salary')
y = encoded_dataset['salary']

In [37]:
# Hold out 20% of the rows for evaluation; the fixed seed makes the split reproducible.
# NOTE(review): the saved output (133 / 15 rows) looks like it came from a smaller
# test fraction than 0.2 - re-run this cell to refresh it.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)
X_train.shape, X_test.shape

((133, 17), (15, 17))

# LINEAR REGRESSION

In [51]:
# Fit an ordinary least-squares model on the training split.
# The former `normalize=False` argument is omitted: the parameter was removed from
# scikit-learn's LinearRegression (passing it raises TypeError on current releases),
# and False was its default, so the fitted model is unchanged.
linear = LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None).fit(X_train, y_train)

In [52]:
# Predict salaries for the held-out rows and display the raw predictions.
y_pred = linear.predict(X=X_test)
y_pred

array([312068.28388832, 340135.26148319, 291659.5623686 , 307256.0099765 ,
       257510.13747892, 247967.82446775, 257901.41171489, 293409.38903178,
       319331.18367557, 314733.0022903 , 320285.32429822, 311386.39063225,
       318318.64724552, 261676.78060867, 218658.28344846])

In [53]:
# First ten actual salaries from the test split, for comparison with the predictions.
y_test.head(10)

178    350000.0
74     336000.0
203    260000.0
28     350000.0
145    400000.0
20     265000.0
112    250000.0
48     250000.0
117    240000.0
15     200000.0
Name: salary, dtype: float64

In [54]:
# Coefficient of determination (R^2) of the predictions on the test split.
r2_score(y_true=y_test, y_pred=y_pred)

-0.06270517303525946

In [55]:
# Root-mean-squared error on the test split, in the same units as salary.
# The square root is taken explicitly because the `squared=False` keyword is deprecated
# and removed in newer scikit-learn releases; this form works on every version.
np.sqrt(mean_squared_error(y_test, y_pred))

67394.23578479311