In [1]:
import pandas as pd

# Read the training set from the Excel workbook; per the original author's
# note, the CSV version of the file raised an error on load.
data = pd.read_excel(io='train.xlsx')

# Preview the first five rows as a quick sanity check on the load.
data.head(n=5)


Unnamed: 0,Employee ID,Date of Joining,Gender,Company Type,WFH Setup Available,Designation,Resource Allocation,Mental Fatigue Score,Burn Rate
0,fffe32003000360033003200,2008-09-30,Female,Service,No,2,3.0,3.8,0.16
1,fffe3700360033003500,2008-11-30,Male,Service,Yes,1,2.0,5.0,0.36
2,fffe31003300320037003900,2008-03-10,Female,Product,Yes,2,,5.8,0.49
3,fffe32003400380032003900,2008-11-03,Male,Service,Yes,1,1.0,2.6,0.2
4,fffe31003900340031003600,2008-07-24,Female,Service,No,3,7.0,6.9,0.52


In [2]:
from sklearn.preprocessing import LabelEncoder #label encoder encodes the object as numerical values
from sklearn.impute import SimpleImputer #importing imputer for substitute missing values with mean values

# Rows without a target value cannot be used for supervised training.
data = data.dropna(subset=['Burn Rate'])

# Fill gaps in the numeric features with each column's mean.
# One imputer is kept per column (in `imputers`) so the learned means stay
# available afterwards, e.g. to apply the same fill to unseen data. Refitting
# a single shared imputer would overwrite the first column's mean.
# NOTE(review): the means are computed on every row, including rows that the
# later train/test split holds out, so the test split influences this step.
# Consider fitting the imputers on the training split only.
imputers = {}
for column in ['Resource Allocation', 'Mental Fatigue Score']:
    imputer = SimpleImputer(strategy='mean')
    # fit_transform returns a 2-D (n_rows, 1) array for a one-column frame;
    # ravel() flattens it so a plain 1-D column is assigned.
    data[column] = imputer.fit_transform(data[[column]]).ravel()
    imputers[column] = imputer

# Encode the text-valued columns as integer codes.
# A separate encoder per column (in `label_encoders`) preserves each
# column's label-to-code mapping, so codes can be decoded later with
# label_encoders[column].inverse_transform(...).
# After the loop, `imputer` and `label_encoder` still refer to the last
# fitted objects, exactly as before this change.
label_encoders = {}
for column in ['Gender', 'Company Type', 'WFH Setup Available']:
    label_encoder = LabelEncoder()
    data[column] = label_encoder.fit_transform(data[column])
    label_encoders[column] = label_encoder

# 'Employee ID' and 'Date of Joining' are not used as predictors.
data = data.drop(columns=['Employee ID', 'Date of Joining'])

# Show the preprocessed frame to verify the remaining columns.
data.head()


Unnamed: 0,Gender,Company Type,WFH Setup Available,Designation,Resource Allocation,Mental Fatigue Score,Burn Rate
0,0,1,0,2,3.0,3.8,0.16
1,1,1,1,1,2.0,5.0,0.36
2,0,0,1,2,4.483831,5.8,0.49
3,1,1,1,1,1.0,2.6,0.2
4,0,1,0,3,7.0,6.9,0.52


In [3]:
from sklearn.model_selection import train_test_split

# Target (y) is the burn rate; every remaining column is a predictor (X).
# NOTE(review): the rendered frame header above starts with 'Unnamed: 0'.
# If that is a real column (a saved row index) rather than the display
# index, it is being fed to the model as a feature - confirm.
y = data['Burn Rate']
X = data.drop('Burn Rate', axis='columns')

# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=29,
)


In [5]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score, accuracy_score

# Fit a random forest regressor on the training split.
# fit() returns the estimator itself, so `model` is the fitted forest.
model = RandomForestRegressor(random_state=29).fit(X_train, y_train)

# Predict burn rate for the held-out rows.
y_pred = model.predict(X_test)

# Score the predictions against the held-out targets:
# mean squared error and the R^2 coefficient of determination.
r2 = r2_score(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)

(mse, r2)


(np.float64(0.003977563106978618), 0.8996695982302478)