In [None]:
# Business Problem Statement:
'''Company XYZ wants to improve its employee performance prediction system.
They have collected data on several features such as education level, 
years of experience, training hours, and so on. The goal is to build
a machine learning model that can accurately predict employee performance.
The company believes that not all features are equally important, 
and they want to use Recursive Feature Elimination (RFE) to identify
the most relevant features for the prediction model. Once the relevant
features are identified, a model should be trained and tested.
'''
# using Recursive Feature Elimination (RFE) 

In [3]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_selection import RFE
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler

In [6]:
# datasets
np.random.seed(42)
data_size = 1000
features = {'education_level': np.random.randint(1,5,data_size),
            'years_of_experience': np.random.uniform(1,20,data_size),
           'training_hours': np.random.uniform(10,100,data_size),
           'communication_skills': np.random.randint(1,5,data_size),
           'problem_solving_skills': np.random.randint(1,5,data_size),
           'performance': np.random.uniform(50,100,data_size)}
df = pd.DataFrame(features)
print(df)

     education_level  years_of_experience  training_hours  \
0                  3            14.265073       56.717361   
1                  4            11.185831       53.126369   
2                  1             6.881025       12.307786   
3                  3            16.462105       40.712304   
4                  3            14.009892       44.217606   
..               ...                  ...             ...   
995                1            13.696043       54.309293   
996                1            12.770317       61.955113   
997                4             9.806387       87.901943   
998                4             8.215930       98.266541   
999                3            17.403339       46.682579   

     communication_skills  problem_solving_skills  performance  
0                       3                       3    64.325977  
1                       3                       1    90.132016  
2                       3                       1    99.861895  
3      

In [7]:
X = df.drop('performance', axis=1)
y = df['performance']

In [8]:
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2,random_state=42)

In [9]:
model = LinearRegression()
selector = RFE(model,n_features_to_select=3)
selector = selector.fit(X_train,y_train)

In [10]:
selected_features = X.columns[selector.support_]
print("Selected Features:",selected_features)

Selected Features: Index(['education_level', 'communication_skills', 'problem_solving_skills'], dtype='object')


In [11]:
# training
X_train_selected = selector.transform(X_train)
X_test_selected = selector.transform(X_test)

model = RandomForestRegressor(n_estimators = 100,random_state=42)
model.fit(X_train_selected,y_train)

In [12]:
y_pred = model.predict(X_test_selected)
mse = mean_squared_error(y_test,y_pred)
print("mean Squared Error:", mse)

mean Squared Error: 213.13058526024716


In [13]:
# user input
print("enter employee information for prediction:")
education_level = float(input("education level(1-4):"))
years_of_experience = float(input("years of experience:"))
training_hours = float(input("training hours:"))
communication_skills = float(input("communication skills(1-4):"))
problem_solving_skills = float(input("problem solving skills(1-4):"))

input_data = np.array([[education_level,years_of_experience,training_hours,communication_skills,problem_solving_skills]])
input_data_selected = selector.transform(input_data)
prediction = model.predict(input_data_selected)
print("Predicted performance is:", prediction[0])

enter employee information for prediction:
education level(1-4):2
years of experience:10
training hours:50
communication skills(1-4):3
problem solving skills(1-4):4
Predicted performance is: 68.87959964265852


