In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LinearRegression

# Load the raw employee records from a CSV file resolved relative to the
# notebook's working directory.
# NOTE(review): the file name is spelled 'employeee.csv' (three e's) - confirm
# that this matches the file on disk.
df = pd.read_csv('employeee.csv')

In [2]:
# Columns that hold string categories and must be turned into integer codes.
categorical_columns = ['education', 'city', 'gender', 'everbenched', 'department']

# Dropping 'employeeid' as it is not relevant for prediction.
# The result is a NEW frame (`df_model`); `df` itself is left untouched so this
# cell can be re-run safely. Mutating `df` in place made a second run fail with
# a KeyError on the already-removed column, and would have re-fitted the
# encoders on integer codes instead of the original strings.
df_model = df.drop(columns='employeeid')

# Encoding categorical variables: one LabelEncoder per column, kept in
# `label_encoders` so the same string -> integer mapping can be applied to
# unseen rows later on.
# NOTE(review): LabelEncoder assigns arbitrary integer codes, which a linear
# model interprets as an ordering. scikit-learn documents it as a target
# encoder; OrdinalEncoder/OneHotEncoder are the feature-side tools. It is kept
# here because later cells call `label_encoders[column].transform(...)`.
label_encoders = {}
for column in categorical_columns:
    label_encoders[column] = LabelEncoder()
    df_model[column] = label_encoders[column].fit_transform(df_model[column])

# Splitting the data into features and target.
X = df_model.drop(columns='offtime')

# The target is cast to int explicitly so every estimator below receives the
# same integer-typed labels.
y = df_model['offtime'].astype(int)

# Splitting the dataset into training and test sets (80/20, fixed seed so the
# partition is reproducible).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [3]:
# Random-forest regressor trained on the training split; the fixed seed keeps
# the fitted ensemble reproducible between runs.
model = RandomForestRegressor(random_state=42)
model.fit(X=X_train, y=y_train)


In [4]:
# Ordinary least-squares linear regression with default settings.
model2 = LinearRegression()

In [5]:
# Fit the linear regression on the training split.
model2.fit(X=X_train, y=y_train)

In [6]:
# Score the linear regression on the held-out test split.
y_pred = model2.predict(X_test)

# RMSE is the square root of the mean squared error, which puts the error
# back into the units of the target.
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
rmse = mse ** 0.5

print(f'Root Mean Squared Error: {rmse}')

Root Mean Squared Error: 3.833299787859581


In [7]:
from sklearn.tree import DecisionTreeClassifier
# Seeded decision-tree classifier instance.
# NOTE(review): `clf` is not referenced by any later cell visible in this
# notebook (a separate estimator, `model3`, is the one fitted and evaluated);
# confirm whether it is still needed.
clf = DecisionTreeClassifier(random_state=0)

In [8]:
# Decision-tree classifier that treats each distinct integer target value as
# its own class. The seed is fixed because scikit-learn randomly permutes the
# features at every split, so an unseeded tree (and the error reported for it)
# can change from run to run; 42 matches the seed used elsewhere in this
# notebook.
model3 = DecisionTreeClassifier(random_state=42)

In [9]:
# Fit the decision tree on the training split.
model3.fit(X=X_train, y=y_train)

In [10]:
# Score the decision tree on the held-out test split. `y_pred` is overwritten
# here, replacing the predictions produced earlier by the linear model.
y_pred = model3.predict(X_test)

# Same metric as for the linear model: RMSE = sqrt(MSE), computed on the
# predicted labels versus the true integer targets.
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
rmse = mse ** 0.5

print(f'Root Mean Squared Error: {rmse}')

Root Mean Squared Error: 5.332449353881183


In [11]:
# Using the model to predict off time for new data.
# A single row of raw (not yet encoded) feature values.
new_data = pd.DataFrame({
    'education': ['Masters'],
    'joiningyear': [2015],
    'city': ['Lalpur'],
    'paymenttier': [2],
    'age': [31],
    'gender': ['Female'],
    'everbenched': ['No'],
    'experienceincurrentdomain': [6],
    'department': ['IT']
})

# Encoding the new data with the encoders fitted during preprocessing, so the
# string -> integer mapping is the one the model was trained on. `transform`
# raises ValueError for a category the encoder has never seen.
for column in ['education', 'city', 'gender', 'everbenched', 'department']:
    new_data[column] = label_encoders[column].transform(new_data[column])

# Put the columns in exactly the order used for training. The dict above is
# written by hand, so without this step a different order could feed values
# into the wrong coefficients (or make `predict` reject the frame). A training
# column missing from `new_data` surfaces here as a KeyError.
new_data = new_data[list(X_train.columns)]

# Predicting the off time with the linear regression (`model2`).
new_prediction = model2.predict(new_data)
print(f'Predicted off time: {new_prediction[0]}')

Predicted off time: 5.942911142560497


In [12]:
# Sanity check on the target: its dtype, then the distinct values it takes,
# each printed on its own line.
print(y.dtypes, y.unique(), sep='\n')


int64
[ 7  2 11  8  4  9 12 10  6  0  5  3  1]


In [13]:
import joblib

# Persist the fitted LinearRegression estimator (`model2`) to disk with joblib.
# Note that this saves `model2`, not the RandomForestRegressor held in `model`.
# The file is written relative to the current working directory.
model_file = 'linear_reg_model.pkl'
joblib.dump(model2, model_file)
print(f'Model saved as {model_file}')


Model saved as linear_reg_model.pkl
