In [3]:
import pandas as pd
import numpy as np
import streamlit as st
from joblib import load
import dill

# Load the data from the CSV file
data = pd.read_csv('Employeee.csv')

# Print the data
data.head()

data.info()

data['LeaveOrNot']=data['LeaveOrNot'].map({1:'Yes',0:'No'})

# Splitting features and target variable
X = data.drop(columns=['LeaveOrNot'])
y = data['LeaveOrNot']

categorical_features = X.select_dtypes(include=[object]).columns

categorical_features= list(categorical_features.difference(['LeaveOrNot'])) 

print('\n','Categorical Features','\n', categorical_features,'\n')

numerical_features = list(X.select_dtypes(include=[np.float64,np.int64]).columns.difference(['EverBenched']))

print('\n','Numerical Features','\n', numerical_features,'\n')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4653 entries, 0 to 4652
Data columns (total 9 columns):
 #   Column                     Non-Null Count  Dtype 
---  ------                     --------------  ----- 
 0   Education                  4653 non-null   object
 1   JoiningYear                4653 non-null   int64 
 2   City                       4653 non-null   object
 3   PaymentTier                4653 non-null   int64 
 4   Age                        4653 non-null   int64 
 5   Gender                     4653 non-null   object
 6   EverBenched                4653 non-null   object
 7   ExperienceInCurrentDomain  4653 non-null   int64 
 8   LeaveOrNot                 4653 non-null   int64 
dtypes: int64(5), object(4)
memory usage: 327.3+ KB

 Categorical Features 
 ['City', 'Education', 'EverBenched', 'Gender'] 


 Numerical Features 
 ['Age', 'ExperienceInCurrentDomain', 'JoiningYear', 'PaymentTier'] 



In [4]:
from sklearn.model_selection import train_test_split

# Splitting data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print('Train Data','\n',y_train.value_counts(normalize=True),'\n','\n','Test Data','\n', y_test.value_counts(normalize=True))

Train Data 
 LeaveOrNot
No     0.656368
Yes    0.343632
Name: proportion, dtype: float64 
 
 Test Data 
 LeaveOrNot
No     0.655209
Yes    0.344791
Name: proportion, dtype: float64


In [5]:
def summarize_cat(data,categorical_features):
  results=[]

  for column in data[categorical_features]:
      # Get the unique members of the column
      members = data[column].unique().tolist()
      # Append the column name and its unique members to the results list
      results.append([column, members])

  return pd.DataFrame(results, columns=['Column Name', 'Members'])

In [6]:
# Create a DataFrame from the results list
summarize_cat(X_train,categorical_features)

Unnamed: 0,Column Name,Members
0,City,"[New Delhi, Bangalore, Pune]"
1,Education,"[Masters, Bachelors, PHD]"
2,EverBenched,"[No, Yes]"
3,Gender,"[Male, Female]"


In [7]:
# EXPORTING FOR DE

my_feature_dict = {'CATEGORICAL' : summarize_cat(data,categorical_features).to_dict(), 'NUMERICAL' : {'Column Name': numerical_features}}

my_feature_dict.get('NUMERICAL')

{'Column Name': ['Age',
  'ExperienceInCurrentDomain',
  'JoiningYear',
  'PaymentTier']}

In [8]:
import pickle

# save dictionary to person_data.pkl file
with open('my_feature_dict.pkl', 'wb') as fp:
    pickle.dump(my_feature_dict, fp)
    print('dictionary saved successfully to file')

dictionary saved successfully to file


In [9]:
from sklearn.pipeline import Pipeline

# PREPROCESSING TRANSFORMATIONS ARE DONE ON EXAMPLE BASIS
# REAL WORLD SELECTION OF PREPROCSSING TRANSFORMATIONS MUST BE LOGICAL

transform_EverBenched = lambda x: x.assign(EverBenched=x['EverBenched'].map({1: 'Yes', 0: 'No'}))

from sklearn.preprocessing import FunctionTransformer

preprocessor_stage_1 = Pipeline(steps=[
    ('transform_sc', FunctionTransformer(transform_EverBenched)),
])

from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer

pipeline_num = Pipeline(steps=[
    ('scale_data', StandardScaler()),
    ('simple_imputer1', SimpleImputer(strategy='constant',fill_value=0)),
])

from sklearn.preprocessing import OneHotEncoder

pipeline_cat = Pipeline(steps=[
    ('OneHotEncode', OneHotEncoder(handle_unknown="ignore"))
])

from sklearn.compose import ColumnTransformer

preprocessor_stage_2 = ColumnTransformer(
    transformers=[
        ('cat', pipeline_cat, categorical_features),  # Categorical columns
        ('num', pipeline_num, numerical_features),     # Numerical columns
    ],remainder='drop')

preprocessor_stack = Pipeline(steps=[
    ('preprocessor_stage_1', preprocessor_stage_1),
    ('preprocessor_stage_2', preprocessor_stage_2)
])

In [10]:
preprocessor_stack

In [11]:
X_train.EverBenched.value_counts()

EverBenched
No     3349
Yes     373
Name: count, dtype: int64

In [12]:
preprocessor_stage_1.fit_transform(X_train).EverBenched.value_counts()

Series([], Name: count, dtype: int64)

In [13]:
preprocessor_stack.fit(X_train)

In [14]:
pd.DataFrame(preprocessor_stack.transform(X_train),columns=preprocessor_stack[-1].get_feature_names_out())

Unnamed: 0,cat__City_Bangalore,cat__City_New Delhi,cat__City_Pune,cat__Education_Bachelors,cat__Education_Masters,cat__Education_PHD,cat__EverBenched_nan,cat__Gender_Female,cat__Gender_Male,num__Age,num__ExperienceInCurrentDomain,num__JoiningYear,num__PaymentTier
0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.117526,-1.861550,-1.109740,0.539424
1,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,-0.914641,0.061305,-1.645675,0.539424
2,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,-0.088907,-0.579646,1.034001,-1.226396
3,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,-1.121074,-0.579646,-1.645675,0.539424
4,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,1.149693,-0.579646,1.034001,-1.226396
...,...,...,...,...,...,...,...,...,...,...,...,...,...
3717,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.530393,-1.220598,-0.573805,0.539424
3718,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,-0.708208,0.702257,-1.109740,0.539424
3719,1.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,1.975427,-1.220598,0.498065,0.539424
3720,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,2.388294,-1.220598,-0.037870,0.539424


In [15]:
from sklearn.ensemble import RandomForestClassifier

pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor_stack),
    ('classifier', RandomForestClassifier())
])

# Fit the pipeline on the training data
pipeline.fit(X_train, y_train)

In [16]:
# Checking Training Accuracy
y_train_pred = pipeline.predict(X_train)

# Evaluate the model
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

print("Accuracy:", accuracy_score(y_train,y_train_pred))
print("\nClassification Report:\n", classification_report(y_train,y_train_pred))
print("\nConfusion Matrix:\n", confusion_matrix(y_train,y_train_pred))

Accuracy: 0.9202041912950026

Classification Report:
               precision    recall  f1-score   support

          No       0.91      0.97      0.94      2443
         Yes       0.94      0.82      0.88      1279

    accuracy                           0.92      3722
   macro avg       0.92      0.90      0.91      3722
weighted avg       0.92      0.92      0.92      3722


Confusion Matrix:
 [[2371   72]
 [ 225 1054]]


In [17]:
# CREATING A TEST

my_pred_array=X_test.iloc[15:16:]

my_pred_array

Unnamed: 0,Education,JoiningYear,City,PaymentTier,Age,Gender,EverBenched,ExperienceInCurrentDomain
1128,Bachelors,2015,Pune,2,26,Female,No,4


In [18]:
pd.DataFrame(preprocessor_stack.transform(my_pred_array),columns=preprocessor_stack[1].get_feature_names_out())

Unnamed: 0,cat__City_Bangalore,cat__City_New Delhi,cat__City_Pune,cat__Education_Bachelors,cat__Education_Masters,cat__Education_PHD,cat__EverBenched_nan,cat__Gender_Female,cat__Gender_Male,num__Age,num__ExperienceInCurrentDomain,num__JoiningYear,num__PaymentTier
0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,-0.708208,0.702257,-0.03787,-1.226396


In [19]:
my_pred_array.to_json()

'{"Education":{"1128":"Bachelors"},"JoiningYear":{"1128":2015},"City":{"1128":"Pune"},"PaymentTier":{"1128":2},"Age":{"1128":26},"Gender":{"1128":"Female"},"EverBenched":{"1128":"No"},"ExperienceInCurrentDomain":{"1128":4}}'

In [20]:
# USING PIPELINE TO DO ALL TOGHETHER (PREPROCESSING FOLLOWED BY MODEL PREDICT)

# SINGLE PREDICTION

y_pred = pipeline.predict(my_pred_array)

y_pred

array(['Yes'], dtype=object)

In [21]:
# USING PIPELINE TO DO ALL TOGHETHER (PREPROCESSING FOLLOWED BY MODEL PREDICT)

# MULTIPLE PREDICTION
y_test_pred = pipeline.predict(X_test)

# EVALUATE MODEL FOR TEST ACCURACY SINCE WE HAVE TEST SET
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

print("Accuracy:", accuracy_score(y_test, y_test_pred))
print("\nClassification Report:\n", classification_report(y_test, y_test_pred))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_test_pred))

Accuracy: 0.841031149301826

Classification Report:
               precision    recall  f1-score   support

          No       0.86      0.91      0.88       610
         Yes       0.81      0.71      0.75       321

    accuracy                           0.84       931
   macro avg       0.83      0.81      0.82       931
weighted avg       0.84      0.84      0.84       931


Confusion Matrix:
 [[556  54]
 [ 94 227]]


In [22]:
!pip install dill

Defaulting to user installation because normal site-packages is not writeable


In [23]:
import dill

# save trained pipeline file

with open('pipeline.pkl', 'wb') as file:
    dill.dump(pipeline, file)

print('pipeline saved successfully to file')

pipeline saved successfully to file


In [25]:
#Load the saved pipeline from the file

with open('pipeline.pkl', 'rb') as file:
    loaded_pipeline = dill.load(file)

print('pipeline loaded successfully to file')

pipeline loaded successfully to file


In [26]:
loaded_pipeline.__getstate__()

{'steps': [('preprocessor',
   Pipeline(steps=[('preprocessor_stage_1',
                    Pipeline(steps=[('transform_sc',
                                     FunctionTransformer(func=<function <lambda> at 0x000002092B121C60>))])),
                   ('preprocessor_stage_2',
                    ColumnTransformer(transformers=[('cat',
                                                     Pipeline(steps=[('OneHotEncode',
                                                                      OneHotEncoder(handle_unknown='ignore'))]),
                                                     ['City', 'Education',
                                                      'EverBenched', 'Gender']),
                                                    ('num',
                                                     Pipeline(steps=[('scale_data',
                                                                      StandardScaler()),
                                                                     ('si

In [27]:
y_pred = loaded_pipeline.predict(my_pred_array)

y_pred

array(['Yes'], dtype=object)