In [19]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from xgboost import XGBClassifier  

In [20]:
import pandas as pd

# Load the CSV file using the full path
file_path = r"C:\Users\Mbete\Downloads\145434_project2\dataset\WA_Fn-UseC_-HR-Employee-Attrition.csv"
df = pd.read_csv(file_path)

# Display the first few rows of the dataframe
df.head()

Unnamed: 0,Age,Attrition,BusinessTravel,DailyRate,Department,DistanceFromHome,Education,EducationField,EmployeeCount,EmployeeNumber,...,RelationshipSatisfaction,StandardHours,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
0,41,Yes,Travel_Rarely,1102,Sales,1,2,Life Sciences,1,1,...,1,80,0,8,0,1,6,4,0,5
1,49,No,Travel_Frequently,279,Research & Development,8,1,Life Sciences,1,2,...,4,80,1,10,3,3,10,7,1,7
2,37,Yes,Travel_Rarely,1373,Research & Development,2,2,Other,1,4,...,2,80,0,7,3,3,0,0,0,0
3,33,No,Travel_Frequently,1392,Research & Development,3,4,Life Sciences,1,5,...,3,80,0,8,3,3,8,7,3,0
4,27,No,Travel_Rarely,591,Research & Development,2,1,Medical,1,7,...,4,80,1,6,3,3,2,2,2,2


In [21]:
# Display basic information about the dataset
print("First few rows of the dataset:")
print(df.head())
print("\nDataset summary:")
print(df.info())


First few rows of the dataset:
   Age Attrition     BusinessTravel  DailyRate              Department  \
0   41       Yes      Travel_Rarely       1102                   Sales   
1   49        No  Travel_Frequently        279  Research & Development   
2   37       Yes      Travel_Rarely       1373  Research & Development   
3   33        No  Travel_Frequently       1392  Research & Development   
4   27        No      Travel_Rarely        591  Research & Development   

   DistanceFromHome  Education EducationField  EmployeeCount  EmployeeNumber  \
0                 1          2  Life Sciences              1               1   
1                 8          1  Life Sciences              1               2   
2                 2          2          Other              1               4   
3                 3          4  Life Sciences              1               5   
4                 2          1        Medical              1               7   

   ...  RelationshipSatisfaction StandardHo

In [22]:
# Separate numeric and categorical columns
numeric_cols = df.select_dtypes(include=['float64', 'int64']).columns
categorical_cols = df.select_dtypes(include=['object']).columns

# Fill missing values
df[numeric_cols] = df[numeric_cols].fillna(df[numeric_cols].median())
df[categorical_cols] = df[categorical_cols].fillna(df[categorical_cols].mode().iloc[0])

# Encode categorical variables
label_encoders = {}
for column in categorical_cols:
    label_encoders[column] = LabelEncoder()
    df[column] = label_encoders[column].fit_transform(df[column])

# Check if the missing values are handled
print("Missing values after filling:")
print(df.isnull().sum())


Missing values after filling:
Age                         0
Attrition                   0
BusinessTravel              0
DailyRate                   0
Department                  0
DistanceFromHome            0
Education                   0
EducationField              0
EmployeeCount               0
EmployeeNumber              0
EnvironmentSatisfaction     0
Gender                      0
HourlyRate                  0
JobInvolvement              0
JobLevel                    0
JobRole                     0
JobSatisfaction             0
MaritalStatus               0
MonthlyIncome               0
MonthlyRate                 0
NumCompaniesWorked          0
Over18                      0
OverTime                    0
PercentSalaryHike           0
PerformanceRating           0
RelationshipSatisfaction    0
StandardHours               0
StockOptionLevel            0
TotalWorkingYears           0
TrainingTimesLastYear       0
WorkLifeBalance             0
YearsAtCompany              0
YearsInCur

In [23]:
# Define features (X) and target (y)
target_column = 'Attrition'  # Update if needed
if target_column not in df.columns:
    print(f"Error: Target column '{target_column}' not found in dataset.")
else:
    X = df.drop(target_column, axis=1)
    y = df[target_column]
    print("Features and target variable defined successfully.")

Features and target variable defined successfully.


In [24]:
# Split dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
print("Training and testing sets created.")

Training and testing sets created.


In [25]:
# Standardize features for models that benefit from scaling
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)
print("Feature scaling applied.")

Feature scaling applied.


In [26]:
# Train and evaluate Decision Tree model
dt_model = DecisionTreeClassifier()
dt_model.fit(X_train, y_train)
y_pred_dt = dt_model.predict(X_test)

print("Decision Tree Classifier Results:")
print("Accuracy:", accuracy_score(y_test, y_pred_dt))
print(classification_report(y_test, y_pred_dt))

# Train and evaluate SVM model
svm_model = SVC()
svm_model.fit(X_train, y_train)
y_pred_svm = svm_model.predict(X_test)

print("SVM Classifier Results:")
print("Accuracy:", accuracy_score(y_test, y_pred_svm))
print(classification_report(y_test, y_pred_svm))

# Train and evaluate MLP model
mlp_model = MLPClassifier(hidden_layer_sizes=(100,), max_iter=300)
mlp_model.fit(X_train, y_train)
y_pred_mlp = mlp_model.predict(X_test)

print("Neural Network Classifier Results:")
print("Accuracy:", accuracy_score(y_test, y_pred_mlp))
print(classification_report(y_test, y_pred_mlp))

# Train and evaluate XGBoost model
xgb_model = XGBClassifier(use_label_encoder=False, eval_metric='logloss')
xgb_model.fit(X_train, y_train)
y_pred_xgb = xgb_model.predict(X_test)

print("XGBoost Classifier Results:")
print("Accuracy:", accuracy_score(y_test, y_pred_xgb))
print(classification_report(y_test, y_pred_xgb))



Decision Tree Classifier Results:
Accuracy: 0.7721088435374149
              precision    recall  f1-score   support

           0       0.87      0.86      0.87       255
           1       0.17      0.18      0.17        39

    accuracy                           0.77       294
   macro avg       0.52      0.52      0.52       294
weighted avg       0.78      0.77      0.78       294

SVM Classifier Results:
Accuracy: 0.8843537414965986
              precision    recall  f1-score   support

           0       0.88      1.00      0.94       255
           1       1.00      0.13      0.23        39

    accuracy                           0.88       294
   macro avg       0.94      0.56      0.58       294
weighted avg       0.90      0.88      0.84       294

Neural Network Classifier Results:
Accuracy: 0.8605442176870748
              precision    recall  f1-score   support

           0       0.91      0.93      0.92       255
           1       0.47      0.41      0.44        39

  

Parameters: { "use_label_encoder" } are not used.



In [27]:
# Cross-validation for each model
print("\nCross-Validation Scores (5-Fold):")

cv_scores_dt = cross_val_score(dt_model, X_train, y_train, cv=5)
print("Decision Tree:", cv_scores_dt, "Mean:", cv_scores_dt.mean())

cv_scores_svm = cross_val_score(svm_model, X_train, y_train, cv=5)
print("SVM:", cv_scores_svm, "Mean:", cv_scores_svm.mean())

cv_scores_mlp = cross_val_score(mlp_model, X_train, y_train, cv=5)
print("Neural Network:", cv_scores_mlp, "Mean:", cv_scores_mlp.mean())

cv_scores_xgb = cross_val_score(xgb_model, X_train, y_train, cv=5)
print("XGBoost:", cv_scores_xgb, "Mean:", cv_scores_xgb.mean())


Cross-Validation Scores (5-Fold):
Decision Tree: [0.79661017 0.8        0.78297872 0.81276596 0.7787234 ] Mean: 0.7942156509195817
SVM: [0.84322034 0.85957447 0.84255319 0.85106383 0.85106383] Mean: 0.8494951316263973


Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.



Neural Network: [0.81779661 0.84255319 0.82978723 0.80851064 0.84255319] Mean: 0.828240173097728
XGBoost: [0.86016949 0.86808511 0.85531915 0.83404255 0.86382979] Mean: 0.856289217454021


Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.



In [28]:
# Decision Tree Hyperparameter Tuning
dt_params = {
    'max_depth': [3, 5, 10, None],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
}
grid_dt = GridSearchCV(DecisionTreeClassifier(), dt_params, cv=5)
grid_dt.fit(X_train, y_train)
print("Best Params for Decision Tree:", grid_dt.best_params_)
print("Best Score:", grid_dt.best_score_)

# SVM Hyperparameter Tuning
svm_params = {
    'C': [0.1, 1, 10],
    'kernel': ['linear', 'rbf'],
    'gamma': ['scale', 'auto']
}
grid_svm = GridSearchCV(SVC(), svm_params, cv=5)
grid_svm.fit(X_train, y_train)
print("Best Params for SVM:", grid_svm.best_params_)
print("Best Score:", grid_svm.best_score_)

# XGBoost Hyperparameter Tuning
xgb_params = {
    'learning_rate': [0.01, 0.1, 0.2],
    'max_depth': [3, 5, 7],
    'n_estimators': [50, 100, 150]
}
grid_xgb = GridSearchCV(XGBClassifier(use_label_encoder=False, eval_metric='logloss'), xgb_params, cv=5)
grid_xgb.fit(X_train, y_train)
print("Best Params for XGBoost:", grid_xgb.best_params_)
print("Best Score:", grid_xgb.best_score_)


Best Params for Decision Tree: {'max_depth': 3, 'min_samples_leaf': 2, 'min_samples_split': 2}
Best Score: 0.8384204832311575
Best Params for SVM: {'C': 1, 'gamma': 'scale', 'kernel': 'linear'}
Best Score: 0.8580021637216012


Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encode

Best Params for XGBoost: {'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 100}
Best Score: 0.8588496213487199


In [29]:
import joblib

best_xgb_model = grid_xgb.best_estimator_
joblib.dump(best_xgb_model, 'best_xgb_model.pkl')


['best_xgb_model.pkl']

In [1]:
from flask import Flask, request, jsonify
import joblib
import numpy as np

# Load the XGBoost model
model = joblib.load('best_xgb_model.pkl')

app = Flask(__name__)

# API endpoint for predicting employee attrition
@app.route('/predict', methods=['POST'])
def predict():
    try:
        data = request.get_json(force=True)
        features = data.get('features')
        
        # Validate that 'features' is provided and is a list
        if features is None or not isinstance(features, list):
            return jsonify({'error': 'Invalid input. Expected "features" key with a list of values.'}), 400

        # Perform prediction
        prediction = model.predict([features])
        return jsonify({'prediction': int(prediction[0])})
    
    except Exception as e:
        return jsonify({'error': str(e)}), 500

if __name__ == '__main__':
    app.run(port=5000, debug=True)

ModuleNotFoundError: No module named 'xgboost'

In [None]:
curl -X POST http://127.0.0.1:5000/predict -H "Content-Type: application/json" -d '{"features": [0, 5, 1, 40, 20000, 5]}'