Here we will just create a basic model with the data we have.

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib as mpl
import seaborn as sns
import scipy as sc
import xgboost as xgb
import itertools

In [2]:
from xgboost import XGBClassifier

In [3]:
import warnings
warnings.filterwarnings('ignore')

In [4]:
data = pd.read_csv('web_dev_segmentation.csv')

In [5]:
data.head()

Unnamed: 0,customer,project,total_hours,discount%,weekday,hour,num_projects,Frontend%,Backend%,UI/UX%,DevOps%,Maintenance%,Testing%,Consulting%,Security%,class,labels
0,0,0,34.97,10.49,2,14,22.0,0.32,24.53,5.0,18.72,11.79,8.47,11.3,19.86,enterprise_clients,9
1,1,1,28.62,9.68,4,13,4.0,8.78,9.41,11.62,8.88,0.37,17.94,17.31,25.68,mixed_services,3
2,2,2,36.48,14.76,2,22,2.0,14.95,1.67,18.11,11.1,12.05,6.66,19.57,15.88,mixed_services,2
3,3,3,45.23,17.66,5,20,2.0,23.82,7.35,9.91,8.28,19.82,7.85,11.57,11.39,mixed_services,2
4,4,4,27.66,13.43,6,18,1.0,3.6,17.41,16.96,14.03,9.73,19.67,10.12,8.49,mixed_services,0


In [6]:
data.describe()

Unnamed: 0,customer,project,total_hours,discount%,weekday,hour,num_projects,Frontend%,Backend%,UI/UX%,DevOps%,Maintenance%,Testing%,Consulting%,Security%,labels
count,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0
mean,24999.5,24999.5,30.017719,10.051385,3.99626,15.51812,10.04024,12.514764,12.435788,12.531118,12.550743,12.490497,12.466345,12.48751,12.523213,4.5
std,14433.901067,14433.901067,9.939912,4.904215,1.998501,4.611817,9.674036,8.690937,8.655,8.702468,8.687924,8.681824,8.70675,8.678014,8.70543,2.87231
min,0.0,0.0,5.0,0.0,1.0,8.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,12499.75,12499.75,23.21,6.65,2.0,12.0,3.0,5.89,5.81,5.89,5.84,5.85,5.82,5.88,5.84,2.0
50%,24999.5,24999.5,30.02,10.02,4.0,16.0,7.0,12.06,11.99,12.07,12.13,12.11,12.05,12.08,12.1,4.5
75%,37499.25,37499.25,36.77,13.38,6.0,19.0,14.0,18.1725,18.02,18.15,18.28,18.14,18.09,18.08,18.17,7.0
max,49999.0,49999.0,74.79,25.0,7.0,23.0,50.0,71.01,100.0,73.07,76.48,93.83,73.28,80.48,68.14,9.0


In [7]:
y = data['labels'].values
X = data.drop(['weekday', 'hour','labels', 'class'], axis=1).values

In [8]:
from sklearn.model_selection import StratifiedKFold

In [9]:
skf = StratifiedKFold(n_splits = 5, shuffle=True, random_state=42)

In [10]:
xgbc = XGBClassifier(objective='multi:softprob', random_state=42)

In [11]:
cv_scores = []
for train_index, test_index in skf.split(X,y):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    xgbc.fit(X_train, y_train)
    score = xgbc.score(X_test, y_test)
    print(score)
    cv_scores.append(score)

0.9921
0.992
0.9912
0.9933
0.9914


That's not a bad score, let's try to improve it by tuning parameters.

In [12]:
from sklearn.model_selection import GridSearchCV

In [13]:
params = {
    'max_depth': [6, 7],
    'learning_rate': [0.05],
    'n_estimators': [500],
    'gamma': [0],
    'max_delta_step': [1],
    'subsample': [0.9, 0.8],
    'colsample_bytree': [1.0],
    'colsample_bylevel': [1.0],
    'min_child_weight': [1.0]
}

In [14]:
base_model = XGBClassifier(
    objective='multi:softprob',
    random_state=42,
    use_label_encoder=False,
    eval_metric='mlogloss',
    tree_method='hist'  # Recommended for most scenarios
)

In [15]:
grid_search_xgb = GridSearchCV(
    estimator=base_model,
    param_grid=params,
    cv=StratifiedKFold(n_splits=5, shuffle=True, random_state=42),
    n_jobs=-1,
    verbose=1
)

In [16]:
grid_search_xgb.fit(X, y)
print("Best parameters:", grid_search_xgb.best_params_)
print("Best cross-validation score:", grid_search_xgb.best_score_)


Fitting 5 folds for each of 4 candidates, totalling 20 fits
Best parameters: {'colsample_bylevel': 1.0, 'colsample_bytree': 1.0, 'gamma': 0, 'learning_rate': 0.05, 'max_delta_step': 1, 'max_depth': 6, 'min_child_weight': 1.0, 'n_estimators': 500, 'subsample': 0.9}
Best cross-validation score: 0.9924800000000001


In [17]:
results = pd.DataFrame(grid_search_xgb.cv_results_)
results = pd.concat([results, pd.DataFrame(grid_search_xgb.cv_results_)], axis=0)

In [18]:
results[results['mean_test_score']==results['mean_test_score'].max()].T

Unnamed: 0,0,0.1
mean_fit_time,81.554574,81.554574
std_fit_time,4.167276,4.167276
mean_score_time,2.058414,2.058414
std_score_time,0.140242,0.140242
param_colsample_bylevel,1.0,1.0
param_colsample_bytree,1.0,1.0
param_gamma,0,0
param_learning_rate,0.05,0.05
param_max_delta_step,1,1
param_max_depth,6,6


In [19]:
selected_xgbc = XGBClassifier(learning_rate=0.05, max_depth=6, n_estimators=500, subsample=0.9)

In [20]:
from sklearn.model_selection import train_test_split

In [21]:
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.1)

In [22]:
selected_xgbc.fit(X_train, y_train)

In [23]:
selected_xgbc.score(X_train, y_train)

1.0

In [24]:
selected_xgbc.score(X_test, y_test)

0.9942

Well, That's an improvement! This model can be further used to predict the classes of customers.