In [1]:
import pandas as pd
import pyodbc

# Connection details — update as needed
server = 'DESKTOP-I9NMGNC'       # e.g. 'localhost\\SQLEXPRESS'
database = 'DW_Finance'
username = 'amin'
password = 'amin'

# Create the connection string
conn_str = (
    f'DRIVER={{ODBC Driver 17 for SQL Server}};'
    f'SERVER={server};'
    f'DATABASE={database};'
    f'UID={username};'
    f'PWD={password}'
)

# Establish the connection
conn = pyodbc.connect(conn_str)

# Define the SQL query
sql_query = """
SELECT 
    f.Fk_Produit,
    f.Fk_Invoices,
    f.Fk_Supplier,
    f.Fk_Date,
    f.Fk_Geographie,
    f.Fk_InvoiceDate,
    f.Fk_DueDate,
    f.Fk_PaymentDate,
    f.Price,
    f.Amount,
    f.DiscountOffered,
    f.RecommendedProfitMargin,
    d.PaymentDate,  -- from Dim_Supplier
    d.DueDate       -- from Dim_Supplier
FROM 
    DW_Finance.dbo.Fact_Purchase f
JOIN 
    DW_Finance.dbo.Dim_Supplier d
    ON f.Fk_Supplier = d.Pk_Supplier
"""

# Run the query and load into DataFrame
df = pd.read_sql(sql_query, conn)

# Show the first few rows
print(df.head())

# Close the connection
conn.close()


   Fk_Produit  Fk_Invoices  Fk_Supplier  Fk_Date  Fk_Geographie  \
0         996         7256          243     8776             78   
1         760         3624          183    10456             18   
2         771         6001          189     8929             24   
3         871          552          219     9008             54   
4         981          638          240     9721             75   

   Fk_InvoiceDate  Fk_DueDate  Fk_PaymentDate  Price   Amount  \
0            8474        9291            8776    3.0  4133.94   
1            9166        8729           10456    2.5  3941.63   
2            8674        8775            8929    1.8  3081.99   
3            9628        9331            9008   22.0   820.28   
4            9230        9312            9721    7.0  1836.40   

   DiscountOffered  RecommendedProfitMargin PaymentDate     DueDate  
0           488.75                     45.0  2021-01-11  2022-06-10  
1            88.94                     40.0  2025-08-18  2020-11-2

  df = pd.read_sql(sql_query, conn)


In [2]:
# Create the LatePayment column: 1 if payment is late, 0 if on time
df['LatePayment'] = (df['PaymentDate'] > df['DueDate']).astype(int)

# Display the updated data with the new column
print(df[['PaymentDate', 'DueDate', 'LatePayment']].head())


  PaymentDate     DueDate  LatePayment
0  2021-01-11  2022-06-10            0
1  2025-08-18  2020-11-25            1
2  2021-06-13  2021-01-10            1
3  2021-08-31  2022-07-20            0
4  2023-08-14  2022-07-01            1


In [7]:

# Define features (X,y)
X = df[['Price', 'Amount', 'DiscountOffered', 'RecommendedProfitMargin']]
y = df['LatePayment']


# Inspect the features
print(X.head())


   Price   Amount  DiscountOffered  RecommendedProfitMargin
0    3.0  4133.94           488.75                     45.0
1    2.5  3941.63            88.94                     40.0
2    1.8  3081.99           144.04                     45.0
3   22.0   820.28            71.79                     45.0
4    7.0  1836.40           170.50                     45.0


In [8]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)


In [13]:
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# Splitting the dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Initialize the XGBoost model
model = xgb.XGBClassifier(random_state=42)

# Train the model
model.fit(X_train, y_train)

# Predictions
y_pred = model.predict(X_test)

# Evaluation
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))
print("Accuracy:", accuracy_score(y_test, y_pred))


Confusion Matrix:
 [[30 16]
 [14 56]]

Classification Report:
               precision    recall  f1-score   support

           0       0.68      0.65      0.67        46
           1       0.78      0.80      0.79        70

    accuracy                           0.74       116
   macro avg       0.73      0.73      0.73       116
weighted avg       0.74      0.74      0.74       116

Accuracy: 0.7413793103448276


In [20]:
import xgboost as xgb
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# Splitting the dataset
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Class imbalance handling
scale_pos_weight = 230 / 348  # ≈ 0.66

# Simplified parameter grid
param_grid = {
    'n_estimators': [100, 300, 500],
    'max_depth': [3, 6, 9],
    'learning_rate': [0.01, 0.1, 0.2],
    'subsample': [0.7, 1.0],
}

# Base model with scale_pos_weight
xgb_model = xgb.XGBClassifier(
    random_state=42,
    use_label_encoder=False,
    eval_metric='logloss',
    scale_pos_weight=scale_pos_weight
)

# RandomizedSearchCV
random_search = RandomizedSearchCV(
    estimator=xgb_model,
    param_distributions=param_grid,
    n_iter=20,
    scoring='accuracy',
    cv=5,
    verbose=1,
    random_state=42,
    n_jobs=-1
)

# Fit search
random_search.fit(X_train, y_train)

# Best model
best_model = random_search.best_estimator_

# Predict
y_pred = best_model.predict(X_test)

# Evaluation
print("Best Parameters:", random_search.best_params_)
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))
print("Accuracy:", accuracy_score(y_test, y_pred))


Fitting 5 folds for each of 20 candidates, totalling 100 fits
Best Parameters: {'subsample': 1.0, 'n_estimators': 100, 'max_depth': 9, 'learning_rate': 0.2}

Confusion Matrix:
 [[32 14]
 [16 54]]

Classification Report:
               precision    recall  f1-score   support

           0       0.67      0.70      0.68        46
           1       0.79      0.77      0.78        70

    accuracy                           0.74       116
   macro avg       0.73      0.73      0.73       116
weighted avg       0.74      0.74      0.74       116

Accuracy: 0.7413793103448276


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


In [47]:
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# Splitting the dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Prepare the data in DMatrix format (required by xgb.train)
dtrain = xgb.DMatrix(X_train, label=y_train)
dtest = xgb.DMatrix(X_test, label=y_test)

# Define the parameter grid
params = {
    'max_depth': 6,
    'learning_rate': 0.1,
    'n_estimators': 300,
    'subsample': 1.0,
    'scale_pos_weight': 230 / 348,
    'eval_metric': 'logloss'
}

# Define evaluation set for early stopping
evals = [(dtrain, 'train'), (dtest, 'test')]

# Train the model using xgb.train (with early stopping)
model1 = xgb.train(
    params=params,
    dtrain=dtrain,
    num_boost_round=300,
    evals=evals,
    early_stopping_rounds=10,
    verbose_eval=False  # Disable log output
)

# Get the best iteration (early stopping should help stop training early)
best_iteration = model.best_iteration

# Predict using the best iteration with iteration_range
y_pred = model1.predict(dtest, iteration_range=(0, best_iteration))
y_pred = (y_pred > 0.5).astype(int)  # Assuming binary classification

# Evaluation metrics
conf_matrix = confusion_matrix(y_test, y_pred)
class_report = classification_report(y_test, y_pred)
accuracy = accuracy_score(y_test, y_pred)

# Print the evaluation metrics
print("Confusion Matrix:\n", conf_matrix)
print("\nClassification Report:\n", class_report)
print("Accuracy:", accuracy)


Confusion Matrix:
 [[39  7]
 [20 50]]

Classification Report:
               precision    recall  f1-score   support

           0       0.66      0.85      0.74        46
           1       0.88      0.71      0.79        70

    accuracy                           0.77       116
   macro avg       0.77      0.78      0.77       116
weighted avg       0.79      0.77      0.77       116

Accuracy: 0.7672413793103449


Parameters: { "n_estimators" } are not used.

  self.starting_round = model.num_boosted_rounds()


In [48]:
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# Splitting the dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Prepare the data in DMatrix format (required by xgb.train)
dtrain = xgb.DMatrix(X_train, label=y_train)
dtest = xgb.DMatrix(X_test, label=y_test)

# Define the parameter grid for different learning rates
learning_rates = [0.01, 0.05, 0.1, 0.2]

# Store results
results = {}

# Train and evaluate the model for each learning rate
for lr in learning_rates:
    # Define the parameter grid
    params = {
        'max_depth': 6,
        'learning_rate': lr,
        'subsample': 1.0,
        'scale_pos_weight': 230 / 348,
        'eval_metric': 'logloss'
    }

    # Define evaluation set for early stopping
    evals = [(dtrain, 'train'), (dtest, 'test')]

    # Train the model using xgb.train (with early stopping)
    model = xgb.train(
        params=params,
        dtrain=dtrain,
        num_boost_round=300,
        evals=evals,
        early_stopping_rounds=10,
        verbose_eval=False  # Disable log output
    )

    # Get the best iteration (early stopping should help stop training early)
    best_iteration = model.best_iteration

    # Predict using the best iteration with iteration_range
    y_pred = model.predict(dtest, iteration_range=(0, best_iteration))
    y_pred = (y_pred > 0.5).astype(int)  # Assuming binary classification

    # Evaluate the model
    conf_matrix = confusion_matrix(y_test, y_pred)
    class_report = classification_report(y_test, y_pred)
    accuracy = accuracy_score(y_test, y_pred)

    # Store results for this learning rate
    results[lr] = {
        'Confusion Matrix': conf_matrix,
        'Classification Report': class_report,
        'Accuracy': accuracy
    }

# Print results for all learning rates
for lr, result in results.items():
    print(f"\nLearning Rate: {lr}")
    print("Confusion Matrix:\n", result['Confusion Matrix'])
    print("\nClassification Report:\n", result['Classification Report'])
    print("Accuracy:", result['Accuracy'])



Learning Rate: 0.01
Confusion Matrix:
 [[40  6]
 [17 53]]

Classification Report:
               precision    recall  f1-score   support

           0       0.70      0.87      0.78        46
           1       0.90      0.76      0.82        70

    accuracy                           0.80       116
   macro avg       0.80      0.81      0.80       116
weighted avg       0.82      0.80      0.80       116

Accuracy: 0.8017241379310345

Learning Rate: 0.05
Confusion Matrix:
 [[40  6]
 [18 52]]

Classification Report:
               precision    recall  f1-score   support

           0       0.69      0.87      0.77        46
           1       0.90      0.74      0.81        70

    accuracy                           0.79       116
   macro avg       0.79      0.81      0.79       116
weighted avg       0.81      0.79      0.80       116

Accuracy: 0.7931034482758621

Learning Rate: 0.1
Confusion Matrix:
 [[35 11]
 [16 54]]

Classification Report:
               precision    recall  f1

In [56]:
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# Splitting the dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Prepare the data in DMatrix format (required by xgb.train)
dtrain = xgb.DMatrix(X_train, label=y_train)
dtest = xgb.DMatrix(X_test, label=y_test)

# Define the learning rate
learning_rate = 0.01

# Define the parameter grid
params = {
    'max_depth': 6,
    'learning_rate': learning_rate,
    'subsample': 1.0,
    'scale_pos_weight': 230 / 348,
    'eval_metric': 'logloss'
}

# Define evaluation set for early stopping
evals = [(dtrain, 'train'), (dtest, 'test')]

# Train the model using xgb.train (with early stopping)
final_model = xgb.train(
    params=params,
    dtrain=dtrain,
    num_boost_round=300,
    evals=evals,
    early_stopping_rounds=10,
    verbose_eval=False  # Disable log output
)

# Get the best iteration (early stopping should help stop training early)
best_iteration = model.best_iteration

# Predict using the best iteration with iteration_range
y_pred = final_model.predict(dtest, iteration_range=(0, best_iteration))
y_pred = (y_pred > 0.5).astype(int)  # Assuming binary classification

# Evaluate the model
conf_matrix = confusion_matrix(y_test, y_pred)
class_report = classification_report(y_test, y_pred)
accuracy = accuracy_score(y_test, y_pred)

# Print the results
print(f"\nLearning Rate: {learning_rate}")
print("Confusion Matrix:\n", conf_matrix)
print("\nClassification Report:\n", class_report)
print("Accuracy:", accuracy)



Learning Rate: 0.01
Confusion Matrix:
 [[40  6]
 [17 53]]

Classification Report:
               precision    recall  f1-score   support

           0       0.70      0.87      0.78        46
           1       0.90      0.76      0.82        70

    accuracy                           0.80       116
   macro avg       0.80      0.81      0.80       116
weighted avg       0.82      0.80      0.80       116

Accuracy: 0.8017241379310345


In [59]:
import pandas as pd

# Assuming X is your feature dataframe and y is the target dataframe

# Get 3 random samples from your own dataset (X)
sample_data_own = X.sample(10)  # Adjust the number of samples as needed

# Display the sample data
print("Sample Data from Own Dataset:")
print(sample_data_own)

# Convert to DMatrix format for XGBoost model prediction
sample_data_own_dmatrix = xgb.DMatrix(sample_data_own)

# Predict on your own data samples
sample_own_predictions = final_model.predict(sample_data_own_dmatrix)

# Convert predictions to binary (0 or 1)
sample_own_predictions_binary = (sample_own_predictions > 0.5).astype(int)

# Print the predictions
print("\nPredictions for Own Dataset Samples:", sample_own_predictions_binary)

# Get the actual labels for the selected samples
sample_labels = y[sample_data_own.index]

# Compare predictions with actual labels and print results
for i, (pred, actual) in enumerate(zip(sample_own_predictions_binary, sample_labels)):
    result = "Correct" if pred == actual else "Wrong"
    print(f"Sample {i+1}: Predicted = {pred}, Actual = {actual} - {result}")


Sample Data from Own Dataset:
     Price   Amount  DiscountOffered  RecommendedProfitMargin
40     3.0   507.49           428.14                     45.0
109   15.0   858.81           420.02                     45.0
510    2.0  3283.41           410.80                     45.0
268    8.0  3916.15           429.72                     45.0
351   60.0  3348.28           132.02                     45.0
76     3.0   323.27           245.11                     45.0
84     8.5  2224.45           403.62                     45.0
3     22.0   820.28            71.79                     45.0
111    3.8  1728.51            13.58                     45.0
566    4.5  2751.92           176.78                     45.0

Predictions for Own Dataset Samples: [1 0 0 1 1 0 1 0 1 0]
Sample 1: Predicted = 1, Actual = 1 - Correct
Sample 2: Predicted = 0, Actual = 0 - Correct
Sample 3: Predicted = 0, Actual = 1 - Wrong
Sample 4: Predicted = 1, Actual = 1 - Correct
Sample 5: Predicted = 1, Actual = 1 - Correct


In [69]:
import pandas as pd
import numpy as np

# Highly varied and random external data samples
external_samples = pd.DataFrame({
    'Price': [7, 9999, 350, 42, 10],
    'Amount': [1, 50000, 320, 8888, 200],
    'DiscountOffered': [0, 999, 8, 123, 77],
    'RecommendedProfitMargin': [90, 0, 25, 5, 60]
})

# Display the sample data
print(external_samples)


   Price  Amount  DiscountOffered  RecommendedProfitMargin
0      7       1                0                       90
1   9999   50000              999                        0
2    350     320                8                       25
3     42    8888              123                        5
4     10     200               77                       60


In [66]:
# Convert to DMatrix format (for XGBoost model)
external_samples_dmatrix = xgb.DMatrix(external_samples)

# Predict on the external data samples
external_predictions = final_model.predict(external_samples_dmatrix)

# Convert the predictions to binary (0 or 1)
external_predictions_binary = (external_predictions > 0.5).astype(int)

# Print the predictions
print("Predictions for external samples:", external_predictions_binary)


Predictions for external samples: [1 0 0 0 0]
