In [7]:
# Group 4 - ML Random Forest Algorithm for PRH4 Prediction for Customer's next Order
# Introduction:
# This script performs data preprocessing and machine learning model training using a sales dataset.
# The main objective is to predict the 'PRH4' category based on features such as sales channel, customer group, customer number, region, and DSO Indicator.
# The script involves the following steps:
# 1. Importing necessary libraries.
# 2. Loading the dataset and cleaning it by removing customers with null values.
# 3. Filtering customers based on specific criteria related to unique orders and PRH4 categories.
# 4. Splitting the data into training and testing sets.
# 5. One-hot encoding the categorical features.
# 6. Training a Random Forest Classifier model.
# 7. Evaluating the model using accuracy, precision, recall, and F1 score.
# 8. Displaying the maximum depth of the trained Random Forest model's trees.

# Summary:
# The script successfully cleans the dataset by removing customers with null values and those who don't meet specific criteria.
# The Random Forest Classifier model is trained and evaluated on the cleaned dataset.
# The model achieves an accuracy of 50.21%, precision of 47.65%, recall of 50.21%, and F1 score of 47.03%.
# The maximum depth of the trees in the Random Forest model is found to be 655, indicating deep and complex trees.
# This performance suggests that while the model captures some patterns in the data, further tuning and feature engineering may be required to improve its predictive power.

In [1]:
#Importing required libraries

import pandas as pd
import glob
import os
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.preprocessing import MultiLabelBinarizer, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score
from scipy.stats import randint
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report
from sklearn.model_selection import GridSearchCV

In [2]:
# Load the dataset
file_path = '/Client Data/MergedCSV/MergedData.csv'
merged_df = pd.read_csv(file_path)

# Identify customers with any NULL values in any column
customers_with_nulls = merged_df[merged_df.isnull().any(axis=1)]['customer_number'].unique()

# Remove all records of these customers from the dataset
cleaned_df = merged_df[~merged_df['customer_number'].isin(customers_with_nulls)]

# Calculate the unique order count for each customer
unique_orders_per_customer = cleaned_df.groupby('customer_number')['order_number'].nunique()

# Identify customers who have placed more than 3 unique orders
customers_with_more_than_3_orders = unique_orders_per_customer[unique_orders_per_customer > 2].index

# Filter the cleaned_df to keep only those customers
filtered_cleaned_df = cleaned_df[cleaned_df['customer_number'].isin(customers_with_more_than_3_orders)]

# Calculate the number of unique PRH4 per order
unique_prh4_per_order = filtered_cleaned_df.groupby(['order_number', 'customer_number'])['PRH4'].nunique()

# Identify customers who have ordered more than 7 unique PRH4 in the same order
customers_with_more_than_7_prh4 = unique_prh4_per_order[unique_prh4_per_order >= 7].index.get_level_values('customer_number').unique()

# Remove all records of these customers from the filtered_cleaned_df
final_cleaned_df = filtered_cleaned_df[~filtered_cleaned_df['customer_number'].isin(customers_with_more_than_7_prh4)]

# Display the number of records in the final cleaned dataset
num_records_final_cleaned = final_cleaned_df.shape[0]
print(f'The number of records in the final_cleaned_df DataFrame: {num_records_final_cleaned}')

# Check the number of unique customers in the final cleaned dataframe
unique_customers_final_cleaned = final_cleaned_df['customer_number'].nunique()
print(f'The number of unique customers in the final_cleaned_df DataFrame: {unique_customers_final_cleaned}')


  merged_df = pd.read_csv(file_path)


The number of records in the final_cleaned_df DataFrame: 94995
The number of unique customers in the final_cleaned_df DataFrame: 5003


In [3]:
df = final_cleaned_df

# Define the feature columns and the target column
feature_columns = ['sales_channel', 'customer_group', 'customer_number', 'region', 'DSO_Ind']
target_column = 'PRH4'

# Separate the features and the target variable
X = df[feature_columns]
y = df[target_column]

# One-hot encode the categorical variables
column_transformer = ColumnTransformer(
    transformers=[
        ('encoder', OneHotEncoder(), feature_columns)
    ],
    remainder='passthrough'
)
X = column_transformer.fit_transform(X)


# Total number of records
total_records = len(df)

# Calculate the split index
split_index = int(total_records * 0.8)


X_train = X[:split_index]
X_test = X[split_index:]
y_train = y[:split_index]
y_test = y[split_index:]

# Display the shapes of the resulting datasets
X_train.shape, X_test.shape, y_train.shape, y_test.shape


# Initialize the model
model = RandomForestClassifier(random_state=42)

In [4]:
# Initialize the model with specific hyperparameters
model = RandomForestClassifier(
    n_estimators=100,  # Number of trees in the forest
    max_depth=None,    # Maximum depth of the tree
    min_samples_split=2,  # Minimum number of samples required to split an internal node
    min_samples_leaf=1,   # Minimum number of samples required to be at a leaf node
    random_state=42      # Seed used by the random number generator
)


# Train the model
model.fit(X_train, y_train)

# Make predictions
y_pred = model.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted')
recall = recall_score(y_test, y_pred, average='weighted')
f1 = f1_score(y_test, y_pred, average='weighted')
report = classification_report(y_test, y_pred)

# Display the evaluation metrics and hyperparameters
print(f'Accuracy: {accuracy}')
print(f'Precision: {precision}')
print(f'Recall: {recall}')
print(f'F1 Score: {f1}')
print('Classification Report:')
print(report)

# Display the hyperparameters
print('Hyperparameters:')
print(f'n_estimators: {model.n_estimators}')
print(f'max_depth: {model.max_depth}')
print(f'min_samples_split: {model.min_samples_split}')
print(f'min_samples_leaf: {model.min_samples_leaf}')
print(f'random_state: {model.random_state}')

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Accuracy: 0.502131691141639
Precision: 0.47654466658314026
Recall: 0.502131691141639
F1 Score: 0.4703046100633046
Classification Report:
                                    precision    recall  f1-score   support

           Abutments, Customizable       0.00      0.00      0.00         5
             Abutments, Edentulous       0.36      0.18      0.24       432
                    Abutments, SRA       0.35      0.09      0.14       825
                Abutments, Ti Base       0.63      0.43      0.51       492
  Abutments, single tooth + bridge       0.54      0.24      0.33       207
        Allogenic Bone Substitutes       0.29      0.33      0.31       271
      Allogenic Soft Tissue Grafts       0.00      0.00      0.00        13
               BL Healing Surgical       0.39      0.20      0.27      2179
     BLAT Ti Implants, hydrophilic       0.57      0.77      0.66      1684
        BLAT Ti Implants, standard       0.63      0.64      0.64      1574
   BLAT TiZr Implants, hyd

  _warn_prf(average, modifier, msg_start, len(result))


In [5]:
# Find the max depth of the trees in the forest
max_depths = [estimator.tree_.max_depth for estimator in model.estimators_]
max_depth = max(max_depths)

# Display the max depth
print(f'Max Depth of the Random Forest model: {max_depth}')

Max Depth of the Random Forest model: 655
