In [None]:
import pandas as pd
# Reading a CSV file
data = pd.read_csv('/content/incom2024_delay_example_dataset.csv')

# Displaying the first few rows of the dataframe
print(data.head())

from sklearn.preprocessing import LabelEncoder
# Create an instance of LabelEncoder
label_encoder = LabelEncoder()

# Fit and transform the "cities" feature in the dataset
data['cities_encoded'] = label_encoder.fit_transform(data['customer_city'])

  payment_type  profit_per_order  sales_per_customer  category_id  \
0        DEBIT         34.448338            92.49099          9.0   
1     TRANSFER         91.193540           181.99008         48.0   
2        DEBIT          8.313806            89.96643         46.0   
3     TRANSFER        -89.463196            99.15065         17.0   
4        DEBIT         44.722590           170.97824         48.0   

          category_name customer_city customer_country  customer_id  \
0      Cardio Equipment        Caguas      Puerto Rico   12097.6830   
1          Water Sports   Albuquerque          EE. UU.    5108.1045   
2  Indoor/Outdoor Games      Amarillo      Puerto Rico    4293.4478   
3                Cleats        Caguas      Puerto Rico     546.5306   
4          Water Sports       Peabody          EE. UU.    1546.3980   

  customer_segment customer_state  ...     order_region  \
0         Consumer             PR  ...   Western Europe   
1         Consumer             CA  ...  

In [None]:
# Display the column names of the dataset
column_names = data.columns
print(column_names)


Index(['payment_type', 'profit_per_order', 'sales_per_customer', 'category_id',
       'category_name', 'customer_city', 'customer_country', 'customer_id',
       'customer_segment', 'customer_state', 'customer_zipcode',
       'department_id', 'department_name', 'latitude', 'longitude', 'market',
       'order_city', 'order_country', 'order_customer_id', 'order_date',
       'order_id', 'order_item_cardprod_id', 'order_item_discount',
       'order_item_discount_rate', 'order_item_id', 'order_item_product_price',
       'order_item_profit_ratio', 'order_item_quantity', 'sales',
       'order_item_total_amount', 'order_profit_per_order', 'order_region',
       'order_state', 'order_status', 'product_card_id', 'product_category_id',
       'product_name', 'product_price', 'shipping_date', 'shipping_mode',
       'label', 'cities_encoded'],
      dtype='object')


In [None]:
# Create a copy of the dataset to avoid modifying the original one
data_encoded = data.copy()

# Initialize the LabelEncoder
label_encoder = LabelEncoder()

# Loop through each column in the dataset
for column in data_encoded.columns:
    if data_encoded[column].dtype == 'object':  # Check if the column is of object type (categorical)
        # Apply label encoding to the categorical column
        data_encoded[column] = label_encoder.fit_transform(data_encoded[column])

# Display the first few rows of the dataframe with encoded columns
print(data_encoded.head())
max(data_encoded['order_country'])
# Python code example
correlation_matrix = data_encoded.corr()
print(correlation_matrix)

# Python: Correlation with a specific column (label)
label_correlation = correlation_matrix["label"]
print(label_correlation)


payment_type                0.002731
profit_per_order           -0.007416
sales_per_customer         -0.009056
category_id                -0.006400
category_name               0.005959
customer_city               0.004300
customer_country           -0.005809
customer_id                -0.017567
customer_segment           -0.005757
customer_state             -0.008441
customer_zipcode            0.001178
department_id              -0.009762
department_name             0.007892
latitude                    0.009188
longitude                  -0.002473
market                      0.002891
order_city                 -0.015010
order_country              -0.000386
order_customer_id          -0.015388
order_date                  0.003990
order_id                    0.002403
order_item_cardprod_id     -0.007952
order_item_discount        -0.003425
order_item_discount_rate    0.014709
order_item_id               0.002233
order_item_product_price   -0.013844
order_item_profit_ratio    -0.009229
o

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns

# **1. Load and Clean Data**

# Load the data
data = pd.read_csv("/content/incom2024_delay_example_dataset.csv")

# **2. Adjust Data Types**
# Convert date columns to datetime objects
data['order_date'] = pd.to_datetime(data['order_date'])
data['shipping_date'] = pd.to_datetime(data['shipping_date'])

# **3. Handle Missing Values**
# For simplicity, we'll fill missing values in 'customer_state' and 'customer_zipcode' with "Unknown"
# You might need a more sophisticated strategy for real-world scenarios
data['customer_state'].fillna("Unknown", inplace=True)
data['customer_zipcode'].fillna("Unknown", inplace=True)

# **4. Check for Imbalance**

# Check the distribution of the 'label'
print(data['label'].value_counts())

# **5. Feature Reduction and Engineering**

# **5a. Select Relevant Features**
# Based on domain knowledge, select features likely to be related to shipping delays.
# This is where domain expertise is key! For now, let's make some assumptions:
features = ['payment_type', 'category_name', 'customer_city', 'customer_state',
            'customer_country', 'department_name', 'market', 'order_region', 'order_country',
            'order_state', 'shipping_mode']

# **5b. Encoding Categorical Features**

# One-Hot Encoding: (use for features with no inherent order)
data = pd.get_dummies(data, columns=['payment_type', 'category_name', 'customer_city',
                                   'customer_country', 'department_name', 'market',
                                   'order_region', 'order_country', 'order_state', 'shipping_mode'])

# **5c. Feature Scaling**
# Standardize numerical features
scaler = StandardScaler()
data[features] = scaler.fit_transform(data[features])

# **6. Principal Component Analysis (PCA)**
# Apply PCA to further reduce dimensions
pca = PCA(n_components=0.95) # Keep components explaining 95% of variance
pca_features = pca.fit_transform(data[features])
data['pca_features'] = list(pca_features)

# **7. Prepare Data for Modeling**
X = data['pca_features'].values.reshape(-1, pca_features.shape[1])
y = data['label']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# **8. Model Training and Evaluation**

# **8a. Train a Random Forest Classifier**
model = RandomForestClassifier(random_state=42)
model.fit(X_train, y_train)

# **8b. Make Predictions**
y_pred = model.predict(X_test)

# **9. Evaluate Model Performance**

# **9a. F1-Score**
f1 = f1_score(y_test, y_pred, average='weighted')  # Weighted average for imbalanced data
print(f"F1-Score: {f1}")

# **9b. Confusion Matrix**
cm = confusion_matrix(y_test, y_pred)

# Visualize the Confusion Matrix
plt.figure(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', cbar=False,
            xticklabels=['Early', 'On Time', 'Delayed'],
            yticklabels=['Early', 'On Time', 'Delayed'])
plt.xlabel("Predicted Label")
plt.ylabel("True Label")
plt.title("Confusion Matrix")
plt.show()

# **10. Feature Importance (Optional)**
# Get feature importances from the Random Forest model
feature_importances = model.feature_importances_

# Visualize feature importances
plt.figure(figsize=(10, 6))
plt.barh(range(len(features)), feature_importances, align='center')
plt.yticks(range(len(features)), features)
plt.xlabel("Feature Importance")
plt.title("Random Forest Feature Importance")
plt.show()

# **Iterations and Improvements**
# You would now iterate, trying different:
#  - Feature sets
#  - Encoding techniques
#  - Feature scaling methods
#  - Hyperparameters of your Random Forest model
#  - Imbalance handling techniques (e.g., SMOTE, oversampling, undersampling)
#  - And, potentially, different classification algorithms

  data['order_date'] = pd.to_datetime(data['order_date'])
  data['shipping_date'] = pd.to_datetime(data['shipping_date'])


label
 1    8976
-1    3545
 0    3028
Name: count, dtype: int64


KeyError: "['payment_type', 'category_name', 'customer_city', 'customer_country', 'department_name', 'market', 'order_region', 'order_country', 'order_state', 'shipping_mode'] not in index"

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.decomposition import PCA
from imblearn.over_sampling import RandomOverSampler

# 1. Load the data
df = pd.read_csv('/content/incom2024_delay_example_dataset.csv')  # Replace with your actual file name

# 2. Data Cleaning
# Convert date columns to datetime
date_columns = ['order_date', 'shipping_date']
for col in date_columns:
    df[col] = pd.to_datetime(df[col])

# Handle missing values
numeric_features = df.select_dtypes(include=['int64', 'float64']).columns
categorical_features = df.select_dtypes(include=['object']).columns

# Impute missing values
numeric_imputer = SimpleImputer(strategy='mean')
categorical_imputer = SimpleImputer(strategy='most_frequent')

df[numeric_features] = numeric_imputer.fit_transform(df[numeric_features])
df[categorical_features] = categorical_imputer.fit_transform(df[categorical_features])

# 3. Check for imbalance
print(df['label'].value_counts(normalize=True))

# 4. Feature Engineering
# Select relevant features
relevant_features = [
    'payment_type', 'profit_per_order', 'sales_per_customer', 'category_id',
    'customer_segment', 'latitude', 'longitude', 'market', 'order_item_discount_rate',
    'order_item_profit_ratio', 'order_item_quantity', 'sales', 'shipping_mode'
]

X = df[relevant_features]
y = df['label']

# Encode categorical variables
categorical_features = X.select_dtypes(include=['object']).columns
numeric_features = X.select_dtypes(include=['int64', 'float64']).columns

# Create preprocessing pipelines
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
])

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ])

# 5. Apply PCA
pca = PCA(n_components=0.95)  # Retain 95% of variance

# 6. Create a pipeline with preprocessing, PCA, and Random Forest
clf = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('pca', pca),
    ('classifier', RandomForestClassifier(n_estimators=100, random_state=42))
])

# 7. Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.decomposition import PCA
from imblearn.over_sampling import RandomOverSampler

# Steps 1-7 remain the same as in the previous code...

# 8. Handle imbalanced data using RandomOverSampler
ros = RandomOverSampler(random_state=42)
X_train_resampled, y_train_resampled = ros.fit_resample(X_train, y_train)

# 9. Define the pipeline
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('pca', PCA(n_components=0.95)),
    ('classifier', RandomForestClassifier(random_state=42))
])

# 10. Define the parameter grid for GridSearchCV
param_grid = {
    'pca__n_components': [0.85, 0.9, 0.95],
    'classifier__n_estimators': [100, 200, 300],
    'classifier__max_depth': [None, 10, 20, 30],
    'classifier__min_samples_split': [2, 5, 10],
    'classifier__min_samples_leaf': [1, 2, 4]
}

# 11. Perform GridSearchCV
grid_search = GridSearchCV(pipeline, param_grid, cv=5, n_jobs=-1, verbose=2)
grid_search.fit(X_train_resampled, y_train_resampled)

# 12. Print the best parameters and score
print("Best parameters:", grid_search.best_params_)
print("Best cross-validation score:", grid_search.best_score_)

# 13. Use the best model to make predictions
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)

# 14. Evaluate the model
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

# 15. Feature importance
feature_importance = best_model.named_steps['classifier'].feature_importances_
feature_names = best_model.named_steps['preprocessor'].get_feature_names_out()

# Create a dataframe of feature importances
feature_importance_df = pd.DataFrame({
    'feature': feature_names,
    'importance': feature_importance
})
feature_importance_df = feature_importance_df.sort_values('importance', ascending=False)
print(feature_importance_df.head(10))  # Print top 10 most important features

  df[col] = pd.to_datetime(df[col])
  df[col] = pd.to_datetime(df[col])


label
 1.0    0.577272
-1.0    0.227989
 0.0    0.194739
Name: proportion, dtype: float64
Fitting 5 folds for each of 324 candidates, totalling 1620 fits


KeyboardInterrupt: 

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.decomposition import PCA
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.preprocessing import OneHotEncoder
from imblearn.over_sampling import RandomOverSampler

# Load the dataset
data= pd.read_csv('/content/incom2024_delay_example_dataset.csv')  # Replaace with your actual dataset path

# Data Cleaning
## 1. Adjust columns data types
data['order_date'] = pd.to_datetime(data['order_date'])
data['shipping_date'] = pd.to_datetime(data['shipping_date'])

## 2. Handling missing values
data.fillna(method='ffill', inplace=True)  # Forward fill for simplicity

## 3. Check for imbalance
print(data['label'].value_counts())

# Balancing the dataset
ros = RandomOverSampler(random_state=42)
X = data.drop('label', axis=1)  # Features
y = data['label']  # Target variable
X_resampled, y_resampled = ros.fit_resample(X, y)

# Iteration 1
## 4. Feature reduction and encoding
# Convert categorical features to numerical using OneHotEncoding
categorical_features = ['payment_type', 'category_name', 'customer_city', 'customer_country', 'customer_segment', 'customer_state', 'department_name', 'order_city', 'order_country', 'product_name', 'shipping_mode']
encoder = OneHotEncoder(sparse=False)
X_encoded = encoder.fit_transform(X_resampled[categorical_features])
X_resampled = pd.concat([X_resampled.drop(categorical_features, axis=1).reset_index(drop=True), pd.DataFrame(X_encoded)], axis=1)

# Apply PCA for dimensionality reduction
pca = PCA(n_components=0.95)  # Retain 95% of variance
X_pca = pca.fit_transform(X_resampled)

## 5. Apply the algorithm
X_train, X_test, y_train, y_test = train_test_split(X_pca, y_resampled, test_size=0.2, random_state=42)
model = RandomForestClassifier(random_state=42)
model.fit(X_train, y_train)

## 6. Evaluation parameters
y_pred = model.predict(X_test)
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

# Repeat steps 4-7 until best results are achieved


  data['order_date'] = pd.to_datetime(data['order_date'])
  data['shipping_date'] = pd.to_datetime(data['shipping_date'])
  data.fillna(method='ffill', inplace=True)  # Forward fill for simplicity


label
 1    8976
-1    3545
 0    3028
Name: count, dtype: int64




TypeError: Feature names are only supported if all input features have string names, but your input has ['int', 'str'] as feature name / column name types. If you want feature names to be stored and validated, you must convert them all to strings, by using X.columns = X.columns.astype(str) for example. Otherwise you can remove feature / column names from your input data, or convert them all to a non-string data type.

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.decomposition import PCA
from imblearn.over_sampling import RandomOverSampler
from scipy.stats import randint, uniform

# 1. Load the data
df= pd.read_csv('/content/incom2024_delay_example_dataset.csv')  # Replaace with your actual dataset path

# 2. Data Cleaning
# Convert date columns to datetime
date_columns = ['order_date', 'shipping_date']
for col in date_columns:
    df[col] = pd.to_datetime(df[col])

# Handle missing values
numeric_features = df.select_dtypes(include=['int64', 'float64']).columns
categorical_features = df.select_dtypes(include=['object']).columns

# Impute missing values
numeric_imputer = SimpleImputer(strategy='mean')
categorical_imputer = SimpleImputer(strategy='most_frequent')

df[numeric_features] = numeric_imputer.fit_transform(df[numeric_features])
df[categorical_features] = categorical_imputer.fit_transform(df[categorical_features])

# 3. Check for imbalance
print("Class distribution:")
print(df['label'].value_counts(normalize=True))

# 4. Feature Engineering
# Select relevant features
relevant_features = [
    'payment_type', 'profit_per_order', 'sales_per_customer', 'category_id',
    'customer_segment', 'latitude', 'longitude', 'market', 'order_item_discount_rate',
    'order_item_profit_ratio', 'order_item_quantity', 'sales', 'shipping_mode'
]

X = df[relevant_features]
y = df['label']

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Handle imbalanced data using RandomOverSampler
ros = RandomOverSampler(random_state=42)
X_train_resampled, y_train_resampled = ros.fit_resample(X_train, y_train)

# Identify numeric and categorical columns
numeric_features = X.select_dtypes(include=['int64', 'float64']).columns
categorical_features = X.select_dtypes(include=['object']).columns

# Create preprocessing pipelines
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
])

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ])

# Define the pipeline
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('pca', PCA()),
    ('classifier', RandomForestClassifier(random_state=42))
])

# Use a smaller subset of data for tuning
X_sample, _, y_sample, _ = train_test_split(X_train_resampled, y_train_resampled,
                                            train_size=0.3, random_state=42)

# Define the parameter distribution for RandomizedSearchCV
param_dist = {
    'pca__n_components': uniform(0.8, 0.15),
    'classifier__n_estimators': randint(50, 300),
    'classifier__max_depth': randint(10, 50),
    'classifier__min_samples_split': randint(2, 11),
    'classifier__min_samples_leaf': randint(1, 5)
}

# Perform RandomizedSearchCV
random_search = RandomizedSearchCV(pipeline, param_distributions=param_dist,
                                   n_iter=50, cv=3, n_jobs=-1, verbose=2, random_state=42)

print("Starting hyperparameter tuning...")
random_search.fit(X_sample, y_sample)

# Print the best parameters and score
print("Best parameters:", random_search.best_params_)
print("Best cross-validation score:", random_search.best_score_)

# Use the best model to make predictions
best_model = random_search.best_estimator_
y_pred = best_model.predict(X_test)

# Evaluate the model
print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred))
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

# Feature importance
feature_importance = best_model.named_steps['classifier'].feature_importances_
feature_names = best_model.named_steps['preprocessor'].get_feature_names_out()

# Create a dataframe of feature importances
feature_importance_df = pd.DataFrame({
    'feature': feature_names,
    'importance': feature_importance
})
feature_importance_df = feature_importance_df.sort_values('importance', ascending=False)
print("\nTop 10 Most Important Features:")
print(feature_importance_df.head(10))

  df[col] = pd.to_datetime(df[col])
  df[col] = pd.to_datetime(df[col])


Class distribution:
label
 1.0    0.577272
-1.0    0.227989
 0.0    0.194739
Name: proportion, dtype: float64
Starting hyperparameter tuning...
Fitting 3 folds for each of 50 candidates, totalling 150 fits
Best parameters: {'classifier__max_depth': 49, 'classifier__min_samples_leaf': 1, 'classifier__min_samples_split': 3, 'classifier__n_estimators': 160, 'pca__n_components': 0.9243106263727894}
Best cross-validation score: 0.5172414040494916

Confusion Matrix:
[[203 190 316]
 [179 155 277]
 [541 496 753]]

Classification Report:
              precision    recall  f1-score   support

        -1.0       0.22      0.29      0.25       709
         0.0       0.18      0.25      0.21       611
         1.0       0.56      0.42      0.48      1790

    accuracy                           0.36      3110
   macro avg       0.32      0.32      0.31      3110
weighted avg       0.41      0.36      0.38      3110



ValueError: All arrays must be of the same length

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from imblearn.over_sampling import SMOTE
from xgboost import XGBClassifier # Using XGBoost
from sklearn.metrics import f1_score, classification_report
from sklearn.model_selection import GridSearchCV

# Load the dataset
df = pd.read_csv('/content/incom2024_delay_example_dataset.csv')


# Check for missing values and handle them (numerical only)
for col in df.select_dtypes(include=['number']):
    df[col].fillna(df[col].mean(), inplace=True)

# Define features
categorical_features = ['payment_type', 'category_name', 'customer_segment', 'customer_state',
                       'department_name', 'market', 'order_region', 'order_state',
                       'order_status', 'shipping_mode', 'customer_city', 'customer_country', 'order_city', 'order_country']
numerical_features = ['profit_per_order', 'sales_per_customer', 'category_id',
                   'customer_id', 'customer_zipcode', 'department_id', 'latitude', 'longitude',
                   'order_customer_id', 'order_id', 'order_item_cardprod_id', 'order_item_discount',
                   'order_item_discount_rate', 'order_item_id',
                   'order_item_product_price', 'order_item_profit_ratio',
                   'order_item_quantity', 'sales', 'order_item_total_amount',
                   'order_profit_per_order', 'product_card_id',
                   'product_category_id', 'product_price']

# Preprocessing pipeline
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_features),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
    ])

# Split data into features (X) and labels (y)
X = df.drop(['order_date', 'shipping_date', 'label', 'product_name'], axis=1)
y = df['label']

# Apply preprocessor BEFORE SMOTE
X = preprocessor.fit_transform(X)

# Handle Data Imbalance with SMOTE
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X, y)

# Split data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(
    X_resampled, y_resampled, test_size=0.2, random_state=42, stratify=y_resampled) # Stratified split

# XGBoost Model and Hyperparameter Tuning
model = XGBClassifier(random_state=42, objective='multi:softmax', num_class=3)  # Multiclass classification

param_grid = {
    'n_estimators': [100, 200, 300],
    'learning_rate': [0.01, 0.1, 0.3],
    'max_depth': [3, 5, 7]
}

grid_search = GridSearchCV(model, param_grid, cv=5, scoring='f1_macro')
grid_search.fit(X_train, y_train)

# Best Model and Evaluation
best_model = grid_search.best_estimator_

y_pred = best_model.predict(X_test)
f1 = f1_score(y_test, y_pred, average='macro')
print(f"F1 Score: {f1}")
print(classification_report(y_test, y_pred))

# Print the best hyperparameters
print("Best Hyperparameters:", grid_search.best_params_)

ValueError: 
All the 135 fits failed.
It is very likely that your model is misconfigured.
You can try to debug the error by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
135 fits failed with the following error:
Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/sklearn/model_selection/_validation.py", line 729, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/usr/local/lib/python3.10/dist-packages/xgboost/core.py", line 726, in inner_f
    return func(**kwargs)
  File "/usr/local/lib/python3.10/dist-packages/xgboost/sklearn.py", line 1491, in fit
    raise ValueError(
ValueError: Invalid classes inferred from unique values of `y`.  Expected: [0 1 2], got [-1  0  1]


In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.compose import ColumnTransformer
from imblearn.over_sampling import SMOTE
from xgboost import XGBClassifier
from sklearn.metrics import f1_score, classification_report
from sklearn.model_selection import GridSearchCV

# Load the dataset
df = pd.read_csv('/content/incom2024_delay_example_dataset.csv')

# Data Cleaning and Consistency

# Check for missing values and handle them (numerical only)
for col in df.select_dtypes(include=['number']):
    df[col].fillna(df[col].mean(), inplace=True)

# Define features
categorical_features = ['payment_type', 'category_name', 'customer_segment', 'customer_state',
                       'department_name', 'market', 'order_region', 'order_state',
                       'order_status', 'shipping_mode', 'customer_city', 'customer_country', 'order_city', 'order_country']
numerical_features = ['profit_per_order', 'sales_per_customer', 'category_id',
                   'customer_id', 'customer_zipcode', 'department_id', 'latitude', 'longitude',
                   'order_customer_id', 'order_id', 'order_item_cardprod_id', 'order_item_discount',
                   'order_item_discount_rate', 'order_item_id',
                   'order_item_product_price', 'order_item_profit_ratio',
                   'order_item_quantity', 'sales', 'order_item_total_amount',
                   'order_profit_per_order', 'product_card_id',
                   'product_category_id', 'product_price']

# Preprocessing pipeline
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_features),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
    ])

# Split data into features (X) and labels (y)
X = df.drop(['order_date', 'shipping_date', 'label', 'product_name'], axis=1)
y = df['label']

# Apply preprocessor BEFORE SMOTE
X = preprocessor.fit_transform(X)

# Encode labels with LabelEncoder
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(y)

# Handle Data Imbalance with SMOTE
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X, y)

# Split data into train and test sets (stratified)
X_train, X_test, y_train, y_test = train_test_split(
    X_resampled, y_resampled, test_size=0.2, random_state=42, stratify=y_resampled)

# XGBoost Model and Hyperparameter Tuning
model = XGBClassifier(random_state=42, objective='multi:softmax', num_class=3)

param_grid = {
    'n_estimators': [100, 200, 300],
    'learning_rate': [0.01, 0.1, 0.3],
    'max_depth': [3, 5, 7]
}

grid_search = GridSearchCV(model, param_grid, cv=5, scoring='f1_macro')
grid_search.fit(X_train, y_train)

# Best Model and Evaluation
best_model = grid_search.best_estimator_

y_pred = best_model.predict(X_test)
f1 = f1_score(y_test, y_pred, average='macro')
print(f"F1 Score: {f1}")
print(classification_report(y_test, y_pred))

# Print the best hyperparameters
print("Best Hyperparameters:", grid_search.best_params_)

F1 Score: 0.7299440386286976
              precision    recall  f1-score   support

           0       0.69      0.81      0.74      1795
           1       0.90      0.61      0.73      1795
           2       0.68      0.77      0.72      1796

    accuracy                           0.73      5386
   macro avg       0.75      0.73      0.73      5386
weighted avg       0.75      0.73      0.73      5386

Best Hyperparameters: {'learning_rate': 0.3, 'max_depth': 7, 'n_estimators': 300}


In [None]:
pip install imblearn

Collecting imblearn
  Downloading imblearn-0.0-py2.py3-none-any.whl.metadata (355 bytes)
Collecting imbalanced-learn (from imblearn)
  Downloading imbalanced_learn-0.12.3-py3-none-any.whl.metadata (8.3 kB)
Downloading imblearn-0.0-py2.py3-none-any.whl (1.9 kB)
Downloading imbalanced_learn-0.12.3-py3-none-any.whl (258 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m258.3/258.3 kB[0m [31m1.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: imbalanced-learn, imblearn
Successfully installed imbalanced-learn-0.12.3 imblearn-0.0


In [None]:
pip install xgboost

Collecting xgboost
  Downloading xgboost-2.1.1-py3-none-manylinux_2_28_x86_64.whl.metadata (2.1 kB)
Collecting nvidia-nccl-cu12 (from xgboost)
  Downloading nvidia_nccl_cu12-2.22.3-py3-none-manylinux2014_x86_64.whl.metadata (1.8 kB)
Downloading xgboost-2.1.1-py3-none-manylinux_2_28_x86_64.whl (153.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m153.9/153.9 MB[0m [31m6.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading nvidia_nccl_cu12-2.22.3-py3-none-manylinux2014_x86_64.whl (190.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m190.9/190.9 MB[0m [31m5.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: nvidia-nccl-cu12, xgboost
Successfully installed nvidia-nccl-cu12-2.22.3 xgboost-2.1.1


In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.compose import ColumnTransformer
from imblearn.over_sampling import SMOTE
from xgboost import XGBClassifier
from sklearn.metrics import f1_score, classification_report
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
# Load the dataset
df = pd.read_csv('/content/incom2024_delay_example_dataset.csv')

summary =df.describe()
summary
# Data Cleaning and Consistency (Numerical only)
for col in df.select_dtypes(include=['number']):
    df[col].fillna(df[col].mean(), inplace=True)

# Define features
categorical_features = ['payment_type', 'category_name', 'customer_segment', 'customer_state',
                       'department_name', 'market', 'order_region', 'order_state',
                       'order_status', 'shipping_mode', 'customer_city', 'customer_country', 'order_city', 'order_country']
numerical_features = ['profit_per_order', 'sales_per_customer', 'category_id',
                   'customer_id', 'customer_zipcode', 'department_id', 'latitude', 'longitude',
                   'order_customer_id', 'order_id', 'order_item_cardprod_id', 'order_item_discount',
                   'order_item_discount_rate', 'order_item_id',
                   'order_item_product_price', 'order_item_profit_ratio',
                   'order_item_quantity', 'sales', 'order_item_total_amount',
                   'order_profit_per_order', 'product_card_id',
                   'product_category_id', 'product_price']

# Preprocessing pipeline
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_features),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
    ])

# Split data into features (X) and labels (y)
X = df.drop(['order_date', 'shipping_date', 'label', 'product_name'], axis=1)
y = df['label']

# Apply preprocessor BEFORE SMOTE
X = preprocessor.fit_transform(X)

# Encode labels with LabelEncoder
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(y)

# Handle Data Imbalance with SMOTE
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X, y)

# Split data into train and test sets (stratified)
X_train, X_test, y_train, y_test = train_test_split(
    X_resampled, y_resampled, test_size=0.2, random_state=42, stratify=y_resampled)



# Random Forest
model_rf = RandomForestClassifier(random_state=42, class_weight='balanced')
param_grid_rf = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 5, 10],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
}

#  --- Hyperparameter Tuning ---
model = model_rf
param_grid = param_grid_rf

grid_search = GridSearchCV(model, param_grid, cv=5, scoring='f1_macro')
grid_search.fit(X_train, y_train)

# Best Model and Evaluation
best_model = grid_search.best_estimator_

y_pred = best_model.predict(X_test)
f1 = f1_score(y_test, y_pred, average='macro')
print(f"F1 Score: {f1}")
print(classification_report(y_test, y_pred))

# Print the best hyperparameters
print("Best Hyperparameters:", grid_search.best_params_)
#.81 f1 score

KeyboardInterrupt: 

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.compose import ColumnTransformer
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score, classification_report
from sklearn.model_selection import GridSearchCV

# Load the dataset
df = pd.read_csv('/content/incom2024_delay_example_dataset.csv')

# Data Cleaning and Consistency (Numerical only)
for col in df.select_dtypes(include=['number']):
    df[col].fillna(df[col].mean(), inplace=True)

# Define features
categorical_features = ['payment_type', 'category_name', 'customer_segment', 'customer_state',
                       'department_name', 'market', 'order_region', 'order_state',
                       'order_status', 'shipping_mode', 'customer_city', 'customer_country', 'order_city', 'order_country']
numerical_features = ['profit_per_order', 'sales_per_customer', 'category_id',
                   'customer_id', 'customer_zipcode', 'department_id', 'latitude', 'longitude',
                   'order_customer_id', 'order_id', 'order_item_cardprod_id', 'order_item_discount',
                   'order_item_discount_rate', 'order_item_id',
                   'order_item_product_price', 'order_item_profit_ratio',
                   'order_item_quantity', 'sales', 'order_item_total_amount',
                   'order_profit_per_order', 'product_card_id',
                   'product_category_id', 'product_price']

# Preprocessing pipeline
#hand
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_features),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
    ])

# Split data into features (X) and labels (y)
#since the date values are inconsistent we drop them
X = df.drop(['order_date', 'shipping_date', 'label', 'product_name'], axis=1)
y = df['label']

# Encode labels with LabelEncoder
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(y)

# Split data into train and test sets (stratified) BEFORE SMOTE
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y)

# Apply preprocessor to training and testing data
X_train = preprocessor.fit_transform(X_train)
X_test = preprocessor.transform(X_test)

# Handle Data Imbalance with SMOTE ONLY on TRAINING data
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

# Random Forest
model_rf = RandomForestClassifier(random_state=42, class_weight='balanced')
param_grid_rf = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 5, 10],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
}

# Hyperparameter Tuning
grid_search = GridSearchCV(model_rf, param_grid_rf, cv=5, scoring='f1_macro')
grid_search.fit(X_train_resampled, y_train_resampled)

# Best Model and Evaluation
best_model = grid_search.best_estimator_

y_pred = best_model.predict(X_test)
f1 = f1_score(y_test, y_pred, average='macro')
print(f"F1 Score: {f1}")
print(classification_report(y_test, y_pred))

# Print the best hyperparameters
print("Best Hyperparameters:", grid_search.best_params_)

KeyError: "['order_date', 'shipping_date', 'label', 'product_name'] not found in axis"

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.compose import ColumnTransformer
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score, classification_report
from sklearn.model_selection import GridSearchCV

# Load the dataset
df = pd.read_csv('/content/incom2024_delay_example_dataset.csv')

# Data Cleaning and Consistency (Numerical only)
for col in df.select_dtypes(include=['number']):
    df[col].fillna(df[col].mean(), inplace=True)

# Define features
categorical_features = ['payment_type', 'category_name', 'customer_segment', 'customer_state',
                       'department_name', 'market', 'order_region', 'order_state',
                       'order_status', 'shipping_mode', 'customer_city', 'customer_country', 'order_city', 'order_country']
numerical_features = ['profit_per_order', 'sales_per_customer', 'category_id',
                   'customer_id', 'customer_zipcode', 'department_id', 'latitude', 'longitude',
                   'order_customer_id', 'order_id', 'order_item_cardprod_id', 'order_item_discount',
                   'order_item_discount_rate', 'order_item_id',
                   'order_item_product_price', 'order_item_profit_ratio',
                   'order_item_quantity', 'sales', 'order_item_total_amount',
                   'order_profit_per_order', 'product_card_id',
                   'product_category_id', 'product_price']

# Preprocessing pipeline
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_features),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
    ])

# Split data into features (X) and labels (y)
X = df.drop(['order_date', 'shipping_date', 'label', 'product_name'], axis=1)
y = df['label']

# Encode labels with LabelEncoder
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(y)

# Split data into train and test sets (stratified) BEFORE SMOTE
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y)

# Apply preprocessor to training and testing data
X_train = preprocessor.fit_transform(X_train)
X_test = preprocessor.transform(X_test)

# Handle Data Imbalance with SMOTE ONLY on TRAINING data
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

# Random Forest
model_rf = RandomForestClassifier(random_state=42, class_weight='balanced')
param_grid_rf = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 5, 10],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
}

# Hyperparameter Tuning
grid_search = GridSearchCV(model_rf, param_grid_rf, cv=5, scoring='f1_macro')
grid_search.fit(X_train_resampled, y_train_resampled)

# Best Model and Evaluation
best_model = grid_search.best_estimator_

y_pred = best_model.predict(X_test)
f1 = f1_score(y_test, y_pred, average='macro')
print(f"F1 Score: {f1}")
print(classification_report(y_test, y_pred))

# Print the best hyperparameters
print("Best Hyperparameters:", grid_search.best_params_)

F1 Score: 0.3616513112207786
              precision    recall  f1-score   support

           0       0.38      0.30      0.33       709
           1       0.22      0.02      0.03       606
           2       0.62      0.86      0.72      1795

    accuracy                           0.57      3110
   macro avg       0.40      0.39      0.36      3110
weighted avg       0.48      0.57      0.50      3110

Best Hyperparameters: {'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}
