In [3]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, roc_auc_score
from sklearn.preprocessing import LabelEncoder
import pickle

# Function to load data
def load_data(filepath):
    return pd.read_excel(filepath)

# Function to preprocess data
def preprocess_data(df):
    # Handle missing values
    df['material_ref'] = df['material_ref'].replace('00000', np.nan)
    df['material_ref'] = df['material_ref'].fillna(df['material_ref'].mode()[0])
    df['quantity tons'] = pd.to_numeric(df['quantity tons'], errors='coerce')
    df['thickness'] = pd.to_numeric(df['thickness'], errors='coerce')
    df = df.dropna(subset=['quantity tons', 'thickness'])

    # Feature engineering
    df.loc[:, 'new_feature'] = df['quantity tons'] * df['thickness']

    # Encoding categorical variables
    categorical_cols = ['country', 'item type', 'application']
    if all(col in df.columns for col in categorical_cols):
        df = pd.get_dummies(df, columns=categorical_cols, drop_first=True)

    # Encode status for classification
    if 'status' in df.columns:
        df = df[df['status'].str.upper().isin(['WON', 'LOST'])]
        le = LabelEncoder()
        df['status'] = le.fit_transform(df['status'])  # WON -> 1, LOST -> 0

    # Ensure only numeric columns are used
    df = df.select_dtypes(include=[np.number])
    
    return df

# Function to train classification model
def train_classification_model(X_train, y_train):
    classifier = RandomForestClassifier(random_state=42)
    classifier.fit(X_train, y_train)
    return classifier

# Load dataset
file_path = "Copper_Set.xlsx"  # Update the path
data = load_data(file_path)

# Preprocess data
classification_data = preprocess_data(data)

# Split data into features and target
X = classification_data.drop(['status'], axis=1)
y = classification_data['status']

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train the model
classification_model = train_classification_model(X_train, y_train)

# Evaluate the model
y_pred = classification_model.predict(X_test)
print("Classification Report:")
print(classification_report(y_test, y_pred))
print("ROC-AUC Score:", roc_auc_score(y_test, classification_model.predict_proba(X_test)[:, 1]))

# Save the model
with open("classification_model.pkl", "wb") as file:
    pickle.dump(classification_model, file)

print("Classification model saved as 'classification_model.pkl'")


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.loc[:, 'new_feature'] = df['quantity tons'] * df['thickness']


Classification Report:
              precision    recall  f1-score   support

           0       0.93      0.85      0.89      6952
           1       0.96      0.98      0.97     23138

    accuracy                           0.95     30090
   macro avg       0.94      0.92      0.93     30090
weighted avg       0.95      0.95      0.95     30090

ROC-AUC Score: 0.9867526218085494
Classification model saved as 'classification_model.pkl'


In [4]:
import pickle
import pandas as pd

# Load the classification model
classification_model_path = "classification_model.pkl"
with open(classification_model_path, 'rb') as file:
    classification_model = pickle.load(file)

# Extract feature names from the model
feature_names = classification_model.feature_names_in_

# Generate dummy input data
COUNTRY_OPTIONS = [26, 27, 28, 30, 32, 38, 39, 40, 77, 78, 79, 80, 84, 89, 107, 113]
ITEM_TYPE_OPTIONS = ['Others', 'PL', 'S', 'SLAWR', 'W', 'WI']
APPLICATION_OPTIONS = [3, 4, 5, 10, 15, 19, 20, 22, 25, 26, 27, 28, 29, 38, 39, 40, 
                       41, 42, 56, 58, 59, 65, 66, 67, 68, 69, 70, 79, 99]

# Test various combinations to identify "LOST" outputs
results = []
for country in COUNTRY_OPTIONS:
    for item_type in ITEM_TYPE_OPTIONS:
        for application in APPLICATION_OPTIONS:
            # Create input data
            input_data = pd.DataFrame({
                'country': [country],
                'item type': [item_type],
                'application': [application]
            })
            # Dummy encoding and add missing columns
            input_data_encoded = pd.get_dummies(input_data)
            for col in feature_names:
                if col not in input_data_encoded:
                    input_data_encoded[col] = 0  # Add missing columns
            
            # Predict using the model
            prediction = classification_model.predict(input_data_encoded[feature_names])
            if prediction[0] == 0:  # 0 indicates "LOST"
                results.append((country, item_type, application))

# Return the combinations that result in "LOST"
print(results)



[]


In [5]:
import pandas as pd

# Load dataset
file_path = "Copper_Set.xlsx"  # Update this path to the correct location of your dataset
data = pd.read_excel(file_path)

# Check for "LOST" in the 'status' column
lost_count = data['status'].str.upper().value_counts().get("LOST", 0)
print(f"Number of 'LOST' entries: {lost_count}")


Number of 'LOST' entries: 34438


In [6]:
# Check the balance of "WON" and "LOST"
class_balance = data['status'].str.upper().value_counts()
print("Class Balance:")
print(class_balance)


Class Balance:
status
WON                116010
LOST                34438
NOT LOST FOR AM     19573
REVISED              4276
TO BE APPROVED       4170
DRAFT                3140
OFFERED                53
OFFERABLE              10
WONDERFUL               1
Name: count, dtype: int64


In [7]:
# Filter for relevant statuses: WON and LOST
filtered_data = data[data['status'].str.upper().isin(['WON', 'LOST'])]

# Check the new balance
print("Filtered Class Balance:")
print(filtered_data['status'].str.upper().value_counts())


Filtered Class Balance:
status
WON     116010
LOST     34438
Name: count, dtype: int64


In [8]:
# Separate WON and LOST data
won = filtered_data[filtered_data['status'].str.upper() == "WON"]
lost = filtered_data[filtered_data['status'].str.upper() == "LOST"]

# Balance WON to match LOST
won_sampled = won.sample(n=lost.shape[0], random_state=42)

# Combine and shuffle the dataset
balanced_data = pd.concat([won_sampled, lost]).sample(frac=1, random_state=42)

# Check the balanced class distribution
print("Balanced Class Distribution:")
print(balanced_data['status'].str.upper().value_counts())


Balanced Class Distribution:
status
LOST    34438
WON     34438
Name: count, dtype: int64


In [12]:
from sklearn.model_selection import train_test_split

# Function to split data into training and testing sets
def split_data(df, target):
    """
    Splits the DataFrame into features (X) and target (y) and then into training and testing sets.

    Args:
    df (DataFrame): The preprocessed dataset.
    target (str): The target column name.

    Returns:
    X_train, X_test, y_train, y_test: Training and testing datasets.
    """
    X = df.drop([target], axis=1)
    y = df[target]
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    return X_train, X_test, y_train, y_test


# Preprocess the filtered and balanced data
classification_data = preprocess_data(balanced_data)

# Split data
X_train, X_test, y_train, y_test = split_data(classification_data, target='status')

# Train the classification model
classification_model = train_classification_model(X_train, y_train)

# Evaluate the model
y_pred = classification_model.predict(X_test)
print("Classification Report:")
print(classification_report(y_test, y_pred))


Classification Report:
              precision    recall  f1-score   support

           0       0.91      0.95      0.93      6949
           1       0.95      0.90      0.92      6827

    accuracy                           0.93     13776
   macro avg       0.93      0.93      0.93     13776
weighted avg       0.93      0.93      0.93     13776



In [13]:
print("Classification Data Info:")
print(classification_data.info())
print("Target Value Counts:")
print(classification_data['status'].value_counts())


Classification Data Info:
<class 'pandas.core.frame.DataFrame'>
Index: 68876 entries, 58501 to 54030
Data columns (total 10 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   item_date      68875 non-null  float64
 1   quantity tons  68876 non-null  float64
 2   customer       68876 non-null  float64
 3   status         68876 non-null  int32  
 4   thickness      68876 non-null  float64
 5   width          68876 non-null  float64
 6   product_ref    68876 non-null  int64  
 7   delivery date  68876 non-null  float64
 8   selling_price  68875 non-null  float64
 9   new_feature    68876 non-null  float64
dtypes: float64(8), int32(1), int64(1)
memory usage: 5.5 MB
None
Target Value Counts:
status
0    34438
1    34438
Name: count, dtype: int64


In [14]:
# Drop unnecessary columns
features_to_drop = ['item_date', 'customer', 'delivery date', 'product_ref']
classification_data = classification_data.drop(columns=features_to_drop)

print("Updated Classification Data Info:")
print(classification_data.info())



Updated Classification Data Info:
<class 'pandas.core.frame.DataFrame'>
Index: 68876 entries, 58501 to 54030
Data columns (total 6 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   quantity tons  68876 non-null  float64
 1   status         68876 non-null  int32  
 2   thickness      68876 non-null  float64
 3   width          68876 non-null  float64
 4   selling_price  68875 non-null  float64
 5   new_feature    68876 non-null  float64
dtypes: float64(5), int32(1)
memory usage: 3.4 MB
None


In [15]:
# Split data
X_train, X_test, y_train, y_test = split_data(classification_data, target='status')

# Train the classification model
classification_model = train_classification_model(X_train, y_train)

# Evaluate the model
y_pred = classification_model.predict(X_test)
print("Classification Report:")
print(classification_report(y_test, y_pred))


Classification Report:
              precision    recall  f1-score   support

           0       0.69      0.72      0.70      6949
           1       0.70      0.66      0.68      6827

    accuracy                           0.69     13776
   macro avg       0.69      0.69      0.69     13776
weighted avg       0.69      0.69      0.69     13776



In [16]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)


In [17]:
classification_data['thickness_width_ratio'] = classification_data['thickness'] / classification_data['width']
classification_data['quantity_squared'] = classification_data['quantity tons'] ** 2


In [18]:
from sklearn.model_selection import GridSearchCV

param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

grid_search = GridSearchCV(RandomForestClassifier(random_state=42), param_grid, cv=5, scoring='f1', verbose=2, n_jobs=-1)
grid_search.fit(X_train, y_train)

best_model = grid_search.best_estimator_
print("Best Hyperparameters:", grid_search.best_params_)


Fitting 5 folds for each of 108 candidates, totalling 540 fits
Best Hyperparameters: {'max_depth': 30, 'min_samples_leaf': 1, 'min_samples_split': 10, 'n_estimators': 200}


In [20]:
from xgboost import XGBClassifier

# Train XGBoost model
xgb_model = XGBClassifier(random_state=42)
xgb_model.fit(X_train, y_train)

# Evaluate the model
y_pred = xgb_model.predict(X_test)
print("Classification Report:")
print(classification_report(y_test, y_pred))


Classification Report:
              precision    recall  f1-score   support

           0       0.68      0.75      0.71      6949
           1       0.72      0.64      0.68      6827

    accuracy                           0.70     13776
   macro avg       0.70      0.70      0.69     13776
weighted avg       0.70      0.70      0.69     13776



In [21]:
from sklearn.neural_network import MLPClassifier
mlp_model = MLPClassifier(random_state=42)
mlp_model.fit(X_train, y_train)
y_pred = mlp_model.predict(X_test)
print(classification_report(y_test, y_pred))


ValueError: Input X contains NaN.
MLPClassifier does not accept missing values encoded as NaN natively. For supervised learning, you might want to consider sklearn.ensemble.HistGradientBoostingClassifier and Regressor which accept missing values encoded as NaNs natively. Alternatively, it is possible to preprocess the data, for instance by using an imputer transformer in a pipeline or drop samples with missing values. See https://scikit-learn.org/stable/modules/impute.html You can find a list of all estimators that handle NaN values at the following page: https://scikit-learn.org/stable/modules/impute.html#estimators-that-handle-nan-values

In [22]:
print("Missing values in X_train:")
print(X_train.isnull().sum())


Missing values in X_train:
quantity tons    0
thickness        0
width            0
selling_price    1
new_feature      0
dtype: int64


In [23]:
from sklearn.impute import SimpleImputer

# Impute missing values in X_train and X_test
imputer = SimpleImputer(strategy="mean")  # Use "median" if more robust imputation is needed
X_train_imputed = imputer.fit_transform(X_train)
X_test_imputed = imputer.transform(X_test)

# Convert back to DataFrame for compatibility
X_train_imputed = pd.DataFrame(X_train_imputed, columns=X_train.columns, index=X_train.index)
X_test_imputed = pd.DataFrame(X_test_imputed, columns=X_test.columns, index=X_test.index)


In [24]:
print("Missing values after imputation in X_train:")
print(X_train_imputed.isnull().sum())


Missing values after imputation in X_train:
quantity tons    0
thickness        0
width            0
selling_price    0
new_feature      0
dtype: int64


In [25]:
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import classification_report

# Train the neural network
mlp_model = MLPClassifier(random_state=42, max_iter=500)
mlp_model.fit(X_train_imputed, y_train)

# Predict and evaluate
y_pred = mlp_model.predict(X_test_imputed)
print("Classification Report:")
print(classification_report(y_test, y_pred))


Classification Report:
              precision    recall  f1-score   support

           0       0.69      0.01      0.02      6949
           1       0.50      1.00      0.66      6827

    accuracy                           0.50     13776
   macro avg       0.59      0.50      0.34     13776
weighted avg       0.59      0.50      0.34     13776

