In [None]:
import pandas as pd
import os

# Define file paths
file_path = r'D:\Final_Projects\Copper_Set.csv.csv'
save_path = r'D:\Final_Projects'
output_file = os.path.join(save_path, 'Cleaned_Copper_modelling_project.csv')


def _as_yyyymmdd_text(value):
    """Render a numeric date such as 20210401.0 as the text '20210401'.

    Returns None for missing or infinite values so that a later
    pd.to_datetime(..., errors='coerce') turns them into NaT.
    """
    if pd.isna(value) or value in (float('inf'), float('-inf')):
        return None
    return f"{int(value):08d}"


# Load the raw dataset. low_memory=False makes pandas infer each column's dtype
# from the whole file, which avoids mixed-type DtypeWarnings.
# A missing input is fatal for every later step, so the error is reported and
# re-raised; calling exit() inside a notebook would end the session instead.
try:
    df = pd.read_csv(file_path, low_memory=False)
    print("Dataset loaded successfully.")
except FileNotFoundError:
    print(f"Error: The file at {file_path} was not found.")
    raise

# Normalize the column names before anything looks a column up by name, so
# headers with stray whitespace or capitals are still matched below.
df.columns = df.columns.str.strip().str.lower()

# Step 1: Initial Inspection
# DataFrame.info() prints its report itself and returns None, so it must not
# be passed to print().
print("Dataset Information:")
df.info()
print("\nFirst 5 rows:\n", df.head())
print("\nSummary Statistics:\n", df.describe())

# Check unique values in the 'status' column
if 'status' in df.columns:
    unique_status_values = df['status'].unique()
    print("\nUnique values in 'status' column:", unique_status_values)

# Step 2: Handle Non-Numeric Values in Numeric Columns
# Anything that cannot be parsed as a number becomes NaN.
numeric_columns = ['item_date', 'quantity tons', 'customer', 'country', 'application',
                   'thickness', 'width', 'delivery date', 'selling_price']
for col in numeric_columns:
    if col in df.columns:
        df[col] = pd.to_numeric(df[col], errors='coerce')

# Step 3: Handle Missing Values
missing_values = df.isnull().sum()
print("\nMissing values in each column:\n", missing_values)

# Drop columns with a high percentage of missing values
threshold = 0.5  # Adjust threshold as needed
df = df.dropna(axis=1, thresh=int((1 - threshold) * len(df)))

# Impute remaining missing values based on column type: median for numeric
# columns, most frequent value for text columns. The result is assigned back
# to the frame because in-place fillna on a selected column is not guaranteed
# to modify the frame.
for col in df.columns:
    if df[col].isnull().any():  # Only act on columns with missing values
        if df[col].dtype in ['float64', 'int64']:
            df[col] = df[col].fillna(df[col].median())
        elif df[col].dtype == 'object':
            df[col] = df[col].fillna(df[col].mode()[0])

# Step 4: Remove Duplicates
initial_row_count = df.shape[0]
df = df.drop_duplicates()
print(f"\nDuplicates removed: {initial_row_count - df.shape[0]} rows")

# Step 5: Convert 'item_date' column to datetime format
# The column was coerced to numbers in Step 2, so it is turned back into
# zero-padded digits before parsing. The previous '%d%m%y' format could not
# match those values and produced NaT for every row.
# NOTE(review): assumes the raw values are YYYYMMDD numbers (e.g. 20210401) -
# confirm against the source file. Unparseable values still become NaT.
if 'item_date' in df.columns:
    item_date_text = df['item_date'].map(_as_yyyymmdd_text)
    df['item_date'] = pd.to_datetime(item_date_text, format='%Y%m%d', errors='coerce')
    print("\nConverted 'item_date' column to datetime format.")

# Step 6: Detect and Handle Negative Values in 'selling_price'
# Negative prices are treated as missing and replaced by the median of the
# remaining prices.
if 'selling_price' in df.columns:
    negative_price_mask = df['selling_price'] < 0
    if negative_price_mask.any():
        print(f"\nNegative values detected in 'selling_price': {int(negative_price_mask.sum())} rows")
    df.loc[negative_price_mask, 'selling_price'] = float('nan')
    df['selling_price'] = df['selling_price'].fillna(df['selling_price'].median())

# Step 7: Outlier Detection and Treatment
# Every float64/int64 column is clipped to [Q1 - 1.5*IQR, Q3 + 1.5*IQR].
# NOTE(review): this also clips code-like columns such as 'customer',
# 'country' and 'application' - confirm that is intended.
for col in df.select_dtypes(include=['float64', 'int64']).columns:
    Q1 = df[col].quantile(0.25)
    Q3 = df[col].quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR

    # Identify outliers
    outlier_mask = (df[col] < lower_bound) | (df[col] > upper_bound)
    if outlier_mask.any():
        print(f"\nOutliers detected in column '{col}': {int(outlier_mask.sum())} rows")

    df[col] = df[col].clip(lower_bound, upper_bound)

# Step 8: Save the Cleaned Dataset
try:
    df.to_csv(output_file, index=False)
    print(f"\nCleaned dataset saved to {output_file}")
except Exception as e:
    print(f"Error saving the file: {e}")


In [None]:
import pandas as pd

# Load the cleaned dataset
file_path = r'D:/Final_Projects/Cleaned_Copper_modelling_project.csv'
df = pd.read_csv(file_path)

# Inspect the first few rows of the dataset
print("First 5 rows of the dataset:")
print(df.head())

# Inspect the column names
print("\nColumn names in the dataset:")
print(df.columns)

# Get basic information about the dataset (data types, non-null counts).
# DataFrame.info() prints its report itself and returns None, so wrapping it
# in print() would add a stray "None" line.
print("\nDataset Information:")
df.info()

# Get summary statistics of the numerical columns
print("\nSummary statistics:")
print(df.describe())

# Check for any missing values in each column
print("\nMissing values in each column:")
print(df.isnull().sum())

# Check unique values in categorical columns (like 'status').
# Guarded so a dataset without that column does not abort the inspection.
print("\nUnique values in 'status' column:")
if 'status' in df.columns:
    print(df['status'].unique())
else:
    print("'status' column is not present in the dataset.")


In [None]:
import pandas as pd

# Load the dataset
file_path = r'D:/Final_Projects/Cleaned_Copper_modelling_project.csv'
df = pd.read_csv(file_path)

# Drop the 'item_date' column; it is not used as a model feature.
# errors='ignore' keeps the cell re-runnable if the column is already gone.
df = df.drop(columns=['item_date'], errors='ignore')

# Handle negative values in 'quantity tons' by removing rows with negative quantities
# (rows where the quantity is missing fail the comparison and are removed too).
df = df[df['quantity tons'] >= 0]

# Encode categorical columns 'status' and 'item type' using one-hot encoding.
# dtype=int forces 0/1 indicator columns. Without it, newer pandas versions
# emit True/False, which changes what is written to the CSV and what the
# modelling cells read back.
df = pd.get_dummies(df, columns=['status', 'item type'], drop_first=True, dtype=int)

# Check if there are any other missing values
missing_values = df.isnull().sum()
print("Missing values after cleaning:")
print(missing_values)

# Save the cleaned dataset to a new CSV file
new_file_path = r'D:/Final_Projects/New_Cleaned_Copper_modelling_project.csv'
df.to_csv(new_file_path, index=False)

# Check the cleaned dataset
print("\nFirst 5 rows of the cleaned dataset:")
print(df.head())



In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score, accuracy_score, confusion_matrix
import joblib
import logging

# Set up logging to both console and file
log_file = 'script_logs.log'
formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')

# Start from a clean root logger. Handlers left over from an earlier run of
# this (or another) cell would otherwise stay attached, and every message
# would be written once per leftover handler.
root_logger = logging.getLogger()
root_logger.setLevel(logging.INFO)
for stale_handler in list(root_logger.handlers):
    root_logger.removeHandler(stale_handler)
    stale_handler.close()

# Create a file handler for logging to a file
file_handler = logging.FileHandler(log_file)
file_handler.setLevel(logging.INFO)
file_handler.setFormatter(formatter)
root_logger.addHandler(file_handler)

# Create a console handler for logging to the console
console_handler = logging.StreamHandler()
console_handler.setLevel(logging.INFO)
console_handler.setFormatter(formatter)
root_logger.addHandler(console_handler)

# Log the start of the script
logging.info("Starting data preprocessing and model training script.")

# Load the cleaned dataset
file_path = r'D:/Final_Projects/New_Cleaned_Copper_modelling_project.csv'
logging.info(f"Loading dataset from {file_path}...")
df = pd.read_csv(file_path)
logging.info("Dataset loaded successfully.")

# Check for the columns in the dataset
logging.info(f"Columns in dataset: {df.columns}\n")

# Check for skewness. Only numeric columns are considered; calling skew() on
# a frame that still holds text columns raises on recent pandas versions.
logging.info(f"Skewness in numerical features:\n{df.select_dtypes(include='number').skew()}\n")

# Visualizing skewness with histograms
logging.info("Visualizing skewness in numerical features...")
df.hist(bins=50, figsize=(20, 15))
plt.show()

# Checking for outliers using boxplots
logging.info("Checking for outliers in numerical features...")
numerical_features = df.select_dtypes(include=['float64', 'int64']).columns
for feature in numerical_features:
    plt.figure(figsize=(8, 4))
    sns.boxplot(x=df[feature])
    plt.title(f"Boxplot of {feature}")
    plt.show()

# 2. Data Transformation and Cleaning
# Check for missing values
logging.info(f"Missing values:\n{df.isnull().sum()}\n")

# Fill missing values for numerical columns only
df[numerical_features] = df[numerical_features].fillna(df[numerical_features].mean())

# Separate features and target variables
logging.info("Separating features and target variables.")
excluded_columns = ['selling_price', 'id', 'material_ref', 'product_ref']
if 'status_Won' in df.columns:
    # 'status_Won' is the classification target, so it must not stay in the
    # feature matrix: a classifier given its own label as an input reports a
    # meaningless, near-perfect score. The remaining status / item type
    # indicator columns are excluded as before.
    excluded_columns += ['status_Won', 'status_Lost', 'status_Not lost for AM',
                         'status_Offerable', 'status_Offered', 'status_Revised', 'status_To be approved',
                         'status_Wonderful', 'item type_Others', 'item type_PL', 'item type_S', 'item type_SLAWR',
                         'item type_W', 'item type_WI']

    # For classification: status_Won as target (1 for 'Won', 0 for others).
    # astype(int) handles both 0/1 and True/False encodings of the column.
    y_classification = df['status_Won'].astype(int)
else:
    # Handle the case when 'status_Won' is missing (for regression only)
    logging.warning("status_Won column not found. Proceeding with regression only.")
    y_classification = None  # Set to None as classification is not possible

# errors='ignore' tolerates indicator columns that are absent from this file
# (for example a status value that never occurs in the data).
X = df.drop(columns=excluded_columns, errors='ignore')

# For regression: Selling_Price as target
y_regression = df['selling_price']

# One-hot encode categorical variables if necessary
logging.info("One-hot encoding categorical variables...")
X = pd.get_dummies(X, drop_first=True)

# Handle skewness for regression target variable
# Apply log transformation if 'selling_price' is skewed. When this branch
# runs, the regression metrics below and the saved regressor are in
# log1p(price) units; np.expm1 maps predictions back to prices.
if df['selling_price'].skew() > 0.5:  # Adjust threshold as needed
    logging.info("Selling Price is skewed. Applying log transformation.")
    y_regression = np.log1p(y_regression)  # Log transformation

# Split the dataset into train and test sets. Features and both targets are
# split in a single call so their rows are guaranteed to stay aligned.
logging.info("Splitting the dataset into train and test sets.")
if y_classification is not None:
    (X_train, X_test,
     y_train_regression, y_test_regression,
     y_train_classification, y_test_classification) = train_test_split(
        X, y_regression, y_classification, test_size=0.2, random_state=42)
else:
    X_train, X_test, y_train_regression, y_test_regression = train_test_split(
        X, y_regression, test_size=0.2, random_state=42)
    y_train_classification = y_test_classification = None

# Feature Scaling (Standardization); the scaler is fitted on the training rows only
logging.info("Applying feature scaling (standardization)...")
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# 3. Machine Learning Models

## Regression Model - RandomForestRegressor
logging.info("Training RandomForestRegressor for regression task.")
rf_regressor = RandomForestRegressor(random_state=42)
rf_regressor.fit(X_train_scaled, y_train_regression)

# Predict and evaluate on the test set
logging.info("Making predictions and evaluating the RandomForestRegressor model.")
y_pred_regression = rf_regressor.predict(X_test_scaled)
mae = mean_absolute_error(y_test_regression, y_pred_regression)
mse = mean_squared_error(y_test_regression, y_pred_regression)
rmse = np.sqrt(mse)
r2 = r2_score(y_test_regression, y_pred_regression)

logging.info("Regression Model (RandomForestRegressor) Performance:")
logging.info(f"MAE: {mae}")
logging.info(f"MSE: {mse}")
logging.info(f"RMSE: {rmse}")
logging.info(f"R²: {r2}")

# Hyperparameter tuning for the regression model
# 'auto' is no longer an accepted max_features value in current scikit-learn;
# for a regressor it meant "use all features", which 1.0 expresses explicitly.
logging.info("Performing RandomizedSearchCV for regression model.")
param_dist_regression = {
    'n_estimators': [100, 200, 300, 400],
    'max_features': [1.0, 'sqrt', 'log2'],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'bootstrap': [True, False]
}

random_search_regression = RandomizedSearchCV(estimator=rf_regressor, param_distributions=param_dist_regression, n_iter=100, cv=3, verbose=2, random_state=42, n_jobs=-1)
random_search_regression.fit(X_train_scaled, y_train_regression)

logging.info(f"Best Parameters for Regression: {random_search_regression.best_params_}")
best_rf_regressor = random_search_regression.best_estimator_

# Score the tuned model on the held-out rows so it can be compared with the baseline above
tuned_r2 = r2_score(y_test_regression, best_rf_regressor.predict(X_test_scaled))
logging.info(f"Tuned RandomForestRegressor R² on the test set: {tuned_r2}")

# 4. Classification Model - RandomForestClassifier (only if 'status_Won' exists)
if y_classification is not None:
    logging.info("Training RandomForestClassifier for classification task.")
    # Initialize and train the RandomForestClassifier
    rf_classifier = RandomForestClassifier(random_state=42)
    rf_classifier.fit(X_train_scaled, y_train_classification)

    # Predict and evaluate on the test set
    logging.info("Making predictions and evaluating the RandomForestClassifier model.")
    y_pred_classification = rf_classifier.predict(X_test_scaled)
    accuracy = accuracy_score(y_test_classification, y_pred_classification)
    conf_matrix = confusion_matrix(y_test_classification, y_pred_classification)

    logging.info("Classification Model (RandomForestClassifier) Performance:")
    logging.info(f"Accuracy: {accuracy}")
    logging.info(f"Confusion Matrix:\n{conf_matrix}")

    # Hyperparameter tuning for the classification model
    # For a classifier 'auto' was a synonym of 'sqrt', so it is simply omitted.
    logging.info("Performing RandomizedSearchCV for classification model.")
    param_dist_classification = {
        'n_estimators': [100, 200, 300, 400],
        'max_features': ['sqrt', 'log2'],
        'max_depth': [None, 10, 20, 30],
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 2, 4],
        'bootstrap': [True, False]
    }

    random_search_classification = RandomizedSearchCV(estimator=rf_classifier, param_distributions=param_dist_classification, n_iter=100, cv=3, verbose=2, random_state=42, n_jobs=-1)
    random_search_classification.fit(X_train_scaled, y_train_classification)

    logging.info(f"Best Parameters for Classification: {random_search_classification.best_params_}")
    best_rf_classifier = random_search_classification.best_estimator_

    # Score the tuned classifier on the held-out rows as well
    tuned_accuracy = accuracy_score(y_test_classification, best_rf_classifier.predict(X_test_scaled))
    logging.info(f"Tuned RandomForestClassifier accuracy on the test set: {tuned_accuracy}")

# 5. Saving the Best Models and Scaler
model_save_path_regression = 'D:/Final_Projects/random_forest_best_regressor.joblib'
model_save_path_classification = 'D:/Final_Projects/random_forest_best_classifier.joblib'
scaler_save_path = 'D:/Final_Projects/scaler.joblib'

logging.info(f"Saving the best regression model to {model_save_path_regression}.")
joblib.dump(best_rf_regressor, model_save_path_regression)

if y_classification is not None:
    logging.info(f"Saving the best classification model to {model_save_path_classification}.")
    joblib.dump(best_rf_classifier, model_save_path_classification)

logging.info(f"Saving the scaler to {scaler_save_path}.")
joblib.dump(scaler, scaler_save_path)

logging.info("Model training and saving completed.")


In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.tree import DecisionTreeRegressor, DecisionTreeClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score, accuracy_score, confusion_matrix
import joblib
import logging

# Set up logging to both console and file
log_file = 'script_logs.log'
formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')

# Start from a clean root logger. Handlers left over from an earlier run of
# this (or another) cell would otherwise stay attached, and every message
# would be written once per leftover handler.
root_logger = logging.getLogger()
root_logger.setLevel(logging.INFO)
for stale_handler in list(root_logger.handlers):
    root_logger.removeHandler(stale_handler)
    stale_handler.close()

# Create a file handler for logging to a file
file_handler = logging.FileHandler(log_file)
file_handler.setLevel(logging.INFO)
file_handler.setFormatter(formatter)
root_logger.addHandler(file_handler)

# Create a console handler for logging to the console
console_handler = logging.StreamHandler()
console_handler.setLevel(logging.INFO)
console_handler.setFormatter(formatter)
root_logger.addHandler(console_handler)

# Log the start of the script
logging.info("Starting data preprocessing and model training script.")

# Load the cleaned dataset
file_path = r'D:/Final_Projects/New_Cleaned_Copper_modelling_project.csv'
logging.info(f"Loading dataset from {file_path}...")
df = pd.read_csv(file_path)
logging.info("Dataset loaded successfully.")

# Check for the columns in the dataset
logging.info(f"Columns in dataset: {df.columns}\n")

# Format 'delivery_date' to DD/MM/YY if exists
# NOTE(review): the cleaning cell refers to this column as 'delivery date'
# (with a space). If that is the real header this branch never runs. If it is
# ever enabled, the column becomes text and is one-hot encoded further down -
# confirm that is the intent before renaming anything here.
if 'delivery_date' in df.columns:
    df['delivery_date'] = pd.to_datetime(df['delivery_date'], errors='coerce').dt.strftime('%d/%m/%y')
    logging.info("Delivery date formatted to DD/MM/YY.")

# Check for skewness. Only numeric columns are considered; calling skew() on
# a frame that still holds text columns raises on recent pandas versions.
logging.info(f"Skewness in numerical features:\n{df.select_dtypes(include='number').skew()}\n")

# Visualizing skewness with histograms
logging.info("Visualizing skewness in numerical features...")
df.hist(bins=50, figsize=(20, 15))
plt.show()

# Checking for outliers using boxplots
logging.info("Checking for outliers in numerical features...")
numerical_features = df.select_dtypes(include=['float64', 'int64']).columns
for feature in numerical_features:
    plt.figure(figsize=(8, 4))
    sns.boxplot(x=df[feature])
    plt.title(f"Boxplot of {feature}")
    plt.show()

# 2. Data Transformation and Cleaning
# Remove the 'id' column as requested
logging.info("Removing 'id' column from dataset.")
df = df.drop(columns=['id'], errors='ignore')

# The numeric column list is rebuilt after the drop so it cannot refer to a
# column that no longer exists.
numerical_features = df.select_dtypes(include=['float64', 'int64']).columns

# Check for missing values
logging.info(f"Missing values:\n{df.isnull().sum()}\n")

# Fill missing values for numerical columns only
df[numerical_features] = df[numerical_features].fillna(df[numerical_features].mean())

# Separate features and target variables
logging.info("Separating features and target variables.")
excluded_columns = ['selling_price', 'material_ref', 'product_ref']
if 'status_Won' in df.columns:
    # 'status_Won' is the classification target, so it must not stay in the
    # feature matrix: a classifier given its own label as an input reports a
    # meaningless, near-perfect score. The remaining status / item type
    # indicator columns are excluded as before.
    excluded_columns += ['status_Won', 'status_Lost', 'status_Not lost for AM',
                         'status_Offerable', 'status_Offered', 'status_Revised', 'status_To be approved',
                         'status_Wonderful', 'item type_Others', 'item type_PL', 'item type_S', 'item type_SLAWR',
                         'item type_W', 'item type_WI']

    # For classification: status_Won as target (1 for 'Won', 0 for others).
    # astype(int) handles both 0/1 and True/False encodings of the column.
    y_classification = df['status_Won'].astype(int)
else:
    # Handle the case when 'status_Won' is missing (for regression only)
    logging.warning("status_Won column not found. Proceeding with regression only.")
    y_classification = None  # Set to None as classification is not possible

# errors='ignore' tolerates indicator columns that are absent from this file
# (for example a status value that never occurs in the data).
X = df.drop(columns=excluded_columns, errors='ignore')

# For regression: Selling_Price as target
y_regression = df['selling_price']

# One-hot encode categorical variables if necessary
logging.info("One-hot encoding categorical variables...")
X = pd.get_dummies(X, drop_first=True)

# Handle skewness for regression target variable
# Apply log transformation if 'selling_price' is skewed. When this branch
# runs, the regression metrics below and the saved regressor are in
# log1p(price) units; np.expm1 maps predictions back to prices.
if df['selling_price'].skew() > 0.5:  # Adjust threshold as needed
    logging.info("Selling Price is skewed. Applying log transformation.")
    y_regression = np.log1p(y_regression)  # Log transformation

# Split the dataset into train and test sets. Features and both targets are
# split in a single call so their rows are guaranteed to stay aligned.
logging.info("Splitting the dataset into train and test sets.")
if y_classification is not None:
    (X_train, X_test,
     y_train_regression, y_test_regression,
     y_train_classification, y_test_classification) = train_test_split(
        X, y_regression, y_classification, test_size=0.2, random_state=42)
else:
    X_train, X_test, y_train_regression, y_test_regression = train_test_split(
        X, y_regression, test_size=0.2, random_state=42)
    y_train_classification = y_test_classification = None

# Feature Scaling (Standardization); the scaler is fitted on the training rows only
logging.info("Applying feature scaling (standardization)...")
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# 3. Machine Learning Models

## Regression Model - RandomForestRegressor
logging.info("Training RandomForestRegressor for regression task.")
rf_regressor = RandomForestRegressor(random_state=42)
rf_regressor.fit(X_train_scaled, y_train_regression)

# Predict and evaluate on the test set
logging.info("Making predictions and evaluating the RandomForestRegressor model.")
y_pred_regression = rf_regressor.predict(X_test_scaled)
mae = mean_absolute_error(y_test_regression, y_pred_regression)
mse = mean_squared_error(y_test_regression, y_pred_regression)
rmse = np.sqrt(mse)
r2 = r2_score(y_test_regression, y_pred_regression)

logging.info("Regression Model (RandomForestRegressor) Performance:")
logging.info(f"MAE: {mae}")
logging.info(f"MSE: {mse}")
logging.info(f"RMSE: {rmse}")
logging.info(f"R²: {r2}")

# DecisionTreeRegressor
logging.info("Training DecisionTreeRegressor for regression task.")
dt_regressor = DecisionTreeRegressor(random_state=42)
dt_regressor.fit(X_train_scaled, y_train_regression)

# Predict and evaluate on the test set
y_pred_dt_regression = dt_regressor.predict(X_test_scaled)
mae_dt = mean_absolute_error(y_test_regression, y_pred_dt_regression)
mse_dt = mean_squared_error(y_test_regression, y_pred_dt_regression)
rmse_dt = np.sqrt(mse_dt)
r2_dt = r2_score(y_test_regression, y_pred_dt_regression)

logging.info("Regression Model (DecisionTreeRegressor) Performance:")
logging.info(f"MAE: {mae_dt}")
logging.info(f"MSE: {mse_dt}")
logging.info(f"RMSE: {rmse_dt}")
logging.info(f"R²: {r2_dt}")

# Hyperparameter tuning for the regression model
# 'auto' is no longer an accepted max_features value in current scikit-learn;
# for a regressor it meant "use all features", which 1.0 expresses explicitly.
logging.info("Performing RandomizedSearchCV for regression model.")
param_dist_regression = {
    'n_estimators': [100, 200, 300, 400],
    'max_features': [1.0, 'sqrt', 'log2'],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'bootstrap': [True, False]
}

random_search_regression = RandomizedSearchCV(estimator=rf_regressor, param_distributions=param_dist_regression, n_iter=100, cv=3, verbose=2, random_state=42, n_jobs=-1)
random_search_regression.fit(X_train_scaled, y_train_regression)

logging.info(f"Best Parameters for Regression: {random_search_regression.best_params_}")
best_rf_regressor = random_search_regression.best_estimator_

# Score the tuned model on the held-out rows so it can be compared with the baselines above
tuned_r2 = r2_score(y_test_regression, best_rf_regressor.predict(X_test_scaled))
logging.info(f"Tuned RandomForestRegressor R² on the test set: {tuned_r2}")

# 4. Classification Model - RandomForestClassifier (only if 'status_Won' exists)
if y_classification is not None:
    logging.info("Training RandomForestClassifier for classification task.")
    rf_classifier = RandomForestClassifier(random_state=42)
    rf_classifier.fit(X_train_scaled, y_train_classification)

    # Predict and evaluate on the test set
    logging.info("Making predictions and evaluating the RandomForestClassifier model.")
    y_pred_classification = rf_classifier.predict(X_test_scaled)
    accuracy = accuracy_score(y_test_classification, y_pred_classification)
    cm = confusion_matrix(y_test_classification, y_pred_classification)

    logging.info("Classification Model (RandomForestClassifier) Performance:")
    logging.info(f"Accuracy: {accuracy}")
    logging.info(f"Confusion Matrix: \n{cm}")

    # Hyperparameter tuning for the classification model
    # For a classifier 'auto' was a synonym of 'sqrt', so it is simply omitted.
    logging.info("Performing RandomizedSearchCV for classification model.")
    param_dist_classification = {
        'n_estimators': [100, 200, 300, 400],
        'max_features': ['sqrt', 'log2'],
        'max_depth': [None, 10, 20, 30],
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 2, 4],
        'bootstrap': [True, False]
    }

    random_search_classification = RandomizedSearchCV(estimator=rf_classifier, param_distributions=param_dist_classification, n_iter=100, cv=3, verbose=2, random_state=42, n_jobs=-1)
    random_search_classification.fit(X_train_scaled, y_train_classification)

    logging.info(f"Best Parameters for Classification: {random_search_classification.best_params_}")
    best_rf_classifier = random_search_classification.best_estimator_

    # Score the tuned classifier on the held-out rows as well
    tuned_accuracy = accuracy_score(y_test_classification, best_rf_classifier.predict(X_test_scaled))
    logging.info(f"Tuned RandomForestClassifier accuracy on the test set: {tuned_accuracy}")

# Save models and scaler
logging.info("Saving models and scaler.")
joblib.dump(best_rf_regressor, 'best_rf_regressor.pkl')
# The classifier only exists when the classification branch ran; dumping it
# unconditionally would raise NameError in the regression-only case.
if y_classification is not None:
    joblib.dump(best_rf_classifier, 'best_rf_classifier.pkl')
joblib.dump(scaler, 'scaler.pkl')

logging.info("Script execution completed.")
