In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import ExtraTreesClassifier, RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from xgboost import XGBClassifier
from sklearn.metrics import confusion_matrix, classification_report, roc_curve, auc
from sklearn.preprocessing import StandardScaler, OneHotEncoder,LabelBinarizer
import pickle
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Read the raw copper workbook.
# NOTE(review): the absolute Windows path is machine-specific; the notebook
# only runs where this exact file location exists.
data = pd.read_excel(r'D:\datascience\Copper_project\Copper_Set.xlsx')

# Wrap the loaded data in a DataFrame under the name used by the rest of the notebook.
df = pd.DataFrame(data)
# Bare last expression: the notebook renders the frame as the cell output.
df

In [None]:
# Count the missing values in each column of the raw frame.
df.isnull().sum()

In [None]:
# Count the distinct values in each column.
df.nunique()

In [None]:
# Show the dtype pandas assigned to each column on load.
df.dtypes

In [None]:
# Parse the yyyymmdd-encoded dates into new columns; values that do not match
# the format become NaT (errors='coerce') and only the calendar date is kept.
df['item_date1'] = pd.to_datetime(df['item_date'], format='%Y%m%d', errors='coerce').dt.date
# Force the tonnage to a numeric dtype; non-numeric entries become NaN.
df['quantity tons'] = pd.to_numeric(df['quantity tons'], errors='coerce')
df['delivery date1'] = pd.to_datetime(df['delivery date'], format='%Y%m%d', errors='coerce').dt.date
# Strip leading zeros from the reference and label missing ones 'unknown'.
# The result is assigned back to the column instead of calling
# fillna(inplace=True) on the selection df['material_ref']: that form is
# chained assignment, which warns on recent pandas and leaves df untouched
# once Copy-on-Write is active.
df['material_ref'] = df['material_ref'].str.lstrip('0').fillna('unknown')

In [None]:
# Summary statistics of the numeric columns, transposed so each column is a row.
df.describe().T

In [None]:
# Quantities and prices must be strictly positive: both columns are
# log-transformed further down, and np.log(0) is -inf while np.log of a
# negative number is NaN. Zero is therefore treated as invalid along with
# negative values. np.nan is the native missing marker for float columns, so
# the rows are removed by the later dropna().
df.loc[df['quantity tons'] <= 0, 'quantity tons'] = np.nan
df.loc[df['selling_price'] <= 0, 'selling_price'] = np.nan

In [None]:
# Re-count missing values per column after the invalid entries were blanked.
df.isnull().sum()

In [None]:
# Fill missing country / application codes with the most frequent value.
# Assign the filled Series back to the column: calling fillna(inplace=True) on
# the attribute selection (df.country) is chained assignment, which warns on
# recent pandas and does not modify df once Copy-on-Write is active.
df['country'] = df['country'].fillna(df['country'].mode()[0])
df['application'] = df['application'].fillna(df['application'].mode()[0])

In [None]:
# Work on a copy so the imputed frame `df` stays available unchanged.
df1 = df.copy()

In [None]:
# Drop every row that still has a missing value in any column.
df1 = df1.dropna()

In [None]:
# Confirm that no missing values remain after dropna().
df1.isnull().sum()

In [None]:
# Display the NaN-free frame as the cell output.
df1

In [None]:
# Summary statistics of the numeric columns after cleaning, one row per column.
df1.describe().T

In [None]:
def plot(df, i):
    """Show the distribution of column ``i`` of ``df``.

    Numeric columns get a three-panel figure: box plot, histogram with a KDE
    overlay (50 bins) and violin plot. Non-numeric columns get a single count
    plot instead, because box and violin plots require numeric data and the
    notebook also calls this function with string-valued columns such as
    ``'item type'`` and ``'status'``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the column to plot.
    i : str
        Name of the column in ``df``.

    Returns
    -------
    None
        The figure is rendered with ``plt.show()``.
    """
    values = df[i]

    if not pd.api.types.is_numeric_dtype(values):
        # Categorical column: show how often each category occurs.
        fig, ax = plt.subplots(figsize=(20, 5))
        sns.countplot(x=values, ax=ax)
        ax.set_title(f'Count Plot for {i}')
        plt.show()
        return

    fig, (box_ax, hist_ax, violin_ax) = plt.subplots(1, 3, figsize=(20, 5))

    # The Series is passed positionally, exactly as before, so the plot
    # orientation chosen by the installed seaborn version is unchanged.
    sns.boxplot(values, ax=box_ax)
    box_ax.set_title(f'Box Plot for {i}')

    sns.histplot(values, kde=True, bins=50, ax=hist_ax)
    hist_ax.set_title(f'Distribution Plot for {i}')

    sns.violinplot(values, ax=violin_ax)
    violin_ax.set_title(f'Violin Plot for {i}')

    plt.show()

In [None]:
# Names of the numeric columns, taken from the dtypes of `df`.
numeric_columns = df.select_dtypes(include=['number']).columns

# Draw one figure per numeric column, using the NaN-free copy `df1`.
for column_name in numeric_columns:
    plot(df1, column_name)

In [None]:
# Natural-log versions of three columns, stored under new names
# (new column -> source column).
log_sources = {
    'quantity_log': 'quantity tons',
    'selling_price_log': 'selling_price',
    'thickness_log': 'thickness',
}
for log_column, source_column in log_sources.items():
    df1[log_column] = np.log(df1[source_column])

In [None]:
# Columns to inspect after the log transform.
# NOTE(review): 'item type' and 'status' appear to hold strings (they are
# compared with 'W'/'S' and 'Won'/'Lost' elsewhere in this notebook) —
# confirm plot() can handle non-numeric columns.
col = [
    'item type',
    'application',
    'country',
    'width',
    'quantity_log',
    'selling_price_log',
    'thickness_log',
    'status',
    'product_ref',
]

for column_name in col:
    plot(df1, column_name)

In [None]:
# Pairwise correlations between the numeric columns of the cleaned frame.
# Note that `numeric_columns` now holds a DataFrame rather than column names.
numeric_columns = df1.select_dtypes(include=['number'])
correlation_matrix = numeric_columns.corr()

# Annotated heatmap, two decimals per cell, on an explicit 10x8 inch figure.
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt=".2f", ax=ax)
ax.set_title('Correlation Matrix Heatmap')
plt.show()


In [None]:
def iqr_outliers(df, col, whisker=1.5):
    """Clip the outliers of one column to its IQR fences, in place.

    The fences are ``Q1 - whisker * IQR`` and ``Q3 + whisker * IQR``, where
    Q1/Q3 are the 25th/75th percentiles of ``df[col]``. Values outside the
    fences are replaced by the nearest fence; no rows are removed.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to modify. ``df[col]`` is overwritten, so pass a copy if the
        original values are still needed.
    col : str
        Name of the numeric column to clip.
    whisker : float, default 1.5
        Multiple of the IQR used to place the fences. The default reproduces
        the conventional box-plot rule.

    Returns
    -------
    None
    """
    q1 = df[col].quantile(0.25)
    q3 = df[col].quantile(0.75)
    iqr = q3 - q1
    lower_bound = q1 - whisker * iqr
    upper_bound = q3 + whisker * iqr
    df[col] = df[col].clip(lower_bound, upper_bound)

In [None]:
# Columns whose outliers are clipped to their IQR fences.
columns = ['width', 'quantity_log', 'selling_price_log', 'thickness_log']

# Clip on a copy so `df1` keeps the unclipped values.
df2 = df1.copy()
for column_name in columns:
    iqr_outliers(df2, column_name)

In [None]:
# Re-draw the distributions of the clipped columns to check the effect.
for column_name in columns:
    plot(df2, column_name)

In [None]:
# Display the frame with clipped outliers as the cell output.
df2

In [None]:
# Keep only the rows whose status is 'Won' or 'Lost' for the classifier.
decided_mask = df2['status'].isin(['Won', 'Lost'])
dfc = df2[decided_mask]

# Class balance of the two remaining statuses.
dfc['status'].value_counts()

In [None]:
def ml_class(x, y, algorithm, random_state=42):
    """Fit one classifier on an 80/20 split and report its accuracy.

    Parameters
    ----------
    x : array-like
        Feature matrix.
    y : array-like
        Target labels aligned with ``x``.
    algorithm : type
        Estimator class (not an instance). It is instantiated with default
        hyper-parameters.
    random_state : int, default 42
        Seed for the train/test split and, when the estimator's constructor
        accepts a ``random_state`` argument, for the estimator too, so that
        repeated runs report the same scores.

    Returns
    -------
    dict
        ``'algorithm'`` (the class name), ``'accuracy_train'`` and
        ``'accuracy_test'``.
    """
    import inspect  # local import: only needed to probe the constructor

    x_train, x_test, y_train, y_test = train_test_split(
        x, y, test_size=0.2, random_state=random_state)

    # Seed the estimator only if its constructor exposes random_state;
    # callables whose signature cannot be inspected are built unseeded.
    estimator_kwargs = {}
    try:
        if 'random_state' in inspect.signature(algorithm).parameters:
            estimator_kwargs['random_state'] = random_state
    except (TypeError, ValueError):
        pass

    model = algorithm(**estimator_kwargs).fit(x_train, y_train)

    y_pred_train = model.predict(x_train)
    y_pred_test = model.predict(x_test)

    # A large gap between the two scores points at over-fitting.
    accuracy_train = metrics.accuracy_score(y_train, y_pred_train)
    accuracy_test = metrics.accuracy_score(y_test, y_pred_test)

    accuracy_metrics = {'algorithm'    : algorithm.__name__,
                        'accuracy_train': accuracy_train,
                        'accuracy_test' : accuracy_test}

    return accuracy_metrics

In [None]:
# Feature columns that enter the model as plain numbers, declared once so the
# selection and the concatenation below cannot drift apart.
numeric_features = ['quantity_log', 'thickness_log', 'customer', 'country',
                    'application', 'selling_price_log', 'width', 'product_ref']

x1 = dfc[numeric_features + ['item type']]
y1 = dfc['status']

# One-hot encode the item type. fit_transform already fits the encoder, so no
# separate fit() call is needed. Categories unseen at fit time are encoded as
# all zeros (handle_unknown='ignore').
oh = OneHotEncoder(handle_unknown= 'ignore', categories=[dfc['item type'].unique()])
x_enc = oh.fit_transform(x1[['item type']]).toarray()

# Binarize the two-class status into a flat 0/1 vector.
be = LabelBinarizer()
y = be.fit_transform(y1).ravel()

# Numeric features first, one-hot columns last.
x = np.concatenate((x1[numeric_features], x_enc), axis= 1)

# NOTE(review): the scaler is fitted on all rows before any train/test split,
# so test-set statistics leak into the scaling.
scaler = StandardScaler()
x = scaler.fit_transform(x)


In [None]:
# Train/test accuracy of each candidate tree-based classifier, one dict per line.
candidate_classifiers = (
    DecisionTreeClassifier,
    ExtraTreesClassifier,
    RandomForestClassifier,
    AdaBoostClassifier,
    GradientBoostingClassifier,
)
for classifier in candidate_classifiers:
    print(ml_class(x, y, classifier))

In [None]:
# Same 80/20 split as ml_class (fixed seed).
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

# The forest is seeded as well as the split, so the metrics printed below are
# identical on every run.
rfc = RandomForestClassifier(max_depth = 20, max_features = 'sqrt', min_samples_leaf = 1,
                             min_samples_split = 2, random_state = 42)

rfc.fit(x_train, y_train)
y_pred = rfc.predict(x_test)

result = confusion_matrix(y_test, y_pred)
# Binary layout: rows are true classes, columns are predictions -> [[tn, fp], [fn, tp]].
print("Confusion Matrix:")
print(result)
result1 = classification_report(y_test, y_pred)
print("Classification Report:",)
print (result1)

In [None]:
# This cell rebuilds the feature matrix and refits the random forest from
# `dfc`; the encoder, scaler and model it leaves behind are the ones used by
# the prediction and pickling cells that follow.

# Feature columns that enter the model as plain numbers, declared once so the
# selection and the concatenation below cannot drift apart.
numeric_features = ['quantity_log', 'thickness_log', 'customer', 'country',
                    'application', 'selling_price_log', 'width', 'product_ref']

x1 = dfc[numeric_features + ['item type']]
y1 = dfc['status']

# fit_transform already fits the encoder, so no separate fit() call is needed.
oh = OneHotEncoder(handle_unknown= 'ignore', categories=[dfc['item type'].unique()])
x_enc = oh.fit_transform(x1[['item type']]).toarray()

# Binarize the two-class status into a flat 0/1 vector.
be = LabelBinarizer()
y = be.fit_transform(y1).ravel()

# Numeric features first, one-hot columns last.
x = np.concatenate((x1[numeric_features], x_enc), axis= 1)

# NOTE(review): the scaler is fitted on all rows before the train/test split,
# so test-set statistics leak into the scaling.
scaler = StandardScaler()
x = scaler.fit_transform(x)


x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

# Seed the forest as well as the split so the metrics are reproducible.
rfc = RandomForestClassifier(max_depth = 20, max_features = 'sqrt', min_samples_leaf = 1,
                             min_samples_split = 2, random_state = 42)

rfc.fit(x_train, y_train)
y_pred = rfc.predict(x_test)

result = confusion_matrix(y_test, y_pred)
# Binary layout: rows are true classes, columns are predictions -> [[tn, fp], [fn, tp]].
print("Confusion Matrix:")
print(result)
result1 = classification_report(y_test, y_pred)
print("Classification Report:",)
print (result1)

In [None]:
# A ROC curve needs a continuous score per sample. Feeding it the hard 0/1
# predictions collapses the curve to a single operating point and understates
# the AUC, so use the predicted probability of the positive class (label 1).
y_score = rfc.predict_proba(x_test)[:, 1]

# roc_curve returns the false-positive rates, true-positive rates and the
# score thresholds at which they were measured.
FP, TP, threshold = roc_curve(y_test, y_score)
auc_curve = auc(x=FP, y=TP)
print(auc_curve)

In [None]:
# ROC curve of the fitted model against the chance diagonal.
fig, ax = plt.subplots()
ax.plot(FP, TP, label=f"ROC Curve (area={round(auc_curve, 2)}) ")
ax.plot([0, 1], [0, 1], 'k--')  # dashed black line: random classifier
ax.set_xlim([0.0, 1.0])
ax.set_ylim([0.0, 1.10])  # headroom above 1.0 so the curve's top edge stays visible
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
ax.legend(loc='lower right')
plt.show()

In [None]:
# One hand-written observation: eight numeric features in training order,
# followed by the item type code. Mixing numbers with a string makes NumPy
# store every entry as text, hence the explicit float conversion below.
test_data = np.array([[5.45658, 1.15, 30156308, 32, 30, 6.6432, 1200, 628377, 'W']])

# Split into the numeric part (first eight columns) and the categorical part.
test_data_numeric = test_data[:, :8].astype(float)
test_data_categorical = pd.DataFrame(test_data[:, 8:], columns=['item type'])

# Apply the already-fitted encoder and scaler, in the same order as in training.
test_data_oh = oh.transform(test_data_categorical).toarray()
test_data_combined = np.concatenate((test_data_numeric, test_data_oh), axis=1)
test_data_combined_scaled = scaler.transform(test_data_combined)

pred = rfc.predict(test_data_combined_scaled)

# Label 1 is reported as "Won", anything else as "Lost".
print("Won" if pred[0] == 1 else "Lost")

In [None]:
# Persist the fitted model, scaler and encoder so predictions can be made
# without retraining. Each entry is (destination file, object to pickle).
artifacts = (
    (r"D:\datascience\Copper_project\clasification_model.pkl", rfc),
    (r"D:\datascience\Copper_project\scaler.pkl", scaler),
    (r"D:\datascience\Copper_project\encoder.pkl", oh),
)

for artifact_path, artifact in artifacts:
    with open(artifact_path, 'wb') as f:
        pickle.dump(artifact, f)

In [None]:
def _unpickle(path):
    """Return the object stored in the pickle file at ``path``."""
    # Only unpickle files this notebook wrote itself; pickle can execute
    # arbitrary code when loading.
    with open(path, 'rb') as f:
        return pickle.load(f)


# Reload the three artifacts from disk. `scaler` and `oh` replace the
# in-memory objects of the same name.
model = _unpickle(r"D:\datascience\Copper_project\clasification_model.pkl")
scaler = _unpickle(r"D:\datascience\Copper_project\scaler.pkl")
oh = _unpickle(r"D:\datascience\Copper_project\encoder.pkl")

# One hand-written observation: eight numeric features, then the item type code.
test_data = np.array([[5, 2.2, 30223043, 78, 10, 7.13, 1500, 1668701718, 'S']])

# Numeric part as floats, categorical part as a one-column frame.
test_data_numeric = test_data[:, :8].astype(float)
test_data_categorical = pd.DataFrame(test_data[:, 8:], columns=['item type'])

# Encode, concatenate and scale in the same order as in training.
test_data_oh = oh.transform(test_data_categorical).toarray()
test_data_combined = np.concatenate((test_data_numeric, test_data_oh), axis=1)
test_data_combined_scaled = scaler.transform(test_data_combined)

pred = model.predict(test_data_combined_scaled)

# Label 1 is reported as "Won", anything else as "Lost".
print("Won" if pred[0] == 1 else "Lost")


In [None]:
# Export the Won/Lost subset used for classification, without the index column.
dfc.to_csv('D:\\datascience\\Copper_project\\copper_data_status.csv', index=False, header=True, encoding='utf-8')

In [None]:
# Export the full outlier-clipped frame (all statuses), without the index column.
df2.to_csv('D:\\datascience\\Copper_project\\copper_final_data.csv', index=False, header=True, encoding='utf-8')