In [1]:
import pandas as pd

excel_file_path = 'DataSet\Copper_Set.csv'
# List of encodings to try
encodings_to_try = ['utf-8', 'latin1', 'cp1252', 'iso-8859-1']

# Try reading the Excel file with different encodings
for encoding in encodings_to_try:
    try:
        df = pd.read_csv(excel_file_path, encoding=encoding)
        # If reading succeeds, break out of the loop
        break
    except Exception as e:
        print(f"Failed with encoding '{encoding}': {e}")

# Now, 'df' contains the data from your Excel file with the correct encoding.

Failed with encoding 'utf-8': 'utf-8' codec can't decode byte 0xe6 in position 2: invalid continuation byte


  df = pd.read_csv(excel_file_path, encoding=encoding)


In [2]:
df.shape

(181673, 14)

In [3]:
df.isnull().sum()

id                   2
item_date            1
quantity tons        0
customer             1
country             28
status               2
item type            0
application         24
thickness            1
width                0
material_ref     77919
product_ref          0
delivery date        1
selling_price        1
dtype: int64

In [4]:
counts = df['material_ref'].value_counts()
print(counts)

material_ref
DX51D+Z                          922
G9010                            856
DC01 RED.FLAT KUCHNIA EKO 2.2    777
DC01                             725
G9006                            671
                                ... 
DX51D+Z C14S                       1
124197                             1
124198                             1
48065                              1
202006170005.IO.1.1                1
Name: count, Length: 16410, dtype: int64


In [5]:
df['material_ref'] = df['material_ref'].astype(str)

In [6]:
import numpy as np
def replace_starting_with_zeros(value):
    if value.startswith('00000'):
        return np.nan
    else:
        return value

# Apply the function to the DataFrame column
df['material_ref'] = df['material_ref'].apply(replace_starting_with_zeros)
# Now, 'df' contains the updated values with '00000' replaced by NaN
counts = df['material_ref'].value_counts()

print(counts)

material_ref
nan                              77919
DX51D+Z                            922
G9010                              856
DC01 RED.FLAT KUCHNIA EKO 2.2      777
DC01                               725
                                 ...  
DX51D+Z C14S                         1
124197                               1
124198                               1
48065                                1
202006170005.IO.1.1                  1
Name: count, Length: 16404, dtype: int64


In [8]:

column_name = 'quantity tons'
df[column_name] = pd.to_numeric(df[column_name], errors='coerce', downcast='float')


In [9]:
column_names = ['country', 'item_date', 'customer', 'status' , 'application' ,'quantity tons' , 'thickness' , 'delivery date' , 'selling_price' ]
# replacing country col null values with median of the column
for column_name in column_names: 
    df[column_name] = df[column_name].fillna(df[column_name].mode().iloc[0])
df['material_ref'] = df['material_ref'].bfill() # in case 1st col is null it is covered
df['material_ref'] = df['material_ref'].ffill()
df.isnull().sum()


id               2
item_date        0
quantity tons    0
customer         0
country          0
status           0
item type        0
application      0
thickness        0
width            0
material_ref     0
product_ref      0
delivery date    0
selling_price    0
dtype: int64

In [10]:
df

Unnamed: 0,id,item_date,quantity tons,customer,country,status,item type,application,thickness,width,material_ref,product_ref,delivery date,selling_price
0,EC06F063-9DF0-440C-8764-0B0C05A4F6AE,20210401.0,54.151139,30156308.0,28.0,Won,W,10.0,2.00,1500.0,DEQ1 S460MC,1670798778,20210701.0,854.00
1,4E5F4B3D-DDDF-499D-AFDE-A3227EC49425,20210401.0,768.024839,30202938.0,25.0,Won,W,41.0,0.80,1210.0,104991,1668701718,20210401.0,1047.00
2,E140FF1B-2407-4C02-A0DD-780A093B1158,20210401.0,386.127949,30153963.0,30.0,Won,WI,28.0,0.38,952.0,S0380700,628377,20210101.0,644.33
3,F8D507A0-9C62-4EFE-831E-33E1DA53BB50,20210401.0,202.411065,30349574.0,32.0,Won,S,59.0,2.30,1317.0,DX51D+ZM310MAO 2.3X1317,1668701718,20210101.0,768.00
4,4E1C4E78-152B-430A-8094-ADD889C9D0AD,20210401.0,785.526262,30211560.0,28.0,Won,W,10.0,4.00,2000.0,2_S275JR+AR-CL1,640665,20210301.0,577.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
181668,DE633116-D1DF-4846-982E-55EFC3658A76,20200702.0,102.482422,30200854.0,25.0,Won,W,41.0,0.96,1220.0,1000777,164141591,20200701.0,591.00
181669,A48374B1-E6DB-45F2-889A-1F9C27C099EB,20200702.0,208.086469,30200854.0,25.0,Won,W,41.0,0.95,1500.0,1000227,164141591,20200701.0,589.00
181670,91643238-5C7B-4237-9A5F-63AE3D35F320,20200702.0,4.235594,30200854.0,25.0,Won,W,41.0,0.71,1250.0,1004216,164141591,20200701.0,619.00
181671,7AFFD323-01D9-4E15-B80D-7D1B03498FC8,20200702.0,-2000.000000,30200854.0,25.0,Won,W,41.0,0.85,1250.0,1001149,164141591,20200701.0,601.00


In [11]:
columns_to_drop = ['id', 'material_ref','product_ref','customer']
df = df.drop(columns_to_drop, axis=1)
df

Unnamed: 0,item_date,quantity tons,country,status,item type,application,thickness,width,delivery date,selling_price
0,20210401.0,54.151139,28.0,Won,W,10.0,2.00,1500.0,20210701.0,854.00
1,20210401.0,768.024839,25.0,Won,W,41.0,0.80,1210.0,20210401.0,1047.00
2,20210401.0,386.127949,30.0,Won,WI,28.0,0.38,952.0,20210101.0,644.33
3,20210401.0,202.411065,32.0,Won,S,59.0,2.30,1317.0,20210101.0,768.00
4,20210401.0,785.526262,28.0,Won,W,10.0,4.00,2000.0,20210301.0,577.00
...,...,...,...,...,...,...,...,...,...,...
181668,20200702.0,102.482422,25.0,Won,W,41.0,0.96,1220.0,20200701.0,591.00
181669,20200702.0,208.086469,25.0,Won,W,41.0,0.95,1500.0,20200701.0,589.00
181670,20200702.0,4.235594,25.0,Won,W,41.0,0.71,1250.0,20200701.0,619.00
181671,20200702.0,-2000.000000,25.0,Won,W,41.0,0.85,1250.0,20200701.0,601.00


In [12]:
dff = pd.get_dummies(df["item type"])
# convert true and flase to 1 and 0 
dff = dff.astype('float')
dff[['item_date','quantity tons','country','status','application','thickness','width','delivery date',
'selling_price']] = df[['item_date','quantity tons','country','status','application','thickness','width',
                        'delivery date','selling_price']]

dff

Unnamed: 0,IPL,Others,PL,S,SLAWR,W,WI,item_date,quantity tons,country,status,application,thickness,width,delivery date,selling_price
0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,20210401.0,54.151139,28.0,Won,10.0,2.00,1500.0,20210701.0,854.00
1,0.0,0.0,0.0,0.0,0.0,1.0,0.0,20210401.0,768.024839,25.0,Won,41.0,0.80,1210.0,20210401.0,1047.00
2,0.0,0.0,0.0,0.0,0.0,0.0,1.0,20210401.0,386.127949,30.0,Won,28.0,0.38,952.0,20210101.0,644.33
3,0.0,0.0,0.0,1.0,0.0,0.0,0.0,20210401.0,202.411065,32.0,Won,59.0,2.30,1317.0,20210101.0,768.00
4,0.0,0.0,0.0,0.0,0.0,1.0,0.0,20210401.0,785.526262,28.0,Won,10.0,4.00,2000.0,20210301.0,577.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
181668,0.0,0.0,0.0,0.0,0.0,1.0,0.0,20200702.0,102.482422,25.0,Won,41.0,0.96,1220.0,20200701.0,591.00
181669,0.0,0.0,0.0,0.0,0.0,1.0,0.0,20200702.0,208.086469,25.0,Won,41.0,0.95,1500.0,20200701.0,589.00
181670,0.0,0.0,0.0,0.0,0.0,1.0,0.0,20200702.0,4.235594,25.0,Won,41.0,0.71,1250.0,20200701.0,619.00
181671,0.0,0.0,0.0,0.0,0.0,1.0,0.0,20200702.0,-2000.000000,25.0,Won,41.0,0.85,1250.0,20200701.0,601.00


In [13]:
df_reg = dff.drop('status', axis=1)
df_class = dff.drop('selling_price', axis=1)

In [13]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

model = LinearRegression()
X=df_reg.drop('selling_price',axis=1)
y=df_reg['selling_price']
x_train,x_test,y_train,y_test=train_test_split(X,y,test_size=0.2)
lr=LinearRegression()
lr.fit(x_train,y_train)
y_train_pred=lr.predict(x_train)
y_test_pred=lr.predict(x_test)
y_test_pred

array([1007.25292077, 1060.0250852 , 1050.14312158, ..., 1022.50910289,
       1096.18772875, 1052.15002994])

In [14]:
from sklearn.metrics import mean_squared_error,r2_score
import math 
print(" Root of Train error" , math.sqrt(mean_squared_error(y_train,y_train_pred)))
print("Root of Test error",math.sqrt(mean_squared_error(y_test,y_test_pred)))



 Root of Train error 221.34939839786654
Root of Test error 202.55783635750012


In [15]:
from sklearn.preprocessing import LabelEncoder
ordinal_cols=['status']
le=LabelEncoder()
for col in ordinal_cols:
    le.fit(df_class[col])
    df_class[col]=le.transform(df[col])

df_class

Unnamed: 0,Others,PL,S,W,WI,item_date,quantity tons,country,status,application,thickness,width,delivery date
0,0.0,0.0,1.0,0.0,0.0,20210401,927.430748,78,6,28,1.040,912.0,20210401
1,0.0,0.0,0.0,1.0,0.0,20210401,99.059199,30,6,41,0.595,1207.0,20210401
2,0.0,0.0,0.0,1.0,0.0,20210401,185.149656,25,6,41,4.000,1500.0,20210401
3,0.0,0.0,0.0,1.0,0.0,20210401,9.822404,28,5,10,1.000,1250.0,20210701
4,0.0,0.0,1.0,0.0,0.0,20210401,102.421773,78,6,28,1.040,1132.0,20210401
...,...,...,...,...,...,...,...,...,...,...,...,...,...
9967,0.0,0.0,0.0,1.0,0.0,20210319,39.449376,28,6,10,2.000,1500.0,20210801
9968,0.0,0.0,0.0,1.0,0.0,20210319,14.350041,28,6,10,3.000,1000.0,20210801
9969,0.0,0.0,0.0,1.0,0.0,20210319,20.086637,28,6,10,3.000,1250.0,20210801
9970,0.0,0.0,0.0,1.0,0.0,20210319,40.345459,28,6,10,3.000,1500.0,20210801


In [16]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

X = df_class.drop('status',axis=1)
y = df_class["status"]

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

#Logistic Regression
Logistic_Regression= LogisticRegression()
Logistic_Regression.fit(X_train, y_train)

# SVM Classifier
svm_classifier = SVC()
svm_classifier.fit(X_train, y_train)

#KNN Classifier
Knn_classifier = KNeighborsClassifier()
Knn_classifier.fit(X_train, y_train)

#Decision Tree
DecisionTree_classifier = DecisionTreeClassifier(criterion='entropy',max_depth=100)
DecisionTree_classifier.fit(X_train, y_train)


# Random Forest Classifier
rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42)
rf_classifier.fit(X_train, y_train)

#  Adaboost Classifier
Ab_classifier = AdaBoostClassifier()
Ab_classifier.fit(X_train, y_train)

# Gradinet boost Classifier
Gb_classifier = GradientBoostingClassifier()
Gb_classifier.fit(X_train, y_train)



# Evaluate the models
def evaluate_model(model, X_test, y_test):
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    conf_matrix = confusion_matrix(y_test, y_pred)
    class_report = classification_report(y_test, y_pred)
    return accuracy, conf_matrix, class_report

lr_accuracy, lr_conf_matrix, lr_class_report = evaluate_model(Logistic_Regression, X_test, y_test)
svm_accuracy, svm_conf_matrix, svm_class_report = evaluate_model(svm_classifier, X_test, y_test)
knn_accuracy, knn_conf_matrix, knn_class_report = evaluate_model(Knn_classifier, X_test, y_test)
decision_tree_accuracy, decision_tree_conf_matrix,decision_tree_class_report = evaluate_model(DecisionTree_classifier, X_test, y_test)
rf_accuracy, rf_conf_matrix, rf_class_report = evaluate_model(rf_classifier, X_test, y_test)
ab_accuracy, ab_conf_matrix, ab_class_report = evaluate_model(Ab_classifier, X_test, y_test)
gb_accuracy, gb_conf_matrix, gb_class_report = evaluate_model(Gb_classifier, X_test, y_test)


print("Logistic Regression Model:")
print("Accuracy:", lr_accuracy)
print("Confusion Matrix:\n", lr_conf_matrix)
print("Classification Report:\n", lr_class_report)

print("SVM Model:")
print("Accuracy:", svm_accuracy)
print("Confusion Matrix:\n", svm_conf_matrix)
print("Classification Report:\n", svm_class_report)

print("KNN Model:")
print("Accuracy:", knn_accuracy)
print("Confusion Matrix:\n", knn_conf_matrix)
print("Classification Report:\n", knn_class_report)

print("Decision Tree Model:")
print("Accuracy:", decision_tree_accuracy)
print("Confusion Matrix:\n", decision_tree_conf_matrix)
print("Classification Report:\n", decision_tree_class_report)


print("\nRandom Forest Model:")
print("Accuracy:", rf_accuracy)
print("Confusion Matrix:\n", rf_conf_matrix)
print("Classification Report:\n", rf_class_report)

print("\n Adaboost Forest Model:")
print("Accuracy:", ab_accuracy)
print("Confusion Matrix:\n", ab_conf_matrix)
print("Classification Report:\n", ab_class_report)

print("\nGradient Forest Model:")
print("Accuracy:", gb_accuracy)
print("Confusion Matrix:\n", gb_conf_matrix)
print("Classification Report:\n", gb_class_report)


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Logistic Regression Model:
Accuracy: 0.7378446115288221
Confusion Matrix:
 [[   0    0    0    0    0    0   40    0]
 [   0    0    0    0    0    0  261    0]
 [   0    0    0    0    0    0  114    0]
 [   0    0    0    0    0    0    2    0]
 [   0    0    0    0    0    0   48    0]
 [   0    0    0    0    0    0   57    0]
 [   0    0    0    0    0    0 1472    0]
 [   0    0    0    0    0    0    1    0]]
Classification Report:
               precision    recall  f1-score   support

           0       0.00      0.00      0.00        40
           1       0.00      0.00      0.00       261
           2       0.00      0.00      0.00       114
           3       0.00      0.00      0.00         2
           4       0.00      0.00      0.00        48
           5       0.00      0.00      0.00        57
           6       0.74      1.00      0.85      1472
           7       0.00      0.00      0.00         1

    accuracy                           0.74      1995
   macro avg  

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [37]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 181673 entries, 0 to 181672
Data columns (total 10 columns):
 #   Column         Non-Null Count   Dtype  
---  ------         --------------   -----  
 0   item_date      181673 non-null  float64
 1   quantity tons  181672 non-null  float64
 2   country        181673 non-null  float64
 3   status         181673 non-null  object 
 4   item type      181673 non-null  object 
 5   application    181673 non-null  float64
 6   thickness      181673 non-null  float64
 7   width          181673 non-null  float64
 8   delivery date  181673 non-null  float64
 9   selling_price  181673 non-null  float64
dtypes: float64(8), object(2)
memory usage: 13.9+ MB


In [None]:
column_name= 'item_date' 
df[column_name] = df[column_name].fillna(df[column_name].mode().iloc[0])

In [41]:
df_reg.isnull().sum()


IPL              0
Others           0
PL               0
S                0
SLAWR            0
W                0
WI               0
item_date        0
quantity tons    1
country          0
application      0
thickness        0
width            0
delivery date    0
selling_price    0
dtype: int64