# **Classification Model**



```
# Aim: predict whether the status is "WON" or "LOST"
```



In [41]:
from sklearn.preprocessing import OrdinalEncoder
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn import metrics
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from xgboost import XGBClassifier
from sklearn.metrics import confusion_matrix, classification_report, roc_curve, auc , accuracy_score
import matplotlib.pyplot as plt
import pickle
import pandas as pd

In [22]:
# Load the cleaned dataset. The path is absolute; adjust it when the file lives elsewhere.
df = pd.read_csv('/content/icm_cleaned_data.csv')

In [11]:
# Display the loaded frame to eyeball the cleaned columns.
df

Unnamed: 0,quantity_tons,customer,country,item_type,application,thickness,width,product_ref,item_date_day,item_date_month,item_date_year,delivery_date_day,delivery_date_month,delivery_date_year,delivery_item_date_difference_num,selling_price,status
0,3.991779,30156308.0,28.0,5.0,2.302585,0.693147,7.313220,1670798778,1,4,2021,1,7,2021,91,6.749931,1.0
1,4.235147,30341428.0,38.0,3.0,2.302585,-0.510826,7.150701,1668701376,1,4,2021,1,7,2021,91,7.217443,1.0
2,3.314642,30165529.0,78.0,5.0,2.302585,-0.287682,7.130899,164141591,1,4,2021,1,7,2021,91,7.001246,1.0
3,3.473063,30202362.0,27.0,3.0,2.302585,-0.510826,6.917706,628377,1,4,2021,1,7,2021,91,6.880384,1.0
4,3.035295,30211222.0,30.0,5.0,2.302585,-0.798508,6.802395,611993,1,4,2021,1,7,2021,91,7.181736,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
150439,4.629691,30200854.0,25.0,5.0,3.713572,-0.040822,7.106606,164141591,2,7,2020,1,7,2020,29,6.381816,1.0
150440,5.337954,30200854.0,25.0,5.0,3.713572,-0.051293,7.313220,164141591,2,7,2020,1,7,2020,29,6.378426,1.0
150441,1.443523,30200854.0,25.0,5.0,3.713572,-0.342490,7.130899,164141591,2,7,2020,1,7,2020,29,6.428105,1.0
150442,-0.333362,30200854.0,25.0,5.0,3.713572,-0.162519,7.130899,164141591,2,7,2020,1,7,2020,29,6.398595,1.0


In [12]:
# Column dtypes and non-null counts.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150444 entries, 0 to 150443
Data columns (total 17 columns):
 #   Column                             Non-Null Count   Dtype  
---  ------                             --------------   -----  
 0   quantity_tons                      150444 non-null  float64
 1   customer                           150443 non-null  float64
 2   country                            150444 non-null  float64
 3   item_type                          150444 non-null  float64
 4   application                        150444 non-null  float64
 5   thickness                          150444 non-null  float64
 6   width                              150444 non-null  float64
 7   product_ref                        150444 non-null  int64  
 8   item_date_day                      150444 non-null  int64  
 9   item_date_month                    150444 non-null  int64  
 10  item_date_year                     150444 non-null  int64  
 11  delivery_date_day                  1504

In [15]:
# Class balance of the target column.
df['status'].value_counts()

1.0    116010
0.0     34434
Name: status, dtype: int64

In [19]:
# Back-fill the missing customer ID from the next row. The result is assigned
# back to the column instead of calling `fillna(method=..., inplace=True)` on a
# column selection: `method=` is deprecated, and the chained in-place update
# does not propagate to `df` under pandas copy-on-write.
# Note: a back-fill cannot fill a missing value in the very last row.
df['customer'] = df['customer'].bfill()

df.isnull().sum()

quantity_tons                        0
customer                             0
country                              0
item_type                            0
application                          0
thickness                            0
width                                0
product_ref                          0
item_date_day                        0
item_date_month                      0
item_date_year                       0
delivery_date_day                    0
delivery_date_month                  0
delivery_date_year                   0
delivery_item_date_difference_num    0
selling_price                        0
status                               0
dtype: int64

In [28]:
# Separate the target from the predictors.
# Target encoding: 1 - WON , 0 -- Lost
y = df['status']

x = df.drop(columns='status')

In [29]:
from sklearn.impute import SimpleImputer

# Mean-impute any remaining missing feature values.
imputer = SimpleImputer(strategy='mean')  # You can choose a different strategy
X = imputer.fit_transform(x)

# Rebuild the frame from the *imputed* array `X`. The previous version wrapped
# the un-imputed `x` again, which silently discarded the imputation result.
x = pd.DataFrame(X, columns=df.drop('status',axis=1).columns, index=df.index)

In [30]:
# Inspect the feature frame.
x

Unnamed: 0,quantity_tons,customer,country,item_type,application,thickness,width,product_ref,item_date_day,item_date_month,item_date_year,delivery_date_day,delivery_date_month,delivery_date_year,delivery_item_date_difference_num,selling_price
0,3.991779,30156308.0,28.0,5.0,2.302585,0.693147,7.313220,1670798778,1,4,2021,1,7,2021,91,6.749931
1,4.235147,30341428.0,38.0,3.0,2.302585,-0.510826,7.150701,1668701376,1,4,2021,1,7,2021,91,7.217443
2,3.314642,30165529.0,78.0,5.0,2.302585,-0.287682,7.130899,164141591,1,4,2021,1,7,2021,91,7.001246
3,3.473063,30202362.0,27.0,3.0,2.302585,-0.510826,6.917706,628377,1,4,2021,1,7,2021,91,6.880384
4,3.035295,30211222.0,30.0,5.0,2.302585,-0.798508,6.802395,611993,1,4,2021,1,7,2021,91,7.181736
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
150439,4.629691,30200854.0,25.0,5.0,3.713572,-0.040822,7.106606,164141591,2,7,2020,1,7,2020,29,6.381816
150440,5.337954,30200854.0,25.0,5.0,3.713572,-0.051293,7.313220,164141591,2,7,2020,1,7,2020,29,6.378426
150441,1.443523,30200854.0,25.0,5.0,3.713572,-0.342490,7.130899,164141591,2,7,2020,1,7,2020,29,6.428105
150442,-0.333362,30200854.0,25.0,5.0,3.713572,-0.162519,7.130899,164141591,2,7,2020,1,7,2020,29,6.398595


In [44]:
from sklearn.impute import SimpleImputer

# Module-level imputer; later cells re-fit it on their own training split.
imputer = SimpleImputer(strategy='mean')

def model_classification(df, algorithm, test_size=0.2, random_state=None):
  """Fit each classifier with default settings and print train/test accuracy.

  Parameters
  ----------
  df : pandas.DataFrame
      Frame holding the feature columns plus the ``status`` target column.
  algorithm : iterable of estimator classes
      Classes (not instances); each one is instantiated with no arguments.
  test_size : float, default 0.2
      Fraction of rows held out for testing.
  random_state : int or None, default None
      Seed forwarded to ``train_test_split``; pass an int for a repeatable split.

  Returns
  -------
  None
      One result dict per algorithm is printed.
  """
  # Derive features/target from the `df` argument instead of the notebook
  # globals `x`/`y`, so the function scores the frame it is actually given.
  features = df.drop('status', axis=1)
  target = df['status']

  # Split once, outside the loop, so every algorithm is compared on the same
  # held-out rows.
  xtrain, xtest, ytrain, ytest = train_test_split(
      features, target, test_size=test_size, random_state=random_state)

  # Learn the imputation means on the training rows only and reuse them for
  # the test rows; re-fitting on the test split would impute it with its own
  # statistics.
  split_imputer = SimpleImputer(strategy='mean')
  xtrain = pd.DataFrame(split_imputer.fit_transform(xtrain),
                        columns=features.columns, index=xtrain.index)
  xtest = pd.DataFrame(split_imputer.transform(xtest),
                       columns=features.columns, index=xtest.index)

  for i in algorithm:

    model = i().fit(xtrain, ytrain)

    # predict for train and test accuracy
    y_train_pred = model.predict(xtrain)
    y_test_pred = model.predict(xtest)

    # Accuracy score
    training = accuracy_score(ytrain, y_train_pred)
    testing = accuracy_score(ytest, y_test_pred)

    # Key spelling is kept unchanged so printed results stay comparable with
    # earlier runs.
    data = {'Algorithm':i.__name__, 'Training Acuuracy':training,'Testing Accuracy':testing}

    print(data)


In [45]:
# Benchmark the candidate classifiers; each class is instantiated with its
# default settings inside model_classification.
model_classification(
    df,
    [
        DecisionTreeClassifier,
        ExtraTreesClassifier,
        RandomForestClassifier,
        AdaBoostClassifier,
        GradientBoostingClassifier,
        XGBClassifier,
    ],
)

{'Algorithm': 'DecisionTreeClassifier', 'Training Acuuracy': 1.0, 'Testing Accuracy': 0.9588886303964904}
{'Algorithm': 'ExtraTreesClassifier', 'Training Acuuracy': 1.0, 'Testing Accuracy': 0.9744757220246602}
{'Algorithm': 'RandomForestClassifier', 'Training Acuuracy': 1.0, 'Testing Accuracy': 0.9721492904383662}
{'Algorithm': 'AdaBoostClassifier', 'Training Acuuracy': 0.8250675086203315, 'Testing Accuracy': 0.8260161520821563}
{'Algorithm': 'GradientBoostingClassifier', 'Training Acuuracy': 0.8448257239001288, 'Testing Accuracy': 0.8460566984612317}
{'Algorithm': 'XGBClassifier', 'Training Acuuracy': 0.948369407170454, 'Testing Accuracy': 0.9357240187443916}


In [46]:
# Hyperparameter tuning: select, from a predefined set of candidate values,
# the parameter combination that gives the best model performance.
# GridSearchCV evaluates every combination in the grid with cross-validation.

x = df.drop('status', axis=1)
y = df['status']
x_train, x_test, y_train, y_test = train_test_split(x,y,test_size=0.2,random_state=42)

param_grid = {'max_depth'        : [2, 5, 10, 20],
              'min_samples_split': [2, 5, 10],
              'min_samples_leaf' : [1, 2, 4],
              'max_features'     : ['sqrt', 'log2']}

# 4 * 3 * 3 * 2 = 72 combinations, each fitted 5 times (cv=5): this cell is slow.
# The forest is seeded so the selected parameters are reproducible across runs.
grid_search = GridSearchCV(estimator=RandomForestClassifier(random_state=42), param_grid=param_grid, cv=5, n_jobs=-1)
grid_search.fit(x_train, y_train)


# Best Parameter
grid_search.best_params_


{'max_depth': 20,
 'max_features': 'log2',
 'min_samples_leaf': 1,
 'min_samples_split': 2}

In [47]:
# Cross-validated score of the best parameter combination.
grid_search.best_score_

0.9683187237754975

In [58]:
# Model building with Best Parameters
x = df.drop('status', axis=1)
y = df['status']

# Use the same seed as the grid-search split (random_state=42) so the rows
# held out here were not part of the data used to choose the parameters.
xtrain,xtest,ytrain,ytest = train_test_split(x,y,test_size=0.2,random_state=42)

# Fit the imputer on the training rows only, then apply those statistics to
# the test rows. Re-fitting on the test split would impute it with its own
# means and leave the shared `imputer` fitted on test data.
X = imputer.fit_transform(xtrain)
xtrain = pd.DataFrame(X,columns=x.columns,index=xtrain.index)

X = imputer.transform(xtest)
xtest = pd.DataFrame(X,columns=x.columns,index=xtest.index)

# Model: max_features='log2' is the value reported by grid_search.best_params_
# (this cell previously hard-coded 'sqrt').
model = RandomForestClassifier(max_depth=20, max_features='log2', min_samples_leaf=1, min_samples_split=2).fit(xtrain, ytrain)

# prediction
y_pred_train = model.predict(xtrain)
y_pred_test = model.predict(xtest)
accuracy_train = metrics.accuracy_score(ytrain, y_pred_train)
accuracy_test = metrics.accuracy_score(ytest, y_pred_test)
('Training accuracy :',accuracy_train),( 'Testing Accuracy :', accuracy_test)

(('Training accuracy :', 0.9979809729550081),
 ('Testing Accuracy :', 0.9704875535910134))

In [59]:
# TODO: add the confusion matrix and other classification metrics here

In [63]:
# List the column names of the frame.
df.columns

Index(['quantity_tons', 'customer', 'country', 'item_type', 'application',
       'thickness', 'width', 'product_ref', 'item_date_day', 'item_date_month',
       'item_date_year', 'delivery_date_day', 'delivery_date_month',
       'delivery_date_year', 'delivery_item_date_difference_num',
       'selling_price', 'status'],
      dtype='object')

In [None]:
# Ask for one value per classifier feature. The generator is consumed left to
# right, so the prompts appear in the order of the targets on the left.
(
    quantity_tons,
    customer,
    country,
    item_type,
    application,
    thickness,
    width,
    product_ref,
    item_date_day,
    item_date_month,
    item_date_year,
    delivery_date_day,
    delivery_date_month,
    delivery_date_year,
    delivery_item_date_difference_num,
    selling_price,
) = (
    float(input(prompt))
    for prompt in (
        'Enter the Quantity Ton value :',
        'Enter the customer ID :',
        'Enter the country code :',
        'Enter the item type value :',
        'Enter the application value :',
        'Enter the thickness value :',
        'Enter the width value :',
        'Enter the product ref value :',
        'Enter the item day :',
        'Enter the item month:',
        'Enter the item year :',
        'Enter the delivery day :',
        'Enter the delivery month :',
        'Enter the delivery year :',
        'Enter the diffrence date :',
        'Enter the selling price :',
    )
)


In [70]:
# Predict the status for the values entered above.
# The sample is wrapped in a one-row DataFrame carrying the training column
# names, because the model was fitted on a named DataFrame. The result is
# stored in `status_pred` rather than `x`, so the feature frame `x` used by
# other cells is not overwritten.
sample = pd.DataFrame(
    [[quantity_tons, customer, country, item_type, application,
      thickness, width, product_ref, item_date_day, item_date_month,
      item_date_year, delivery_date_day, delivery_date_month,
      delivery_date_year, delivery_item_date_difference_num,
      selling_price]],
    columns=df.drop('status', axis=1).columns)
status_pred = model.predict(sample)
if status_pred[0]==1.0:
  print('WON')
else :
  print('LOST')

WON




In [71]:
import pickle

# Serialize the fitted classifier and write the bytes to disk.
with open('ICM_classification_model.pkl', 'wb') as f:
    f.write(pickle.dumps(model))

In [73]:
# Reload the classifier from the same relative path the dump cell wrote to,
# so the round trip does not depend on the working directory being /content.
# NOTE: pickle.load can execute arbitrary code; only load files you created.
with open('ICM_classification_model.pkl', 'rb') as f:
    model = pickle.load(f)

# One-row DataFrame with the training column names, since the model was fitted
# on a named DataFrame.
sample = pd.DataFrame(
    [[quantity_tons, customer, country, item_type, application,
      thickness, width, product_ref, item_date_day, item_date_month,
      item_date_year, delivery_date_day, delivery_date_month,
      delivery_date_year, delivery_item_date_difference_num,
      selling_price]],
    columns=df.drop('status', axis=1).columns)
res = model.predict(sample)

if res[0] == 1:
    print('WON')
else:
    print('LOST')

WON




# **Regression Model**



```
# Aim is to predict selling price
```



In [74]:
from sklearn.preprocessing import OrdinalEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.ensemble import GradientBoostingRegressor
from xgboost import XGBRegressor
import pickle

In [78]:
from sklearn.impute import SimpleImputer

# Module-level imputer; later cells re-fit it on their own training split.
imputer = SimpleImputer(strategy='mean')

# Features / target for the selling-price regression
x = df.drop('selling_price', axis=1)
y = df['selling_price']

def model_regression(df, algorithm, test_size=0.2, random_state=None):
  """Fit each regressor with default settings and print train/test R2 scores.

  Parameters
  ----------
  df : pandas.DataFrame
      Frame holding the feature columns plus the ``selling_price`` target.
  algorithm : iterable of estimator classes
      Classes (not instances); each one is instantiated with no arguments.
  test_size : float, default 0.2
      Fraction of rows held out for testing.
  random_state : int or None, default None
      Seed forwarded to ``train_test_split``; pass an int for a repeatable split.

  Returns
  -------
  None
      One result dict per algorithm is printed.
  """
  # Derive features/target from the `df` argument instead of the notebook
  # globals `x`/`y`, so the function scores the frame it is actually given.
  features = df.drop('selling_price', axis=1)
  target = df['selling_price']

  # Split once, outside the loop, so every algorithm is compared on the same
  # held-out rows.
  xtrain, xtest, ytrain, ytest = train_test_split(
      features, target, test_size=test_size, random_state=random_state)

  # Learn the imputation means on the training rows only and reuse them for
  # the test rows; re-fitting on the test split would impute it with its own
  # statistics.
  split_imputer = SimpleImputer(strategy='mean')
  xtrain = pd.DataFrame(split_imputer.fit_transform(xtrain),
                        columns=features.columns, index=xtrain.index)
  xtest = pd.DataFrame(split_imputer.transform(xtest),
                       columns=features.columns, index=xtest.index)

  for i in algorithm:

    model = i().fit(xtrain, ytrain)

    # predict for train and test scores
    y_train_pred = model.predict(xtrain)
    y_test_pred = model.predict(xtest)

    # R2 score
    training = r2_score(ytrain, y_train_pred)
    testing = r2_score(ytest, y_test_pred)

    data = {'Algorithm':i.__name__, 'Training R2 Score':training,'Testing R2 Score':testing}

    print(data)


In [80]:
# Benchmark the candidate regressors; each class is instantiated with its
# default settings inside model_regression.
model_regression(
    df,
    [
        DecisionTreeRegressor,
        ExtraTreesRegressor,
        RandomForestRegressor,
        AdaBoostRegressor,
        GradientBoostingRegressor,
        XGBRegressor,
    ],
)

{'Algorithm': 'DecisionTreeRegressor', 'Training R2 Score': 0.9999153910877334, 'Testing R2 Score': 0.9306949309176002}
{'Algorithm': 'ExtraTreesRegressor', 'Training R2 Score': 0.9999057381489032, 'Testing R2 Score': 0.9563161458859544}
{'Algorithm': 'RandomForestRegressor', 'Training R2 Score': 0.9942569017463697, 'Testing R2 Score': 0.9595641149243487}
{'Algorithm': 'AdaBoostRegressor', 'Training R2 Score': 0.6626261259514357, 'Testing R2 Score': 0.6634072108871247}
{'Algorithm': 'GradientBoostingRegressor', 'Training R2 Score': 0.8924796037067001, 'Testing R2 Score': 0.8927571058486343}
{'Algorithm': 'XGBRegressor', 'Training R2 Score': 0.9526978289144035, 'Testing R2 Score': 0.9468744069597663}


In [None]:
# Hyperparameter search for the selling-price regressor.
x = df.drop(columns=['selling_price'])
y = df['selling_price']
xtrain, xtest, ytrain, ytest = train_test_split(x,y,test_size=0.2,random_state=42)

param_grid_r = {'max_depth'      : [2, 5, 10, 20],
              'min_samples_split': [2, 5, 10],
              'min_samples_leaf' : [1, 2, 4],
              'max_features'     : ['sqrt', 'log2', None]}

# 4 * 3 * 3 * 3 = 108 combinations, each fitted 5 times (cv=5): this cell is slow.
# The forest is seeded so the selected parameters are reproducible across runs.
grid_search_r = GridSearchCV(estimator=RandomForestRegressor(random_state=42), param_grid=param_grid_r, cv=5, n_jobs=-1)
# Fit on the split created in this cell. `x_train`/`y_train` (with underscores)
# belong to the classification split, whose target is `status`, so fitting on
# them would tune the regressor against the wrong target.
grid_search_r.fit(xtrain, ytrain)

In [None]:
# Best parameter combination and its cross-validated score.
grid_search_r.best_params_,grid_search_r.best_score_

In [81]:
# Final selling-price regressor.
x = df.drop(columns=['selling_price'])
y = df['selling_price']
xtrain, xtest, ytrain, ytest = train_test_split(x,y,test_size=0.2,random_state=42)

# Fit the imputer on the training rows only, then apply those statistics to
# the test rows instead of re-fitting on the test split.
X = imputer.fit_transform(xtrain)
xtrain = pd.DataFrame(X,columns=x.columns,index=xtrain.index)

# NOTE(review): the regression grid-search cell has no recorded output, so
# these hyperparameters cannot be cross-checked against it here; confirm them.
model = RandomForestRegressor(max_depth=20, max_features=None, min_samples_leaf=1, min_samples_split=2).fit(xtrain, ytrain)
y_pred_train = model.predict(xtrain)

X = imputer.transform(xtest)

xtest = pd.DataFrame(X,columns=x.columns,index=xtest.index)

y_pred_test = model.predict(xtest)

r2_train = r2_score(ytrain, y_pred_train)
r2_test = r2_score(ytest, y_pred_test)
r2_train, r2_test

(0.988386448921512, 0.9607786374972789)

In [83]:
# Ask for one value per regressor feature. The generator is consumed left to
# right, so the prompts appear in the order of the targets on the left.
(
    quantity_tons,
    customer,
    country,
    item_type,
    application,
    thickness,
    width,
    product_ref,
    item_date_day,
    item_date_month,
    item_date_year,
    delivery_date_day,
    delivery_date_month,
    delivery_date_year,
    delivery_item_date_difference_num,
    status,
) = (
    float(input(prompt))
    for prompt in (
        'Enter the Quantity Ton value :',
        'Enter the customer ID :',
        'Enter the country code :',
        'Enter the item type value :',
        'Enter the application value :',
        'Enter the thickness value :',
        'Enter the width value :',
        'Enter the product ref value :',
        'Enter the item day :',
        'Enter the item month:',
        'Enter the item year :',
        'Enter the delivery day :',
        'Enter the delivery month :',
        'Enter the delivery year :',
        'Enter the diffrence date :',
        'Enter the  status :',
    )
)


Enter the Quantity Ton value :3.991779
Enter the customer ID :30156308.0
Enter the country code :28.0
Enter the item type value :5.0
Enter the application value :2.302585
Enter the thickness value :0.693147
Enter the width value :7.313220
Enter the product ref value : 1670798778
Enter the item day :1
Enter the item month:4
Enter the item year :2021
Enter the delivery day :1
Enter the delivery month :7
Enter the delivery year :2021
Enter the diffrence date :91
Enter the selling price :1.0


In [84]:
# Predict the selling price for the values entered above.
# The sample is wrapped in a one-row DataFrame carrying the training column
# names, because the model was fitted on a named DataFrame.
# NOTE(review): the price values in the data look log-scaled - confirm before
# presenting the prediction as a raw price.
sample = pd.DataFrame(
    [[quantity_tons, customer, country, item_type, application,
      thickness, width, product_ref, item_date_day, item_date_month,
      item_date_year, delivery_date_day, delivery_date_month,
      delivery_date_year, delivery_item_date_difference_num,
      status]],
    columns=df.drop('selling_price', axis=1).columns)
selling_price = model.predict(sample)



f'predicted selling price : {selling_price}'



'predicted selling price : [6.90673355]'

In [86]:
# Serialize the fitted regressor and write the bytes to disk.
with open('ICM_regression_model.pkl', 'wb') as f:
    f.write(pickle.dumps(model))

In [88]:
# Reload the regressor from the same relative path the dump cell wrote to,
# so the round trip does not depend on the working directory being /content.
# NOTE: pickle.load can execute arbitrary code; only load files you created.
with open('ICM_regression_model.pkl', 'rb') as f:
    model = pickle.load(f)

In [89]:
# Check that the reloaded regressor gives a prediction for the same sample.
# The sample is wrapped in a one-row DataFrame carrying the training column
# names, because the model was fitted on a named DataFrame.
sample = pd.DataFrame(
    [[quantity_tons, customer, country, item_type, application,
      thickness, width, product_ref, item_date_day, item_date_month,
      item_date_year, delivery_date_day, delivery_date_month,
      delivery_date_year, delivery_item_date_difference_num,
      status]],
    columns=df.drop('selling_price', axis=1).columns)
selling_price = model.predict(sample)



f'predicted selling price : {selling_price}'



'predicted selling price : [6.90673355]'