<a href="https://colab.research.google.com/github/shohdy-source/Machine-Learing-Deep-Learning-/blob/main/Shipment%20Arrival%20Prediction%20_(variety%20of%20classification%20models).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Given data about online shipments, let's try to predict whether a given shipment will arrive on time.

We will use a variety of classification models to make our predictions.

In [5]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import LinearSVC, SVC
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier

import warnings
warnings.filterwarnings(action='ignore')
import io


In [2]:
pip install catboost

Collecting catboost
[?25l  Downloading https://files.pythonhosted.org/packages/47/80/8e9c57ec32dfed6ba2922bc5c96462cbf8596ce1a6f5de532ad1e43e53fe/catboost-0.25.1-cp37-none-manylinux1_x86_64.whl (67.3MB)
[K     |████████████████████████████████| 67.3MB 57kB/s 
Installing collected packages: catboost
Successfully installed catboost-0.25.1


In [6]:
# Colab-only helper module for moving files between the local machine and the runtime
from google.colab import files


# Prompts for a file upload; the next cell reads the 'Train.csv' entry
# out of the returned mapping, so the uploaded file must have that name.
uploaded = files.upload()

Saving Train.csv to Train.csv


In [7]:
# Wrap the uploaded Train.csv payload in an in-memory buffer and parse it as CSV
data = pd.read_csv(io.BytesIO(uploaded['Train.csv']))
# Show the raw frame as the cell output
data

Unnamed: 0,ID,Warehouse_block,Mode_of_Shipment,Customer_care_calls,Customer_rating,Cost_of_the_Product,Prior_purchases,Product_importance,Gender,Discount_offered,Weight_in_gms,Reached.on.Time_Y.N
0,1,D,Flight,4,2,177,3,low,F,44,1233,1
1,2,F,Flight,4,5,216,2,low,M,59,3088,1
2,3,A,Flight,2,2,183,4,low,M,48,3374,1
3,4,B,Flight,3,3,176,4,medium,M,10,1177,1
4,5,C,Flight,2,2,184,3,medium,F,46,2484,1
...,...,...,...,...,...,...,...,...,...,...,...,...
10994,10995,A,Ship,4,1,252,5,medium,F,1,1538,1
10995,10996,B,Ship,4,1,232,5,medium,F,6,1247,0
10996,10997,C,Ship,5,4,242,5,low,F,4,1155,0
10997,10998,F,Ship,5,2,223,6,medium,M,2,1210,0


In [8]:
# Summarize column names, dtypes and non-null counts to check for missing values
data.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10999 entries, 0 to 10998
Data columns (total 12 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   ID                   10999 non-null  int64 
 1   Warehouse_block      10999 non-null  object
 2   Mode_of_Shipment     10999 non-null  object
 3   Customer_care_calls  10999 non-null  int64 
 4   Customer_rating      10999 non-null  int64 
 5   Cost_of_the_Product  10999 non-null  int64 
 6   Prior_purchases      10999 non-null  int64 
 7   Product_importance   10999 non-null  object
 8   Gender               10999 non-null  object
 9   Discount_offered     10999 non-null  int64 
 10  Weight_in_gms        10999 non-null  int64 
 11  Reached.on.Time_Y.N  10999 non-null  int64 
dtypes: int64(8), object(4)
memory usage: 1.0+ MB


## Preprocessing


In [9]:
def onehot_encode(df, column):
    """Return a new frame in which ``column`` is replaced by indicator columns.

    One indicator column is created per distinct value of ``df[column]``,
    named ``<column>_<value>``. The indicators are appended after the
    remaining original columns, and the input frame is left untouched.
    """
    indicators = pd.get_dummies(df[column], prefix=column)
    remaining = df.drop(column, axis=1)
    return pd.concat([remaining, indicators], axis=1)

In [10]:
def preprocess_inputs(df, train_size=0.7, random_state=1):
    """Encode the raw shipment frame, split it, and standardize the features.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw data containing the columns ``ID``, ``Gender``,
        ``Warehouse_block``, ``Mode_of_Shipment``, ``Product_importance`` and
        the target ``Reached.on.Time_Y.N``. The frame is copied, not modified.
    train_size : float or int, default 0.7
        Forwarded to ``train_test_split``; the default keeps the original
        70/30 split.
    random_state : int or None, default 1
        Seed for the shuffled split, forwarded to ``train_test_split``.

    Returns
    -------
    X_train, X_test : pandas.DataFrame
        Standardized features, keeping the row index and column names of
        the corresponding split.
    y_train, y_test : pandas.Series
        Target values aligned with ``X_train`` / ``X_test``.
    """
    df = df.copy()

    # Drop ID column
    df = df.drop('ID', axis=1)

    # Binary encoding
    df['Gender'] = df['Gender'].replace({'F': 0, 'M': 1})

    # One-hot encoding
    for column in ['Warehouse_block', 'Mode_of_Shipment', 'Product_importance']:
        df = onehot_encode(df, column=column)

    # Split df into X and y
    y = df['Reached.on.Time_Y.N']
    X = df.drop('Reached.on.Time_Y.N', axis=1)

    # Train-test split
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, train_size=train_size, shuffle=True, random_state=random_state
    )

    # Scale X: the scaler is fitted on the training rows only, and the same
    # fitted statistics are then applied to both splits.
    scaler = StandardScaler()
    scaler.fit(X_train)
    X_train = pd.DataFrame(scaler.transform(X_train), index=X_train.index, columns=X_train.columns)
    X_test = pd.DataFrame(scaler.transform(X_test), index=X_test.index, columns=X_test.columns)

    return X_train, X_test, y_train, y_test

In [11]:
# Encode, split and standardize the raw frame into train/test features and targets
X_train, X_test, y_train, y_test = preprocess_inputs(data)


In [12]:
# Inspect the standardized training features returned by preprocess_inputs
X_train


Unnamed: 0,Customer_care_calls,Customer_rating,Cost_of_the_Product,Prior_purchases,Gender,Discount_offered,Weight_in_gms,Warehouse_block_A,Warehouse_block_B,Warehouse_block_C,Warehouse_block_D,Warehouse_block_F,Mode_of_Shipment_Flight,Mode_of_Shipment_Road,Mode_of_Shipment_Ship,Product_importance_high,Product_importance_low,Product_importance_medium
4177,-0.051017,1.422394,-1.289402,-0.371349,1.001690,-0.266367,0.803593,-0.443829,-0.446551,-0.445505,-0.451563,1.411463,-0.434148,-0.441940,0.689133,-0.308738,1.039891,-0.873034
1616,-0.923855,0.715649,-1.874430,-0.371349,-0.998313,3.091967,-1.245664,2.253122,-0.446551,-0.445505,-0.451563,-0.708485,-0.434148,-0.441940,0.689133,-0.308738,1.039891,-0.873034
2775,-0.051017,-1.404585,-0.683481,-0.371349,1.001690,0.355547,-1.064775,-0.443829,2.239385,-0.445505,-0.451563,-0.708485,-0.434148,-0.441940,0.689133,-0.308738,1.039891,-0.873034
10272,-0.051017,0.715649,-1.059570,-0.371349,1.001690,-0.515133,0.489031,-0.443829,-0.446551,-0.445505,2.214530,-0.708485,2.303364,-0.441940,-1.451099,-0.308738,-0.961639,1.145431
6836,-0.051017,0.008904,0.758195,-0.371349,-0.998313,-0.763898,0.963634,2.253122,-0.446551,-0.445505,-0.451563,-0.708485,-0.434148,2.262749,-1.451099,3.238988,-0.961639,-0.873034
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7813,-0.051017,-1.404585,0.611938,-0.371349,-0.998313,-0.577324,0.701805,-0.443829,-0.446551,-0.445505,-0.451563,1.411463,2.303364,-0.441940,-1.451099,-0.308738,1.039891,-0.873034
10955,-1.796694,1.422394,-1.790855,0.290460,1.001690,-0.515133,1.363429,-0.443829,-0.446551,-0.445505,-0.451563,1.411463,2.303364,-0.441940,-1.451099,-0.308738,1.039891,-0.873034
905,-0.051017,0.715649,-1.331190,-0.371349,1.001690,-0.017602,-1.531407,-0.443829,-0.446551,-0.445505,-0.451563,1.411463,-0.434148,-0.441940,0.689133,-0.308738,1.039891,-0.873034
5192,0.821822,0.715649,0.465681,-0.371349,1.001690,-0.328559,0.879015,2.253122,-0.446551,-0.445505,-0.451563,-0.708485,-0.434148,2.262749,-1.451099,-0.308738,1.039891,-0.873034


In [13]:
# Inspect the training targets (the Reached.on.Time_Y.N column)
y_train


4177     1
1616     1
2775     1
10272    0
6836     0
        ..
7813     0
10955    0
905      1
5192     1
235      1
Name: Reached.on.Time_Y.N, Length: 7699, dtype: int64

## Training


In [14]:
# Shared seed so the stochastic estimators produce the same scores on every
# Restart & Run All. Estimators left unseeded either take no seed
# (K-Nearest Neighbors) or do not use one with their default settings.
SEED = 1

# Display name -> unfitted classifier. The names are left-padded to a common
# width so the printed progress and score lines align in a column.
models = {
    "                   Logistic Regression": LogisticRegression(),
    "                   K-Nearest Neighbors": KNeighborsClassifier(),
    "                         Decision Tree": DecisionTreeClassifier(random_state=SEED),
    "Support Vector Machine (Linear Kernel)": LinearSVC(random_state=SEED),
    "   Support Vector Machine (RBF Kernel)": SVC(),
    "                        Neural Network": MLPClassifier(random_state=SEED),
    "                         Random Forest": RandomForestClassifier(random_state=SEED),
    "                     Gradient Boosting": GradientBoostingClassifier(random_state=SEED),
    # The target is binary, so use the binary log-loss metric rather than
    # the multiclass 'mlogloss'.
    "                               XGBoost": XGBClassifier(eval_metric='logloss', random_state=SEED),
    "                              LightGBM": LGBMClassifier(random_state=SEED),
    "                              CatBoost": CatBoostClassifier(verbose=0, random_seed=SEED)
}

# Fit every classifier on the same training split, reporting progress as we go
for name, model in models.items():
    model.fit(X_train, y_train)
    print(name + " trained.")

                   Logistic Regression trained.
                   K-Nearest Neighbors trained.
                         Decision Tree trained.
Support Vector Machine (Linear Kernel) trained.
   Support Vector Machine (RBF Kernel) trained.
                        Neural Network trained.
                         Random Forest trained.
                     Gradient Boosting trained.
                               XGBoost trained.
                              LightGBM trained.
                              CatBoost trained.


## Results

In [15]:
# Report each fitted model's score on the held-out test split as a percentage
for name, model in models.items():
    test_score = model.score(X_test, y_test)
    print(f"{name}: {test_score * 100:.2f}%")

                   Logistic Regression: 63.55%
                   K-Nearest Neighbors: 63.67%
                         Decision Tree: 65.33%
Support Vector Machine (Linear Kernel): 63.70%
   Support Vector Machine (RBF Kernel): 65.12%
                        Neural Network: 64.06%
                         Random Forest: 64.94%
                     Gradient Boosting: 68.03%
                               XGBoost: 68.58%
                              LightGBM: 66.91%
                              CatBoost: 66.09%
