# Ship Duration Prediction

# Libraries

In [1]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix

# Data

In [2]:
df=pd.read_csv('ship_data.csv')

In [3]:
df

Unnamed: 0,CPU_Brand,CPU_Model,GPU_Brand,GPU_Model,OS_Name,HDD_Storage,Name,SSD_Storage,Weight,RAM,Screen_size,Quantity,Total_Price,Priority,Ship_Duration
0,AMD,A10-Series 9600P,AMD,R5 430,Windows,1000,Zanjan,0,2.40,6,15.6,1,582000,C,9
1,AMD,A10-Series 9600P,AMD,R5 430,Windows,1000,Zanjan,0,2.40,6,15.6,1,600000,L,5
2,AMD,A10-Series 9600P,AMD,R5 430,Windows,1000,Isfahan,0,2.40,6,15.6,1,650000,M,2
3,AMD,A10-Series 9600P,AMD,R5 430,Windows,1000,Tehran,0,2.40,6,15.6,1,650000,C,7
4,AMD,A10-Series 9600P,AMD,R5 430,Windows,1000,Isfahan,0,2.40,6,15.6,1,650000,C,9
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
29995,Intel,Atom x5-Z8550,Intel,Graphics 400,Windows,0,Isfahan,64,0.69,4,10.1,1,1150000,C,8
29996,Intel,Atom x5-Z8550,Intel,Graphics 400,Windows,0,Zanjan,64,0.69,4,10.1,1,1200000,H,1
29997,Intel,Atom x5-Z8550,Intel,Graphics 400,Windows,0,Kermanshah,64,0.69,4,10.1,1,1200000,M,3
29998,Intel,Atom x5-Z8550,Intel,Graphics 400,Windows,0,Tehran,64,0.69,4,10.1,1,1200000,M,3


# Preprocessing

In [4]:
df.isnull().sum()

CPU_Brand        0
CPU_Model        0
GPU_Brand        0
GPU_Model        0
OS_Name          0
HDD_Storage      0
Name             0
SSD_Storage      0
Weight           0
RAM              0
Screen_size      0
Quantity         0
Total_Price      0
Priority         0
Ship_Duration    0
dtype: int64

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30000 entries, 0 to 29999
Data columns (total 15 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   CPU_Brand      30000 non-null  object 
 1   CPU_Model      30000 non-null  object 
 2   GPU_Brand      30000 non-null  object 
 3   GPU_Model      30000 non-null  object 
 4   OS_Name        30000 non-null  object 
 5   HDD_Storage    30000 non-null  int64  
 6   Name           30000 non-null  object 
 7   SSD_Storage    30000 non-null  int64  
 8   Weight         30000 non-null  float64
 9   RAM            30000 non-null  int64  
 10  Screen_size    30000 non-null  float64
 11  Quantity       30000 non-null  int64  
 12  Total_Price    30000 non-null  int64  
 13  Priority       30000 non-null  object 
 14  Ship_Duration  30000 non-null  int64  
dtypes: float64(2), int64(6), object(7)
memory usage: 3.4+ MB


In [6]:
df.corr()

  df.corr()


Unnamed: 0,HDD_Storage,SSD_Storage,Weight,RAM,Screen_size,Quantity,Total_Price,Ship_Duration
HDD_Storage,1.0,-0.422012,0.38853,0.328885,0.334264,0.007066,0.10054,0.010218
SSD_Storage,-0.422012,1.0,0.141896,0.518079,0.034198,-0.008466,0.176529,-0.003482
Weight,0.38853,0.141896,1.0,0.446906,0.873944,0.008334,0.232653,0.005353
RAM,0.328885,0.518079,0.446906,1.0,0.208364,-0.001036,0.277588,0.003948
Screen_size,0.334264,0.034198,0.873944,0.208364,1.0,0.010933,0.134081,0.002749
Quantity,0.007066,-0.008466,0.008334,-0.001036,0.010933,1.0,0.220255,0.013021
Total_Price,0.10054,0.176529,0.232653,0.277588,0.134081,0.220255,1.0,0.006509
Ship_Duration,0.010218,-0.003482,0.005353,0.003948,0.002749,0.013021,0.006509,1.0


In [8]:
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()

object_cols = ['CPU_Brand', 'CPU_Model', 'GPU_Brand', 'GPU_Model', 'OS_Name', 'Name', 'Priority']

for col in object_cols:
    original_values = df[col].copy()
    
    df[col] = le.fit_transform(df[col])
    
    encoded_df = pd.DataFrame({
        'Original': original_values,
        'Encoded': df[col]
    })
    
    encoded_df = encoded_df.drop_duplicates().set_index('Original')
    
    print(f"{col}:\n")
    print(encoded_df)
    print("\n" + "="*50 + "\n")


CPU_Brand:

          Encoded
Original         
AMD             0
Intel           1


CPU_Model:

                      Encoded
Original                     
A10-Series 9600P            0
A10-Series 9620P            1
A10-Series A10-9620P        2
A12-Series 9700P            3
A12-Series 9720P            4
A4-Series 7210              5
A6-Series 7310              6
A6-Series 9220              7
A6-Series A6-9220           8
A8-Series 7410              9
A9-Series 9410             10
A9-Series 9420             11
A9-Series A9-9420          12
E-Series 6110              16
E-Series 7110              17
E-Series 9000              18
E-Series 9000e             19
E-Series E2-6110           20
E-Series E2-9000           21
E-Series E2-9000e          22
FX 8800P                   23
FX 9830P                   24
Ryzen 1600                 25
Ryzen 1700                 26
Atom x5-Z8300              14
Atom X5-Z8350              13
Atom x5-Z8550              15


GPU_Brand:

          Encoded


# Model

In [9]:
X=df.drop(columns=['Ship_Duration'])

In [10]:
y=df['Ship_Duration']

In [13]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)

clf = RandomForestClassifier(n_estimators=100, random_state=42)

clf.fit(X_train, y_train)

y_pred = clf.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy}")

cm = confusion_matrix(y_test, y_pred)
print(f"Confusion Matrix: \n{cm}")
from sklearn.metrics import classification_report

report = classification_report(y_test, y_pred)

print(report)


Accuracy: 0.338
Confusion Matrix: 
[[158   0   0   0   0   0   0   0   0   0]
 [  0 138 114   0   0   0   0   0   0   0]
 [  0 122 109   0   0   0   0   0   0   0]
 [  0   0   0 111 139 125   0   0   0   0]
 [  0   0   0 159 119 154   0   0   0   0]
 [  0   0   0 126 133 117   0   0   0   0]
 [  0   0   0   0   0   0  81  77  80  75]
 [  0   0   0   0   0   0  73  49  79  74]
 [  0   0   0   0   0   0  73  71  55  71]
 [  0   0   0   0   0   0  66  82  93  77]]
              precision    recall  f1-score   support

           1       1.00      1.00      1.00       158
           2       0.53      0.55      0.54       252
           3       0.49      0.47      0.48       231
           4       0.28      0.30      0.29       375
           5       0.30      0.28      0.29       432
           6       0.30      0.31      0.30       376
           7       0.28      0.26      0.27       313
           8       0.18      0.18      0.18       275
           9       0.18      0.20      0.19    

In [19]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score

# Assuming 'X' is your feature set and 'y' is your target variable
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create a RandomForestClassifier
clf = RandomForestClassifier(random_state=42)

# Define the parameter values that should be searched
n_estimators_range = list(range(10, 200, 10))
test_size_range = [i/10.0 for i in range(1, 10)]

# Create a parameter grid: map the parameter names to the values that should be searched
param_grid = dict(n_estimators=n_estimators_range)

# Instantiate the grid
grid = GridSearchCV(clf, param_grid, cv=10, scoring='accuracy')

# Fit the grid with data
grid.fit(X_train, y_train)

# View the complete results
print(grid.cv_results_)

# Examine the best model
print("\nBest score: ", grid.best_score_)
print("Best params: ", grid.best_params_)


{'mean_fit_time': array([0.43909683, 0.88639314, 1.29969075, 1.71575673, 2.1343524 ,
       2.65527778, 2.76138775, 3.03521996, 3.36702988, 3.77867193,
       4.1123008 , 4.97508786, 5.19779298, 5.42239912, 6.0194612 ,
       6.56956351, 6.64901259, 6.79088933, 7.30226097]), 'std_fit_time': array([0.03230573, 0.04126854, 0.05270665, 0.04637605, 0.12456189,
       0.10221218, 0.15193123, 0.0713378 , 0.06047809, 0.09339597,
       0.04687648, 0.19821065, 0.26081039, 0.24433419, 0.32600101,
       0.29969431, 0.35218475, 0.08047419, 0.44674536]), 'mean_score_time': array([0.01232646, 0.02606051, 0.03529384, 0.04705644, 0.05541072,
       0.07492294, 0.07486467, 0.08144407, 0.09287517, 0.10469234,
       0.11478207, 0.13632629, 0.1393487 , 0.14527617, 0.1633378 ,
       0.17826214, 0.17862499, 0.18963766, 0.19947116]), 'std_score_time': array([0.00203393, 0.00328784, 0.00408329, 0.00400026, 0.00763598,
       0.00450042, 0.00735032, 0.00209174, 0.00732487, 0.00733971,
       0.00313297, 0.

In [16]:
data=pd.read_csv('ship_data.csv')

In [17]:
prediction_column=clf.predict(X)
prediction_column
data['Prediction']=prediction_column
data

Unnamed: 0,CPU_Brand,CPU_Model,GPU_Brand,GPU_Model,OS_Name,HDD_Storage,Name,SSD_Storage,Weight,RAM,Screen_size,Quantity,Total_Price,Priority,Ship_Duration,Prediction
0,AMD,A10-Series 9600P,AMD,R5 430,Windows,1000,Zanjan,0,2.40,6,15.6,1,582000,C,9,9
1,AMD,A10-Series 9600P,AMD,R5 430,Windows,1000,Zanjan,0,2.40,6,15.6,1,600000,L,5,5
2,AMD,A10-Series 9600P,AMD,R5 430,Windows,1000,Isfahan,0,2.40,6,15.6,1,650000,M,2,2
3,AMD,A10-Series 9600P,AMD,R5 430,Windows,1000,Tehran,0,2.40,6,15.6,1,650000,C,7,7
4,AMD,A10-Series 9600P,AMD,R5 430,Windows,1000,Isfahan,0,2.40,6,15.6,1,650000,C,9,9
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
29995,Intel,Atom x5-Z8550,Intel,Graphics 400,Windows,0,Isfahan,64,0.69,4,10.1,1,1150000,C,8,9
29996,Intel,Atom x5-Z8550,Intel,Graphics 400,Windows,0,Zanjan,64,0.69,4,10.1,1,1200000,H,1,1
29997,Intel,Atom x5-Z8550,Intel,Graphics 400,Windows,0,Kermanshah,64,0.69,4,10.1,1,1200000,M,3,3
29998,Intel,Atom x5-Z8550,Intel,Graphics 400,Windows,0,Tehran,64,0.69,4,10.1,1,1200000,M,3,3


In [18]:
data.to_csv('final_Ship_predicted.csv')