In [1]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
%matplotlib inline
#import seaborn as sns


from sklearn.preprocessing import LabelEncoder, StandardScaler

from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.svm import SVR

from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error

In [2]:
import warnings
warnings.simplefilter(action='ignore')

In [3]:
# train dataset
df = pd.read_csv("FMCG_data.csv")

# test dataset
df_test = pd.read_csv("FMCG_data.csv")

In [None]:
df

In [4]:

df.nunique().sort_values(ascending=False)

product_wg_ton                  65179
WH_Manager_ID                   25000
Ware_house_ID                   25000
No_of_retailers                  9068
dist_from_hub                     217
Number_of_distributors            138
workers_num                       122
storage_issue_reported             73
govt_check                         70
wh_breakdown                       47
wh_est_year                        28
Refill_Requests                    27
transport_issue                    23
No_of_Competitor                   23
Date                               14
approved_wh_govt_certificate        6
WH_regional_zone                    6
zone                                4
WH_capacity_size                    3
Location_type                       2
flood_proof                         2
flood_impacted                      2
temp_reg_mach                       2
Warehouse_Ownership                 2
electric_supply                     2
dtype: int64

In [5]:
df.drop(columns=['Ware_house_ID','WH_Manager_ID','Date'], inplace = True)

In [6]:
df['workers_num'].median()

37.0

In [7]:
df['approved_wh_govt_certificate'].mode()

0    C
Name: approved_wh_govt_certificate, dtype: object

In [8]:
df['approved_wh_govt_certificate'].fillna(df['approved_wh_govt_certificate'].mode()[0], inplace=True)

In [9]:
df.isnull().sum()

Location_type                        0
WH_capacity_size                     0
zone                                 0
WH_regional_zone                     0
Refill_Requests                      0
transport_issue                      0
No_of_Competitor                     0
No_of_retailers                      0
Warehouse_Ownership                  0
Number_of_distributors               0
flood_impacted                       0
flood_proof                          0
electric_supply                      0
dist_from_hub                        0
workers_num                        990
wh_est_year                     166335
storage_issue_reported               0
temp_reg_mach                        1
approved_wh_govt_certificate         0
wh_breakdown                         1
govt_check                           1
product_wg_ton                       1
dtype: int64

In [10]:
df.nunique().sort_values(ascending= True)

Location_type                       2
temp_reg_mach                       2
electric_supply                     2
flood_proof                         2
Warehouse_Ownership                 2
flood_impacted                      2
WH_capacity_size                    3
zone                                4
WH_regional_zone                    6
approved_wh_govt_certificate        6
No_of_Competitor                   23
transport_issue                    23
Refill_Requests                    27
wh_est_year                        28
wh_breakdown                       47
govt_check                         70
storage_issue_reported             73
workers_num                       122
Number_of_distributors            138
dist_from_hub                     217
No_of_retailers                  9068
product_wg_ton                  65179
dtype: int64

In [11]:
for i in df.columns[df.nunique()<=4]:
    print("Number of unique values in column ", i, ' :', len(df[i].unique()))
    print(df[i].value_counts())
    print(' ')

Number of unique values in column  Location_type  : 2
Location_type
Rural    321398
Urban     28603
Name: count, dtype: int64
 
Number of unique values in column  WH_capacity_size  : 3
WH_capacity_size
Large    142366
Mid      140280
Small     67355
Name: count, dtype: int64
 
Number of unique values in column  zone  : 4
zone
North    143892
West     111035
South     89068
East       6006
Name: count, dtype: int64
 
Number of unique values in column  Warehouse_Ownership  : 2
Warehouse_Ownership
Company Owned    190092
Rented           159909
Name: count, dtype: int64
 
Number of unique values in column  flood_impacted  : 2
flood_impacted
0    315645
1     34356
Name: count, dtype: int64
 
Number of unique values in column  flood_proof  : 2
flood_proof
0    330876
1     19125
Name: count, dtype: int64
 
Number of unique values in column  electric_supply  : 2
electric_supply
1    229909
0    120092
Name: count, dtype: int64
 
Number of unique values in column  temp_reg_mach  : 3
temp_reg

In [12]:
def count_outliers(df):
    outliers={}
    for i in df.columns:
        if pd.api.types.is_numeric_dtype(df[i]):
            q1 = df[i].quantile(0.25)
            q3 = df[i].quantile(0.75)
            iqr = q3 - q1
            lower = q1 - 1.5*iqr
            upper = q3 + 1.5*iqr
            outliers[i] = df[(df[i] < lower) | (df[i] > upper)].shape[0]
        else:
            pass
    return outliers

In [13]:
outlier_counts = count_outliers(df)
outlier_counts

{'Refill_Requests': 4159,
 'transport_issue': 3852,
 'No_of_Competitor': 16980,
 'No_of_retailers': 7980,
 'Number_of_distributors': 494,
 'flood_impacted': 34356,
 'flood_proof': 19125,
 'electric_supply': 0,
 'dist_from_hub': 0,
 'workers_num': 8332,
 'wh_est_year': 0,
 'storage_issue_reported': 932,
 'temp_reg_mach': 0,
 'wh_breakdown': 436,
 'govt_check': 114,
 'product_wg_ton': 1327}

In [14]:
df['product_wg_ton'].corr(df['flood_proof'])

-0.0013781060322405482

In [15]:
df['product_wg_ton'].corr(df['flood_impacted'])

-0.0022768441325800573

In [16]:
df.drop(columns=['flood_proof', 'flood_impacted'], inplace = True)

In [17]:
def remove_outliers(df):
    num_df = df.select_dtypes(include=['int', 'float'])
    q1 = num_df.quantile(0.25)
    q3 = num_df.quantile(0.75)
    iqr = q3 - q1
    lower = q1 - 1.5*iqr
    upper = q3 + 1.5*iqr
    df_no_outliers = num_df[((num_df >= lower) & (num_df <= upper)).all(axis=1)]
    return df.loc[df_no_outliers.index]

In [18]:
df = remove_outliers(df)
df.shape

(161071, 20)

In [19]:
df_test.nunique().sort_values(ascending=False)

product_wg_ton                  65179
WH_Manager_ID                   25000
Ware_house_ID                   25000
No_of_retailers                  9068
dist_from_hub                     217
Number_of_distributors            138
workers_num                       122
storage_issue_reported             73
govt_check                         70
wh_breakdown                       47
wh_est_year                        28
Refill_Requests                    27
transport_issue                    23
No_of_Competitor                   23
Date                               14
approved_wh_govt_certificate        6
WH_regional_zone                    6
zone                                4
WH_capacity_size                    3
Location_type                       2
flood_proof                         2
flood_impacted                      2
temp_reg_mach                       2
Warehouse_Ownership                 2
electric_supply                     2
dtype: int64

In [20]:
df_test.drop(columns=['Ware_house_ID', 'WH_Manager_ID'], inplace = True)

In [21]:
print(len(df_test[df_test.duplicated]))

0


In [22]:
# print percentage of null values in each column

null_values_features = [i for i in df_test.columns if df_test[i].isnull().sum()>1]
for i in null_values_features:
    print(i, np.round(df_test[i].isnull().mean()*100, 2), "% missing values")

workers_num 0.28 % missing values
wh_est_year 47.52 % missing values
approved_wh_govt_certificate 3.63 % missing values


In [23]:

df_test.drop(columns=['wh_est_year'], inplace = True)

In [24]:
df_test['workers_num'].median()

37.0

In [25]:
df_test['workers_num'].fillna(df_test['workers_num'].median(), inplace=True)

In [26]:
df_test['approved_wh_govt_certificate'].mode()

0    C
Name: approved_wh_govt_certificate, dtype: object

In [27]:
df_test['approved_wh_govt_certificate'].fillna(df_test['approved_wh_govt_certificate'].mode()[0], inplace=True)

In [28]:
df_test.isnull().sum()

Date                            0
Location_type                   0
WH_capacity_size                0
zone                            0
WH_regional_zone                0
Refill_Requests                 0
transport_issue                 0
No_of_Competitor                0
No_of_retailers                 0
Warehouse_Ownership             0
Number_of_distributors          0
flood_impacted                  0
flood_proof                     0
electric_supply                 0
dist_from_hub                   0
workers_num                     0
storage_issue_reported          0
temp_reg_mach                   1
approved_wh_govt_certificate    0
wh_breakdown                    1
govt_check                      1
product_wg_ton                  1
dtype: int64

In [29]:
df_test.nunique().sort_values(ascending= True)

temp_reg_mach                       2
Location_type                       2
electric_supply                     2
flood_proof                         2
flood_impacted                      2
Warehouse_Ownership                 2
WH_capacity_size                    3
zone                                4
WH_regional_zone                    6
approved_wh_govt_certificate        6
Date                               14
No_of_Competitor                   23
transport_issue                    23
Refill_Requests                    27
wh_breakdown                       47
govt_check                         70
storage_issue_reported             73
workers_num                       122
Number_of_distributors            138
dist_from_hub                     217
No_of_retailers                  9068
product_wg_ton                  65179
dtype: int64

In [30]:
for i in df_test.columns[df_test.nunique()<=4]:
    print("Number of unique values in column ", i, ' :', len(df_test[i].unique()))
    print(df_test[i].value_counts())
    print(' ')

Number of unique values in column  Location_type  : 2
Location_type
Rural    321398
Urban     28603
Name: count, dtype: int64
 
Number of unique values in column  WH_capacity_size  : 3
WH_capacity_size
Large    142366
Mid      140280
Small     67355
Name: count, dtype: int64
 
Number of unique values in column  zone  : 4
zone
North    143892
West     111035
South     89068
East       6006
Name: count, dtype: int64
 
Number of unique values in column  Warehouse_Ownership  : 2
Warehouse_Ownership
Company Owned    190092
Rented           159909
Name: count, dtype: int64
 
Number of unique values in column  flood_impacted  : 2
flood_impacted
0    315645
1     34356
Name: count, dtype: int64
 
Number of unique values in column  flood_proof  : 2
flood_proof
0    330876
1     19125
Name: count, dtype: int64
 
Number of unique values in column  electric_supply  : 2
electric_supply
1    229909
0    120092
Name: count, dtype: int64
 
Number of unique values in column  temp_reg_mach  : 3
temp_reg

In [31]:
df_test['product_wg_ton'].corr(df_test['storage_issue_reported'])

0.6887753290004746

In [32]:
df_test.drop(columns=['flood_proof', 'flood_impacted'], inplace = True)

In [33]:
def remove_outliers(df_test):
    num_df = df_test.select_dtypes(include=['int', 'float'])
    q1 = num_df.quantile(0.25)
    q3 = num_df.quantile(0.75)
    iqr = q3 - q1
    lower = q1 - 1.5*iqr
    upper = q3 + 1.5*iqr
    df_no_outliers = num_df[((num_df >= lower) & (num_df <= upper)).all(axis=1)]
    return df_test.loc[df_no_outliers.index]

In [34]:
df_test = remove_outliers(df_test)

In [35]:
df_test.shape

(308322, 20)

In [36]:
df_test.columns

Index(['Date', 'Location_type', 'WH_capacity_size', 'zone', 'WH_regional_zone',
       'Refill_Requests', 'transport_issue', 'No_of_Competitor',
       'No_of_retailers', 'Warehouse_Ownership', 'Number_of_distributors',
       'electric_supply', 'dist_from_hub', 'workers_num',
       'storage_issue_reported', 'temp_reg_mach',
       'approved_wh_govt_certificate', 'wh_breakdown', 'govt_check',
       'product_wg_ton'],
      dtype='object')

In [37]:
# Numerical columns & categorical columns in train dataset

num_columns_train=[i for i in df.columns if df[i].dtypes != 'O']
cat_columns_train=[i for i in df.columns if df[i].dtypes == 'O']

# Numerical columns & categorical columns in test dataset

num_columns_test=[i for i in df_test.columns if df_test[i].dtypes != 'O']
cat_columns_test=[i for i in df_test.columns if df_test[i].dtypes == 'O']


In [38]:
for i in cat_columns_train:
    df[i] = LabelEncoder().fit_transform(df[i])

for i in cat_columns_test:
    df_test[i] = LabelEncoder().fit_transform(df_test[i])

In [39]:
# display number of unique values in each column
df.nunique().sort_values(ascending=False)

product_wg_ton                  54049
No_of_retailers                  6267
dist_from_hub                     217
Number_of_distributors            115
govt_check                         63
workers_num                        60
storage_issue_reported             57
wh_breakdown                       38
wh_est_year                        28
Refill_Requests                    15
transport_issue                    11
No_of_Competitor                   10
approved_wh_govt_certificate        6
WH_regional_zone                    6
zone                                4
WH_capacity_size                    3
Location_type                       2
temp_reg_mach                       2
Warehouse_Ownership                 2
electric_supply                     2
dtype: int64

In [40]:
columns_with_more_than_two_unique_values_train = df.columns[df.nunique()>2]

In [41]:
columns_with_more_than_two_unique_values_test = df_test.columns[df_test.nunique()>2]

In [42]:

X_train = df.drop(['product_wg_ton','wh_est_year', 'WH_regional_zone'], axis=1)
Y_train = df['product_wg_ton']

In [None]:
X_train.columns

In [44]:
X_test = df_test.drop(['product_wg_ton','WH_regional_zone','Date'], axis=1)
X_test

Unnamed: 0,Location_type,WH_capacity_size,zone,Refill_Requests,transport_issue,No_of_Competitor,No_of_retailers,Warehouse_Ownership,Number_of_distributors,electric_supply,dist_from_hub,workers_num,storage_issue_reported,temp_reg_mach,approved_wh_govt_certificate,wh_breakdown,govt_check
0,1,2,3,3,1,2,4651,1,24,1,91,29.0,13,0.0,1,5.0,15.0
1,0,0,1,0,0,4,6217,0,47,1,210,31.0,4,0.0,1,3.0,17.0
2,0,1,2,1,0,4,4306,0,64,0,161,37.0,17,0.0,1,6.0,22.0
3,0,1,1,7,4,2,6000,1,50,0,103,21.0,17,1.0,2,3.0,27.0
4,0,0,1,3,1,2,4740,0,42,1,112,25.0,18,0.0,5,6.0,24.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
349995,0,2,1,11,0,7,6256,1,68,1,142,62.0,8,0.0,4,28.0,38.0
349996,0,1,3,4,7,1,5332,0,85,1,130,39.0,23,0.0,3,21.0,36.0
349997,1,0,2,12,4,0,4044,1,65,1,147,27.0,12,1.0,1,30.0,55.0
349999,0,1,3,3,3,4,4988,0,56,1,239,57.0,22,0.0,4,16.0,41.0


In [None]:
X_test.columns

In [None]:
Y_train
X_test 

In [None]:
LRmodel = LinearRegression()

In [None]:

LRmodel.fit(X_train, Y_train)

In [None]:
LR_predicted = LRmodel.predict(X_test)

In [None]:
LR_predicted

In [None]:
LR_predicted.shape

In [None]:
X_test.shape

In [None]:
predictions_1 = pd.DataFrame(LR_predicted, columns =['product_wg_ton_pred']) 
predictions_1

In [None]:
Y_train

In [None]:
predictions_1.to_csv('predictions_1.csv', index=False)


In [None]:
X_test

In [None]:
X_test.to_csv('X_test.csv', index=False)

In [None]:
df_merged = pd.concat([X_test, predictions_1], ignore_index=True, sort=False)
df_merged.head()

In [None]:
df2 = pd.read_csv("WH_opt_pred.csv")

In [None]:
df2


In [None]:
# with changes in Location and approved certificates 

In [None]:
import pandas as pd

In [None]:
df2 = pd.read_csv("WH_opt_pred.csv")

In [None]:
df2

In [None]:
df2['approved_wh_govt_certificate'] = df2['approved_wh_govt_certificate'].replace([5,3], [3, 2])
df2

In [None]:
df2.to_csv('newdata.csv', index=False)

In [None]:
# train dataset
df0 = pd.read_csv("newdata.csv")
df0


In [None]:
df = pd.read_csv("newdata.csv")

# test dataset
df_test = pd.read_csv("newdata.csv")

In [None]:
df_test.shape

In [None]:
df.shape

In [None]:
X_train = df
Y_train = df['product_wg_ton_pred']

In [None]:
X_train.shape

In [None]:
Y_train.shape

In [None]:
X_test = df_test
Y_test = df_test['product_wg_ton_pred']

In [None]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
%matplotlib inline
#import seaborn as sns


from sklearn.preprocessing import LabelEncoder, StandardScaler

from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.svm import SVR

from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error

In [None]:
LRmodel = LinearRegression()

In [None]:

LRmodel.fit(X_train, Y_train)

In [None]:
LR_predicted = LRmodel.predict(X_test)
LR_predicted

In [None]:
predictions_2 = pd.DataFrame(LR_predicted, columns =['product_wg_ton_pred']) 
predictions_2

In [None]:
Y_train

In [None]:
predictions_2.to_csv('predictions_2.csv', index=False)

In [None]:
df_merged2 = pd.concat([X_test_pred, predictions_2], ignore_index=True, sort=False)
df_merged2.head()

In [None]:
# r2 score
lr_r2 = r2_score(Y_test, LR_predicted).round(3)
print('R2 score:  ', lr_r2)

# root mean squared error (RMSE)
lr_rmse = np.sqrt(mean_squared_error(Y_test, LR_predicted)).round(3)
print('Root Mean Squared Error:  ', lr_rmse) 

# mean absolute error (MAE)
lr_mae = mean_absolute_error(Y_test, LR_predicted).round(3)
print('Mean Absolute Error:  ', lr_mae)


In [None]:
LR_predicted.to_csv('newdata.csv', index=False)

In [None]:
DTmodel = DecisionTreeRegressor(random_state = 0) 

In [None]:

DTmodel.fit(X_train, Y_train) 

In [None]:
DT_predicted = DTmodel.predict(X_test)

In [None]:
DT_predicted

In [None]:
# r2 score
dt_r2 = r2_score(Y_test, DT_predicted).round(3)
print('R2 score:  ', dt_r2)

# root mean squared error (RMSE)
dt_rmse = np.sqrt(mean_squared_error(Y_test, DT_predicted)).round(3)
print('Root Mean Squared Error:  ', dt_rmse) 

# mean absolute error (MAE)
dt_mae = mean_absolute_error(Y_test, DT_predicted).round(3)
print('Mean Absolute Error:  ', dt_mae)


In [None]:
RFmodel = RandomForestRegressor(n_estimators = 100, max_depth = 4, random_state = 0) 

In [None]:
RFmodel.fit(X_train, Y_train)

In [None]:
RF_predicted = RFmodel.predict(X_test)
RF_predicted

In [None]:

# r2 score
rf_r2 = r2_score(Y_test, RF_predicted).round(3)
print('R2 score:  ', rf_r2)

# root mean squared error (RMSE)
rf_rmse = np.sqrt(mean_squared_error(Y_test, RF_predicted)).round(3)
print('Root Mean Squared Error:  ', rf_rmse) 

# mean absolute error (MAE)
rf_mae = mean_absolute_error(Y_test, RF_predicted).round(3)
print('Mean Absolute Error:  ', rf_mae)

In [None]:

GBmodel = GradientBoostingRegressor()

In [None]:
GBmodel.fit(X_train,Y_train)

In [None]:
GB_predicted = GBmodel.predict(X_test)

In [None]:
GB_predicted

In [None]:
gb_r2 = r2_score(Y_test, GB_predicted).round(3)
print('R2 score:  ', gb_r2)

# root mean squared error (RMSE)
gb_rmse = np.sqrt(mean_squared_error(Y_test, GB_predicted)).round(3)
print('Root Mean Squared Error:  ', gb_rmse) 

# mean absolute error (MAE)
gb_mae = mean_absolute_error(Y_test, GB_predicted).round(3)
print('Mean Absolute Error:  ', gb_mae)

In [None]:
ax1 = plt.subplot2grid((2, 3), (0, 0))
plt.scatter(Y_test, LR_predicted, color='#ffdab9', alpha=0.3, label='Actual')
plt.plot([min(Y_test), max(Y_test)], [min(Y_test), max(Y_test)], linestyle='--', color='#2f4f4f', label='Predicted')
plt.title('Linear Regression')
plt.xlabel('Actual Values')
plt.ylabel('Predicted Values')
plt.legend()


ax1 = plt.subplot2grid((2, 3), (0, 1))
plt.scatter(Y_test, DT_predicted, color='#ffdab9', alpha=0.3)
plt.plot([min(Y_test), max(Y_test)], [min(Y_test), max(Y_test)], linestyle='--', color='#2f4f4f')
plt.title('Decision Tree')
plt.xlabel('Actual Values')
plt.ylabel('Predicted Values')



In [None]:
ax1 = plt.subplot2grid((2, 3), (1, 1))
plt.scatter(Y_test, GB_predicted, color='#ffdab9', alpha=0.3)  # alpha controls point transparency for better visualization
plt.plot([min(Y_test), max(Y_test)], [min(Y_test), max(Y_test)], linestyle='--',color='#2f4f4f')
plt.title('Gradient Boosting')
plt.xlabel('Actual Values')
plt.ylabel('Predicted Values')

plt.show()