In [1]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
%matplotlib inline
#import seaborn as sns


from sklearn.preprocessing import LabelEncoder, StandardScaler

from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.svm import SVR

from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error

In [2]:
import warnings
warnings.simplefilter(action='ignore')

In [3]:
# train dataset
df = pd.read_csv("FMCG_data.csv")

# test dataset
df_test = pd.read_csv("FMCG_data.csv")

In [4]:
df.head()

Unnamed: 0,Date,Ware_house_ID,WH_Manager_ID,Location_type,WH_capacity_size,zone,WH_regional_zone,Refill_Requests,transport_issue,No_of_Competitor,...,electric_supply,dist_from_hub,workers_num,wh_est_year,storage_issue_reported,temp_reg_mach,approved_wh_govt_certificate,wh_breakdown,govt_check,product_wg_ton
0,01-01-2023,WH_100000,EID_50000,Urban,Small,West,Zone 6,3,1,2,...,1,91,29.0,,13,0.0,A,5.0,15.0,17115.0
1,01-01-2023,WH_100001,EID_50001,Rural,Large,North,Zone 5,0,0,4,...,1,210,31.0,,4,0.0,A,3.0,17.0,5074.0
2,01-01-2023,WH_100002,EID_50002,Rural,Mid,South,Zone 2,1,0,4,...,0,161,37.0,,17,0.0,A,6.0,22.0,23137.0
3,01-01-2023,WH_100003,EID_50003,Rural,Mid,North,Zone 3,7,4,2,...,0,103,21.0,,17,1.0,A+,3.0,27.0,22115.0
4,01-01-2023,WH_100004,EID_50004,Rural,Large,North,Zone 5,3,1,2,...,1,112,25.0,2009.0,18,0.0,C,6.0,24.0,24071.0


In [5]:

df.nunique().sort_values(ascending=False)

product_wg_ton                  65179
WH_Manager_ID                   25000
Ware_house_ID                   25000
No_of_retailers                  9068
dist_from_hub                     217
Number_of_distributors            138
workers_num                       122
storage_issue_reported             73
govt_check                         70
wh_breakdown                       47
wh_est_year                        28
Refill_Requests                    27
transport_issue                    23
No_of_Competitor                   23
Date                               14
approved_wh_govt_certificate        6
WH_regional_zone                    6
zone                                4
WH_capacity_size                    3
Location_type                       2
flood_proof                         2
flood_impacted                      2
temp_reg_mach                       2
Warehouse_Ownership                 2
electric_supply                     2
dtype: int64

In [6]:
df.drop(columns=['Ware_house_ID','WH_Manager_ID','Date'], inplace = True)
df['workers_num'].median()
df['approved_wh_govt_certificate'].mode()
df['approved_wh_govt_certificate'].fillna(df['approved_wh_govt_certificate'].mode()[0], inplace=True)
df.isnull().sum()
df.nunique().sort_values(ascending= True)

Location_type                       2
temp_reg_mach                       2
electric_supply                     2
flood_proof                         2
Warehouse_Ownership                 2
flood_impacted                      2
WH_capacity_size                    3
zone                                4
WH_regional_zone                    6
approved_wh_govt_certificate        6
No_of_Competitor                   23
transport_issue                    23
Refill_Requests                    27
wh_est_year                        28
wh_breakdown                       47
govt_check                         70
storage_issue_reported             73
workers_num                       122
Number_of_distributors            138
dist_from_hub                     217
No_of_retailers                  9068
product_wg_ton                  65179
dtype: int64

In [7]:
def count_outliers(df):
    outliers={}
    for i in df.columns:
        if pd.api.types.is_numeric_dtype(df[i]):
            q1 = df[i].quantile(0.25)
            q3 = df[i].quantile(0.75)
            iqr = q3 - q1
            lower = q1 - 1.5*iqr
            upper = q3 + 1.5*iqr
            outliers[i] = df[(df[i] < lower) | (df[i] > upper)].shape[0]
        else:
            pass
    return outliers

In [8]:
outlier_counts = count_outliers(df)
outlier_counts

{'Refill_Requests': 4159,
 'transport_issue': 3852,
 'No_of_Competitor': 16980,
 'No_of_retailers': 7980,
 'Number_of_distributors': 494,
 'flood_impacted': 34356,
 'flood_proof': 19125,
 'electric_supply': 0,
 'dist_from_hub': 0,
 'workers_num': 8332,
 'wh_est_year': 0,
 'storage_issue_reported': 932,
 'temp_reg_mach': 0,
 'wh_breakdown': 436,
 'govt_check': 114,
 'product_wg_ton': 1327}

In [9]:
df['product_wg_ton'].corr(df['flood_proof'])
df['product_wg_ton'].corr(df['flood_impacted'])
df.drop(columns=['flood_proof', 'flood_impacted'], inplace = True)

In [10]:
def remove_outliers(df):
    num_df = df.select_dtypes(include=['int', 'float'])
    q1 = num_df.quantile(0.25)
    q3 = num_df.quantile(0.75)
    iqr = q3 - q1
    lower = q1 - 1.5*iqr
    upper = q3 + 1.5*iqr
    df_no_outliers = num_df[((num_df >= lower) & (num_df <= upper)).all(axis=1)]
    return df.loc[df_no_outliers.index]

In [11]:
df = remove_outliers(df)
df.shape

(161071, 20)

In [12]:
df_test.nunique().sort_values(ascending=False)

product_wg_ton                  65179
WH_Manager_ID                   25000
Ware_house_ID                   25000
No_of_retailers                  9068
dist_from_hub                     217
Number_of_distributors            138
workers_num                       122
storage_issue_reported             73
govt_check                         70
wh_breakdown                       47
wh_est_year                        28
Refill_Requests                    27
transport_issue                    23
No_of_Competitor                   23
Date                               14
approved_wh_govt_certificate        6
WH_regional_zone                    6
zone                                4
WH_capacity_size                    3
Location_type                       2
flood_proof                         2
flood_impacted                      2
temp_reg_mach                       2
Warehouse_Ownership                 2
electric_supply                     2
dtype: int64

In [13]:
df_test.drop(columns=['Ware_house_ID', 'WH_Manager_ID'], inplace = True)
print(len(df_test[df_test.duplicated]))

0


In [14]:
# print percentage of null values in each column

null_values_features = [i for i in df_test.columns if df_test[i].isnull().sum()>1]
for i in null_values_features:
    print(i, np.round(df_test[i].isnull().mean()*100, 2), "% missing values")

workers_num 0.28 % missing values
wh_est_year 47.52 % missing values
approved_wh_govt_certificate 3.63 % missing values


In [15]:
df_test.drop(columns=['wh_est_year'], inplace = True)
df_test['workers_num'].median()
df_test['workers_num'].fillna(df_test['workers_num'].median(), inplace=True)
df_test['approved_wh_govt_certificate'].mode()
df_test['approved_wh_govt_certificate'].fillna(df_test['approved_wh_govt_certificate'].mode()[0], inplace=True)
df_test.isnull().sum()
df_test.nunique().sort_values(ascending= True)

temp_reg_mach                       2
Location_type                       2
electric_supply                     2
flood_proof                         2
flood_impacted                      2
Warehouse_Ownership                 2
WH_capacity_size                    3
zone                                4
WH_regional_zone                    6
approved_wh_govt_certificate        6
Date                               14
No_of_Competitor                   23
transport_issue                    23
Refill_Requests                    27
wh_breakdown                       47
govt_check                         70
storage_issue_reported             73
workers_num                       122
Number_of_distributors            138
dist_from_hub                     217
No_of_retailers                  9068
product_wg_ton                  65179
dtype: int64

In [16]:
for i in df_test.columns[df_test.nunique()<=4]:
    print("Number of unique values in column ", i, ' :', len(df_test[i].unique()))
    print(df_test[i].value_counts())
    print(' ')

Number of unique values in column  Location_type  : 2
Location_type
Rural    321398
Urban     28603
Name: count, dtype: int64
 
Number of unique values in column  WH_capacity_size  : 3
WH_capacity_size
Large    142366
Mid      140280
Small     67355
Name: count, dtype: int64
 
Number of unique values in column  zone  : 4
zone
North    143892
West     111035
South     89068
East       6006
Name: count, dtype: int64
 
Number of unique values in column  Warehouse_Ownership  : 2
Warehouse_Ownership
Company Owned    190092
Rented           159909
Name: count, dtype: int64
 
Number of unique values in column  flood_impacted  : 2
flood_impacted
0    315645
1     34356
Name: count, dtype: int64
 
Number of unique values in column  flood_proof  : 2
flood_proof
0    330876
1     19125
Name: count, dtype: int64
 
Number of unique values in column  electric_supply  : 2
electric_supply
1    229909
0    120092
Name: count, dtype: int64
 
Number of unique values in column  temp_reg_mach  : 3
temp_reg

In [17]:
df_test['product_wg_ton'].corr(df_test['storage_issue_reported'])

0.6887753290004746

In [18]:
df_test.drop(columns=['flood_proof', 'flood_impacted'], inplace = True)

In [19]:
def remove_outliers(df_test):
    num_df = df_test.select_dtypes(include=['int', 'float'])
    q1 = num_df.quantile(0.25)
    q3 = num_df.quantile(0.75)
    iqr = q3 - q1
    lower = q1 - 1.5*iqr
    upper = q3 + 1.5*iqr
    df_no_outliers = num_df[((num_df >= lower) & (num_df <= upper)).all(axis=1)]
    return df_test.loc[df_no_outliers.index]

In [20]:
df_test = remove_outliers(df_test)

In [21]:
df_test.shape

(308322, 20)

In [22]:
df_test.columns

Index(['Date', 'Location_type', 'WH_capacity_size', 'zone', 'WH_regional_zone',
       'Refill_Requests', 'transport_issue', 'No_of_Competitor',
       'No_of_retailers', 'Warehouse_Ownership', 'Number_of_distributors',
       'electric_supply', 'dist_from_hub', 'workers_num',
       'storage_issue_reported', 'temp_reg_mach',
       'approved_wh_govt_certificate', 'wh_breakdown', 'govt_check',
       'product_wg_ton'],
      dtype='object')

In [23]:
# Numerical columns & categorical columns in train dataset

num_columns_train=[i for i in df.columns if df[i].dtypes != 'O']
cat_columns_train=[i for i in df.columns if df[i].dtypes == 'O']

# Numerical columns & categorical columns in test dataset

num_columns_test=[i for i in df_test.columns if df_test[i].dtypes != 'O']
cat_columns_test=[i for i in df_test.columns if df_test[i].dtypes == 'O']


In [24]:
for i in cat_columns_train:
    df[i] = LabelEncoder().fit_transform(df[i])

for i in cat_columns_test:
    df_test[i] = LabelEncoder().fit_transform(df_test[i])

In [None]:
df

In [25]:
# display number of unique values in each column
df.nunique().sort_values(ascending=False)

product_wg_ton                  54049
No_of_retailers                  6267
dist_from_hub                     217
Number_of_distributors            115
govt_check                         63
workers_num                        60
storage_issue_reported             57
wh_breakdown                       38
wh_est_year                        28
Refill_Requests                    15
transport_issue                    11
No_of_Competitor                   10
approved_wh_govt_certificate        6
WH_regional_zone                    6
zone                                4
WH_capacity_size                    3
Location_type                       2
temp_reg_mach                       2
Warehouse_Ownership                 2
electric_supply                     2
dtype: int64

In [26]:
columns_with_more_than_two_unique_values_train = df.columns[df.nunique()>2]

In [27]:
columns_with_more_than_two_unique_values_test = df_test.columns[df_test.nunique()>2]

#df['approved_wh_govt_certificate'] = df['approved_wh_govt_certificate'].replace([4,2], [2,5])

In [None]:
#without change

In [None]:
X_train = df.drop(['product_wg_ton','wh_est_year', 'WH_regional_zone'], axis=1)
Y_train = df['product_wg_ton']
X_test = df_test.drop(['product_wg_ton','WH_regional_zone','Date'], axis=1)
X_test.head()

Y_test = df_test['product_wg_ton']
Y_test

In [None]:
X_test = df_test.drop(['product_wg_ton','WH_regional_zone','Date'], axis=1)
X_test

In [None]:
LRmodel = LinearRegression()
LRmodel.fit(X_train, Y_train)
LR_y_predicted_nochange = LRmodel.predict(X_test)
LR_y_predicted_nochange

In [None]:
Y_test

In [None]:
predictions_without = pd.DataFrame(LR_y_predicted_nochange, columns =['product_wg_ton_nochange']) 
predictions_without

predictions_without.to_csv('LR_y_predicted_nochange.csv', index=False)
predictions_without

In [28]:
df['Location_type'] = df['Location_type'].replace([0,1])


In [None]:
#df_test['Location_type'] = df_test['Location_type'].replace([0,1])

In [None]:
df['approved_wh_govt_certificate'] = df['approved_wh_govt_certificate'].replace([3,5])
df['approved_wh_govt_certificate'] = df['approved_wh_govt_certificate'].replace([4,2])
df

In [None]:
# 1.Actual - Y_test
# 2.Without change predictions 
# 3.With change predictions 


In [29]:

X_train = df.drop(['product_wg_ton','wh_est_year', 'WH_regional_zone'], axis=1)
Y_train = df['product_wg_ton']
X_test = df_test.drop(['product_wg_ton','WH_regional_zone','Date'], axis=1)
X_test.head()

Y_test = df_test['product_wg_ton']
Y_test

0         17115.0
1          5074.0
2         23137.0
3         22115.0
4         24071.0
           ...   
349995    52057.0
349996    37853.0
349997    65888.0
349999    58351.0
350000    50619.0
Name: product_wg_ton, Length: 308322, dtype: float64

In [None]:
Y_train 
Y_test

In [30]:
X_test = df_test.drop(['product_wg_ton','WH_regional_zone','Date'], axis=1)
X_test



Unnamed: 0,Location_type,WH_capacity_size,zone,Refill_Requests,transport_issue,No_of_Competitor,No_of_retailers,Warehouse_Ownership,Number_of_distributors,electric_supply,dist_from_hub,workers_num,storage_issue_reported,temp_reg_mach,approved_wh_govt_certificate,wh_breakdown,govt_check
0,1,2,3,3,1,2,4651,1,24,1,91,29.0,13,0.0,1,5.0,15.0
1,0,0,1,0,0,4,6217,0,47,1,210,31.0,4,0.0,1,3.0,17.0
2,0,1,2,1,0,4,4306,0,64,0,161,37.0,17,0.0,1,6.0,22.0
3,0,1,1,7,4,2,6000,1,50,0,103,21.0,17,1.0,2,3.0,27.0
4,0,0,1,3,1,2,4740,0,42,1,112,25.0,18,0.0,5,6.0,24.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
349995,0,2,1,11,0,7,6256,1,68,1,142,62.0,8,0.0,4,28.0,38.0
349996,0,1,3,4,7,1,5332,0,85,1,130,39.0,23,0.0,3,21.0,36.0
349997,1,0,2,12,4,0,4044,1,65,1,147,27.0,12,1.0,1,30.0,55.0
349999,0,1,3,3,3,4,4988,0,56,1,239,57.0,22,0.0,4,16.0,41.0


In [31]:
LRmodel = LinearRegression()
LRmodel.fit(X_train, Y_train)
LR_y_predicted_change = LRmodel.predict(X_test)
LR_y_predicted_change

array([19295.10332684, 11702.25755155, 26164.56635558, ...,
       35972.10947639, 35913.51369333, 32379.85968161])

In [32]:
LR_y_predicted_change = LRmodel.predict(X_test)
LR_y_predicted_change


array([19295.10332684, 11702.25755155, 26164.56635558, ...,
       35972.10947639, 35913.51369333, 32379.85968161])

In [33]:
predictions_change = pd.DataFrame(LR_y_predicted_change, columns =['product_wg_ton_change']) 
predictions_change

predictions_change.to_csv('LR_y_predicted_change.csv', index=False)
predictions_change

Unnamed: 0,product_wg_ton_change
0,19295.103327
1,11702.257552
2,26164.566356
3,24359.855996
4,22391.965315
...,...
308317,30781.913165
308318,39138.344046
308319,35972.109476
308320,35913.513693


In [None]:
predictions_1 = pd.DataFrame(LR_y_predicted, columns =['product_wg_ton_pred1']) 
predictions_1

predictions_1.to_csv('predictions_1.csv', index=False)
predictions_1


In [None]:
Y_test
test = pd.DataFrame(Y_test) 
test

test.to_csv('test.csv', index=False)
test

In [None]:
predictions_df = pd.DataFrame(RF_predicted)

# Write the DataFrame to a CSV file
predictions_df.to_csv('predictions_1a.csv', index=False)

In [None]:
predictions_df = pd.DataFrame(predictions_1)

# Write the DataFrame to a CSV file
predictions_df.to_csv('predictions_0.csv', index=False)

In [None]:
# Evaluate  the model

# r2 score
lr_r2 = r2_score(Y_test, LR_y_predicted).round(3)
print('R2 score:  ', lr_r2)

# root mean squared error (RMSE)
lr_rmse = np.sqrt(mean_squared_error(Y_test, LR_y_predicted)).round(3)
print('Root Mean Squared Error:  ', lr_rmse) 

# mean absolute error (MAE)
lr_mae = mean_absolute_error(Y_test, LR_y_predicted).round(3)
print('Mean Absolute Error:  ', lr_mae)


In [None]:

DTmodel = DecisionTreeRegressor(random_state = 0) 


In [None]:
DTmodel.fit(X_train, Y_train)

In [None]:
DT_predicted = DTmodel.predict(X_test)
DT_predicted

In [None]:
predictions_df = pd.DataFrame(DT_predicted)

# Write the DataFrame to a CSV file
predictions_df.to_csv('predictions_2a.csv', index=False)

In [None]:
# Evaluate  the model

# r2 score
dt_r2 = r2_score(Y_test, DT_predicted).round(3)
print('R2 score:  ', dt_r2)

# root mean squared error (RMSE)
dt_rmse = np.sqrt(mean_squared_error(Y_test, DT_predicted)).round(3)
print('Root Mean Squared Error:  ', dt_rmse) 

# mean absolute error (MAE)
dt_mae = mean_absolute_error(Y_test, DT_predicted).round(3)
print('Mean Absolute Error:  ', dt_mae)

In [None]:
#Random Forest

In [None]:
# Initialize the model

RFmodel = RandomForestRegressor(n_estimators = 100, max_depth = 4, random_state = 0) 

In [None]:
# Fit model to the training data

RFmodel.fit(X_train, Y_train)

In [None]:
# Make predictions on the test data

RF_predicted = RFmodel.predict(X_test)
RF_predicted

In [None]:
predictions_df = pd.DataFrame(RF_predicted)

# Write the DataFrame to a CSV file
predictions_df.to_csv('predictions_1a.csv', index=False)

In [None]:
# Evaluate  the model

# r2 score
rf_r2 = r2_score(Y_test, RF_predicted).round(3)
print('R2 score:  ', rf_r2)

# root mean squared error (RMSE)
rf_rmse = np.sqrt(mean_squared_error(Y_test, RF_predicted)).round(3)
print('Root Mean Squared Error:  ', rf_rmse) 

# mean absolute error (MAE)
rf_mae = mean_absolute_error(Y_test, RF_predicted).round(3)
print('Mean Absolute Error:  ', rf_mae)


In [None]:
# Initialize the model

GBmodel = GradientBoostingRegressor()

In [None]:
# Fit model to the training data

GBmodel.fit(X_train,Y_train)

In [None]:
# Make predictions on the test data

GB_predicted = GBmodel.predict(X_test)
GB_predicted

In [None]:
predictions_df = pd.DataFrame(GB_predicted)

# Write the DataFrame to a CSV file
predictions_df.to_csv('predictions_1a.csv', index=False)

In [None]:
# Evaluate  the model

# r2 score
gb_r2 = r2_score(Y_test, GB_predicted).round(3)
print('R2 score:  ', gb_r2)

# root mean squared error (RMSE)
gb_rmse = np.sqrt(mean_squared_error(Y_test, GB_predicted)).round(3)
print('Root Mean Squared Error:  ', gb_rmse) 

# mean absolute error (MAE)
gb_mae = mean_absolute_error(Y_test, GB_predicted).round(3)
print('Mean Absolute Error:  ', gb_mae)

In [None]:
predictions = model.predict(data)

# Create a DataFrame from the predictions
predictions_df = pd.DataFrame(predictions)

# Write the DataFrame to a CSV file
predictions_df.to_csv('predictions.csv', index=False)

In [None]:
GB_predicted

In [None]:
predictions_2.to_csv('predictions_2.csv', index=False)

In [None]:
d = pd.read_csv("df_test1_pred.csv")
d

In [None]:
df_merged2 = pd.concat([d, predictions_2], ignore_index=True, sort=False)
df_merged2.head()

In [None]:
workers_num                        990
wh_est_year                     166335
temp_reg_mach                        1
temp_reg_mach
wh_breakdown                         1
govt_check                           1

In [None]:
df_cert_test = df_cert_test['product_wg_ton'].dropna(how='all')

In [None]:
df_cert_test.isnull().sum()

In [None]:
df_cert.head()

In [None]:
X_train = df_cert
Y_train = df_cert['product_wg_ton']

In [None]:
X_train.head()
Y_train 

In [None]:
Y_train.head()

In [None]:
Y_train.shape

In [None]:
df_cert_test

In [None]:
X_test = n_df_test
Y_test = n_df_test['product_wg_ton']

Y_test
X_test 

In [None]:
LRmodel = LinearRegression()

In [None]:

LRmodel.fit(X_train, Y_train)

In [None]:
LR_predicted = LRmodel.predict(X_test)
LR_predicted