## Importing Libraries and Datasets

### Import libraries

In [1]:
import numpy as np
import pandas as pd
from statistics import mean

In [2]:
from sklearn.preprocessing import MultiLabelBinarizer

### Explore Data

In [3]:
# Read the training spreadsheet, report its (rows, columns) shape and preview
# the first rows.
Train_set = pd.read_excel('Data_Train.xlsx')
print(Train_set.shape)
Train_set.head()

(11094, 9)


Unnamed: 0,Restaurant,Location,Cuisines,Average_Cost,Minimum_Order,Rating,Votes,Reviews,Delivery_Time
0,ID_6321,"FTI College, Law College Road, Pune","Fast Food, Rolls, Burger, Salad, Wraps",₹200,₹50,3.5,12,4,30 minutes
1,ID_2882,"Sector 3, Marathalli","Ice Cream, Desserts",₹100,₹50,3.5,11,4,30 minutes
2,ID_1595,Mumbai Central,"Italian, Street Food, Fast Food",₹150,₹50,3.6,99,30,65 minutes
3,ID_5929,"Sector 1, Noida","Mughlai, North Indian, Chinese",₹250,₹99,3.7,176,95,30 minutes
4,ID_6123,"Rmz Centennial, I Gate, Whitefield","Cafe, Beverages",₹200,₹99,3.2,521,235,65 minutes


In [4]:
# Read the test spreadsheet, report its (rows, columns) shape and preview the
# first rows.
Test_set = pd.read_excel('Data_Test.xlsx')
print(Test_set.shape)
Test_set.head()

(2774, 8)


Unnamed: 0,Restaurant,Location,Cuisines,Average_Cost,Minimum_Order,Rating,Votes,Reviews
0,ID_2842,"Mico Layout, Stage 2, BTM Layout,Bangalore","North Indian, Chinese, Assamese",₹350,₹50,4.2,361,225
1,ID_730,"Mico Layout, Stage 2, BTM Layout,Bangalore","Biryani, Kebab",₹100,₹50,NEW,-,-
2,ID_4620,"Sector 1, Noida",Fast Food,₹100,₹50,3.6,36,16
3,ID_5470,"Babarpur, New Delhi, Delhi","Mithai, North Indian, Chinese, Fast Food, Sout...",₹200,₹50,3.6,66,33
4,ID_3249,"Sector 1, Noida","Chinese, Fast Food",₹150,₹50,2.9,38,14


In [5]:
# Read the sample submission file and show its first two rows.
Sample = pd.read_excel('Sample_Submission.xlsx')
Sample.head(2)

Unnamed: 0,Delivery_Time
0,120 minutes
1,20 minutes


In [6]:
# Number of distinct values in each column of the training data.
Train_set.nunique()

Restaurant       7480
Location           35
Cuisines         2179
Average_Cost       26
Minimum_Order      18
Rating             33
Votes            1103
Reviews           761
Delivery_Time       7
dtype: int64

In [7]:
# Number of distinct values in each column of the test data.
Test_set.nunique()

Restaurant       2401
Location           35
Cuisines          881
Average_Cost       19
Minimum_Order       9
Rating             30
Votes             580
Reviews           392
dtype: int64

In [8]:
# Feature columns: every row, first eight columns (by position).
X_train = Train_set.iloc[:, :8]

In [9]:
# Target column: every row of the ninth column (position 8).
y_train = Train_set.iloc[:, 8]

In [10]:
# Preview the feature frame.
X_train.head()

Unnamed: 0,Restaurant,Location,Cuisines,Average_Cost,Minimum_Order,Rating,Votes,Reviews
0,ID_6321,"FTI College, Law College Road, Pune","Fast Food, Rolls, Burger, Salad, Wraps",₹200,₹50,3.5,12,4
1,ID_2882,"Sector 3, Marathalli","Ice Cream, Desserts",₹100,₹50,3.5,11,4
2,ID_1595,Mumbai Central,"Italian, Street Food, Fast Food",₹150,₹50,3.6,99,30
3,ID_5929,"Sector 1, Noida","Mughlai, North Indian, Chinese",₹250,₹99,3.7,176,95
4,ID_6123,"Rmz Centennial, I Gate, Whitefield","Cafe, Beverages",₹200,₹99,3.2,521,235


In [11]:
# Preview the target series.
y_train.head()

0    30 minutes
1    30 minutes
2    65 minutes
3    30 minutes
4    65 minutes
Name: Delivery_Time, dtype: object

In [12]:
def non_numerals(series):
    """Return the distinct values of ``series`` that ``float()`` cannot parse.

    Parameters
    ----------
    series : pandas.Series
        Column to inspect; only its ``unique()`` values are examined.

    Returns
    -------
    list
        The unique values for which ``float(value)`` raises ``ValueError`` or
        ``TypeError``, in the order ``series.unique()`` yields them.
    """
    rejected = []
    for value in series.unique():
        try:
            float(value)
        except (TypeError, ValueError):
            # Only conversion failures mean "not a number"; anything else
            # (e.g. KeyboardInterrupt) must propagate rather than be swallowed.
            rejected.append(value)
    return rejected

In [13]:
# List, per column, the distinct raw values that do not parse as numbers.
for column in ('Average_Cost', 'Minimum_Order', 'Rating', 'Votes', 'Reviews'):
    print(column + '\n', non_numerals(X_train[column]))

Average_Cost
 ['₹200', '₹100', '₹150', '₹250', '₹650', '₹350', '₹800', '₹50', '₹400', '₹600', '₹300', '₹750', '₹450', '₹550', '₹1,000', '₹500', '₹900', '₹1,200', '₹950', '₹850', '₹700', '₹1,150', 'for', '₹1,100', '₹1,400', '₹2,050']
Minimum_Order
 ['₹50', '₹99', '₹0', '₹200', '₹450', '₹350', '₹79', '₹400', '₹199', '₹500', '₹250', '₹150', '₹90', '₹299', '₹300', '₹240', '₹89', '₹59']
Rating
 ['-', 'NEW', 'Opening Soon', 'Temporarily Closed']
Votes
 ['-']
Reviews
 ['-']


In [14]:
# Stack the training features on top of the test features so both are cleaned
# and encoded with the same categories. pd.concat replaces DataFrame.append,
# which was deprecated in pandas 1.4 and removed in pandas 2.0.
Combined_data = pd.concat([X_train, Test_set], ignore_index=True)
print(Combined_data.shape)
Combined_data.head()

(13868, 8)


Unnamed: 0,Restaurant,Location,Cuisines,Average_Cost,Minimum_Order,Rating,Votes,Reviews
0,ID_6321,"FTI College, Law College Road, Pune","Fast Food, Rolls, Burger, Salad, Wraps",₹200,₹50,3.5,12,4
1,ID_2882,"Sector 3, Marathalli","Ice Cream, Desserts",₹100,₹50,3.5,11,4
2,ID_1595,Mumbai Central,"Italian, Street Food, Fast Food",₹150,₹50,3.6,99,30
3,ID_5929,"Sector 1, Noida","Mughlai, North Indian, Chinese",₹250,₹99,3.7,176,95
4,ID_6123,"Rmz Centennial, I Gate, Whitefield","Cafe, Beverages",₹200,₹99,3.2,521,235


In [15]:
def _to_number(series, remove=(), zero_out=()):
    """Delete ``remove`` substrings, turn ``zero_out`` substrings into '0', parse as numbers."""
    # astype(str) lets already-numeric columns pass through unchanged, which
    # makes the cleaning safe to run more than once.
    text = series.astype(str)
    for token in remove:
        text = text.str.replace(token, '', regex=False)
    for token in zero_out:
        text = text.str.replace(token, '0', regex=False)
    return pd.to_numeric(text)


def clean_record(data):
    """Convert the money, rating and count columns from raw strings to numbers.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing ``Average_Cost``, ``Minimum_Order``, ``Rating``,
        ``Votes`` and ``Reviews``.

    Returns
    -------
    pandas.DataFrame
        A copy of ``data`` with those five columns numeric. The currency sign
        and thousands separator are stripped; the placeholders ``'for'``,
        ``'Opening Soon'``, ``'NEW'``, ``'Temporarily Closed'`` and ``'-'``
        become 0.

    Notes
    -----
    The input frame is not modified, so the calling cell can be re-run.
    Replacement is substring based, so a literal ``'-'`` inside any
    Rating/Votes/Reviews value is rewritten to ``'0'``.
    NOTE(review): assumes these columns never hold negative numbers -- confirm.
    """
    data = data.copy()

    data['Average_Cost'] = _to_number(data['Average_Cost'],
                                      remove=('₹', ','), zero_out=('for',))
    data['Minimum_Order'] = _to_number(data['Minimum_Order'], remove=('₹',))
    data['Rating'] = _to_number(
        data['Rating'],
        zero_out=('Opening Soon', 'NEW', 'Temporarily Closed', '-'),
    )
    data['Votes'] = _to_number(data['Votes'], zero_out=('-',))
    data['Reviews'] = _to_number(data['Reviews'], zero_out=('-',))

    return data

In [16]:
def clean_cuisines(data, strip_whitespace=False):
    """Replace the comma-separated ``Cuisines`` column with 0/1 indicator columns.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with a string ``Cuisines`` column such as ``'Fast Food, Rolls'``.
    strip_whitespace : bool, default False
        Splitting on ``','`` leaves a leading space on every cuisine after the
        first, so ``' Rolls'`` and ``'Rolls'`` become two different columns.
        Pass ``True`` to strip the labels and merge such duplicates. The
        default keeps the historical column set, so the number of generated
        features does not change for existing callers.

    Returns
    -------
    pandas.DataFrame
        A new frame without ``Cuisines`` and with one indicator column per
        distinct label. The input frame is left untouched.
    """
    # Work on a copy: pop() would otherwise remove the column from the caller's frame.
    data = data.copy()
    labels = data.pop('Cuisines').str.split(',', expand=False)
    # A missing entry is not a list after str.split and cannot be iterated by
    # MultiLabelBinarizer; treat it as "no cuisines".
    labels = labels.apply(lambda parts: list(parts) if hasattr(parts, '__len__') else [])
    if strip_whitespace:
        labels = labels.apply(lambda parts: [part.strip() for part in parts])

    mlb = MultiLabelBinarizer()
    # Reuse data's index so join() aligns row-for-row even when the frame does
    # not carry a default 0..n-1 index.
    indicators = pd.DataFrame(mlb.fit_transform(labels),
                              columns=mlb.classes_,
                              index=data.index)
    return data.join(indicators)

In [17]:
def encoding(data):
    """Integer-encode the ``Location`` and ``Restaurant`` columns.

    Each column is overwritten, on the frame passed in, with the codes that a
    freshly fitted ``LabelEncoder`` assigns to its values. The same frame is
    returned.
    """
    from sklearn.preprocessing import LabelEncoder

    # One independent encoder per column, fitted on that column only.
    for column in ('Location', 'Restaurant'):
        data[column] = LabelEncoder().fit_transform(data[column])
    return data

In [18]:
def clean_data(data):
    """Run the cleaning stages in order and return the resulting frame.

    The stages are ``clean_record`` (numeric columns), ``clean_cuisines``
    (cuisine indicators) and ``encoding`` (label-encoded categoricals).
    """
    for stage in (clean_record, clean_cuisines, encoding):
        data = stage(data)
    return data

In [19]:
# Clean a copy: the cleaning helpers assign into the frame they receive, so
# passing Combined_data itself would overwrite its raw strings and make this
# cell fail when run a second time.
cleaned_combined_data = clean_data(Combined_data.copy())
print(cleaned_combined_data.shape)
cleaned_combined_data.head()

(13868, 188)


Unnamed: 0,Restaurant,Location,Average_Cost,Minimum_Order,Rating,Votes,Reviews,Afghan,American,Andhra,...,Steak,Street Food,Sushi,Tamil,Tea,Thai,Tibetan,Turkish,Vietnamese,Wraps
0,5915,10,200,50,3.5,12,4,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,2093,30,100,50,3.5,11,4,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,663,19,150,50,3.6,99,30,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,5478,28,250,99,3.7,176,95,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,5695,26,200,99,3.2,521,235,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [20]:
# The combined frame was built as [training rows, test rows], so the boundary
# is the training-set length rather than a hard-coded row count.
n_train = len(Train_set)
X_train = cleaned_combined_data.iloc[:n_train, :]
test = cleaned_combined_data.iloc[n_train:, :]

In [21]:
def Del_time(data):
    """Convert delivery-time labels such as ``'30 minutes'`` to numbers.

    Parameters
    ----------
    data : pandas.Series
        Labels with a trailing ``' minutes'``, or values that are already
        numeric.

    Returns
    -------
    pandas.Series
        The numeric minutes, as parsed by ``pandas.to_numeric``.
    """
    # astype(str) makes the conversion idempotent: calling it on its own output
    # (e.g. when the cell is re-run) no longer fails on int values.
    minutes = data.astype(str).str.replace(' minutes', '', regex=False)
    return pd.to_numeric(minutes)

In [22]:
# Shape and preview of the cleaned training features.
print(X_train.shape)
X_train.head()

(11094, 188)


Unnamed: 0,Restaurant,Location,Average_Cost,Minimum_Order,Rating,Votes,Reviews,Afghan,American,Andhra,...,Steak,Street Food,Sushi,Tamil,Tea,Thai,Tibetan,Turkish,Vietnamese,Wraps
0,5915,10,200,50,3.5,12,4,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,2093,30,100,50,3.5,11,4,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,663,19,150,50,3.6,99,30,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,5478,28,250,99,3.7,176,95,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,5695,26,200,99,3.2,521,235,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [23]:
# Parse the target from the raw training frame instead of from y_train itself,
# so this cell gives the same result however many times it is run.
y_train = Del_time(Train_set['Delivery_Time'])
print(y_train.shape)
y_train.head()

(11094,)


0    30
1    30
2    65
3    30
4    65
Name: Delivery_Time, dtype: int64

In [24]:
# Shape and preview of the cleaned test features (index still continues from
# the combined frame).
print(test.shape)
test.head()

(2774, 188)


Unnamed: 0,Restaurant,Location,Average_Cost,Minimum_Order,Rating,Votes,Reviews,Afghan,American,Andhra,...,Steak,Street Food,Sushi,Tamil,Tea,Thai,Tibetan,Turkish,Vietnamese,Wraps
11094,2049,17,350,50,4.2,361,225,0,0,0,...,0,0,0,0,0,0,0,0,0,0
11095,7002,17,100,50,0.0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
11096,4025,28,100,50,3.6,36,16,0,0,0,...,0,0,0,0,0,0,0,0,0,0
11097,4969,1,200,50,3.6,66,33,0,0,0,...,0,0,0,0,0,0,0,0,0,0
11098,2501,28,150,50,2.9,38,14,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [25]:
# Renumber the test rows from 0, discarding the index inherited from the
# combined frame.
test = test.reset_index(drop=True)

In [26]:
# Preview the test features after the index reset.
test.head()

Unnamed: 0,Restaurant,Location,Average_Cost,Minimum_Order,Rating,Votes,Reviews,Afghan,American,Andhra,...,Steak,Street Food,Sushi,Tamil,Tea,Thai,Tibetan,Turkish,Vietnamese,Wraps
0,2049,17,350,50,4.2,361,225,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,7002,17,100,50,0.0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,4025,28,100,50,3.6,36,16,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,4969,1,200,50,3.6,66,33,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,2501,28,150,50,2.9,38,14,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [27]:
# Convert to plain NumPy arrays for scikit-learn. np.asarray accepts both
# pandas objects and arrays, so re-running this cell is harmless (the previous
# `.iloc[:, :].values` raised once the names already held arrays).
X_train = np.asarray(X_train)
test = np.asarray(test)
y_train = np.asarray(y_train)

In [28]:
from sklearn.preprocessing import StandardScaler

# Features and target each get their own scaler. Refitting a single scaler on
# the target discards the feature statistics, so it could no longer transform
# new feature rows.
feature_scaler = StandardScaler()
X_train = feature_scaler.fit_transform(X_train)
test = feature_scaler.transform(test)

# `sc` stays the name of the *target* scaler because the modelling cells call
# sc.inverse_transform(...) to turn predictions back into minutes.
sc = StandardScaler()
# StandardScaler expects a 2-D column; flatten again for the estimators.
y_train = sc.fit_transform(y_train.reshape(-1, 1)).ravel()

In [29]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the training rows for validation, with a fixed seed.
train_x, val_x, train_y, val_y = train_test_split(
    X_train,
    y_train,
    test_size=0.2,
    random_state=123,
)

In [30]:
# Shapes of the training/validation features and targets, one per line.
for part in (train_x, train_y, val_x, val_y):
    print(part.shape)

(8875, 188)
(8875,)
(2219, 188)
(2219,)


## Model Building

### SVR

In [31]:
from sklearn.svm import SVR
regressor = SVR(kernel='rbf')
regressor.fit(train_x, train_y)

# The scaler operates on 2-D (n_samples, n_features) arrays, so the 1-D
# predictions and targets are reshaped to one column before un-scaling and
# flattened afterwards. Passing them 1-D is rejected by current scikit-learn.
y_pred = sc.inverse_transform(regressor.predict(val_x).reshape(-1, 1)).ravel()
y_true = sc.inverse_transform(val_y.reshape(-1, 1)).ravel()

# Root-mean-squared error of log10(1 + minutes), reported as 1 - error so that
# higher is better.
error = np.square(np.log10(y_pred + 1) - np.log10(y_true + 1)).mean() ** 0.5
score = 1 - error

print("RMLSE Score = ", score)



RMLSE Score =  0.8986460709896282


### Random Forest Regressor

In [32]:
from sklearn.ensemble import RandomForestRegressor
regressor = RandomForestRegressor(n_estimators=300, random_state=0)
regressor.fit(train_x, train_y)

# The scaler operates on 2-D (n_samples, n_features) arrays, so the 1-D
# predictions and targets are reshaped to one column before un-scaling and
# flattened afterwards. Passing them 1-D is rejected by current scikit-learn.
y_pred = sc.inverse_transform(regressor.predict(val_x).reshape(-1, 1)).ravel()
y_true = sc.inverse_transform(val_y.reshape(-1, 1)).ravel()

# Root-mean-squared error of log10(1 + minutes), reported as 1 - error so that
# higher is better.
error = np.square(np.log10(y_pred + 1) - np.log10(y_true + 1)).mean() ** 0.5
score = 1 - error

print("RMLSE Score = ", score)

RMLSE Score =  0.9130707511757369


### Decision Tree Regressor

In [33]:
from sklearn.tree import DecisionTreeRegressor
regressor = DecisionTreeRegressor(random_state=0)
regressor.fit(train_x, train_y)

# The scaler operates on 2-D (n_samples, n_features) arrays, so the 1-D
# predictions and targets are reshaped to one column before un-scaling and
# flattened afterwards. Passing them 1-D is rejected by current scikit-learn.
y_pred = sc.inverse_transform(regressor.predict(val_x).reshape(-1, 1)).ravel()
y_true = sc.inverse_transform(val_y.reshape(-1, 1)).ravel()

# Root-mean-squared error of log10(1 + minutes), reported as 1 - error so that
# higher is better.
error = np.square(np.log10(y_pred + 1) - np.log10(y_true + 1)).mean() ** 0.5
score = 1 - error

print("RMLSE Score = ", score)

RMLSE Score =  0.8820482025665342


## Final Submissions

### RandomForestRegressor

In [37]:
from sklearn.ensemble import RandomForestRegressor
# Refit on the full training data (training + validation rows) for the submission.
model = RandomForestRegressor(n_estimators=300, random_state=0)
model.fit(X_train, y_train)

# Un-scale the predictions back to minutes. The scaler needs a 2-D column, so
# reshape before inverse_transform and flatten afterwards.
y_pred = sc.inverse_transform(model.predict(test).reshape(-1, 1)).ravel()

In [35]:
# Display the test-set predictions computed above.
y_pred

array([30., 65., 30., ..., 30., 30., 45.])

In [40]:
# Round to the nearest whole minute before casting: a bare astype(int)
# truncates, turning e.g. 44.9 into 44. np.ravel accepts the predictions
# whether they arrive as a flat array or as a single column.
pred_clf = pd.DataFrame({'Delivery_Time': np.rint(np.ravel(y_pred)).astype(int)})
# Restore the "<n> minutes" label format used by the sample submission.
pred_clf['Delivery_Time'] = pred_clf['Delivery_Time'].astype(str) + ' minutes'
pred_clf.to_excel("Rand_forest_sol2.xlsx", index = False )

### Neural Networks

In [29]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import LSTM

In [30]:
# Start an empty Keras Sequential model. This rebinds `model`, which the
# submission cell above used for the random forest.
model = Sequential()

In [35]:
# Hidden layer. The input width is taken from the feature matrix instead of a
# hard-coded 188, so it follows any change in the number of generated columns.
# NOTE(review): model.add appends a layer, so re-running this cell stacks
# another one -- recreate the model first.
model.add(Dense(units=100, activation='relu', input_dim=X_train.shape[1]))

In [37]:
# Regression head: a linear activation. The target is standardized, so it
# takes negative values and values above 1 that a sigmoid, bounded to (0, 1),
# can never output.
model.add(Dense(1, activation='linear'))

In [38]:
# Track mean absolute error next to the MSE loss. 'accuracy' is a
# classification metric and carries no meaning for a continuous target.
model.compile(optimizer='adam', loss='mean_squared_error', metrics=['mean_absolute_error'])

In [None]:
# Train the network on the scaled features/target with Keras' default fit settings.
model.fit(x=X_train, y=y_train)