<a href="https://colab.research.google.com/github/remart90/Python/blob/master/Restaurant%20Recommendation%20Challenge.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# My solution for 'Akeed Restaurant Recommendation Challenge'
# (https://zindi.africa/competitions/akeed-restaurant-recommendation-challenge)
# July-Aug 2020
# Ranked 49/242 at the private leaderboard

# Mounting the Drive
# Google Colab only: exposes Google Drive under /content/gdrive so the
# input CSV files can be read with ordinary file paths.
from google.colab import drive
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [None]:
# Some constants
# Folder with the input CSV files; the submission file is written here too
pth = '/content/gdrive/My Drive/Akeed/'

# When True, a fraction of the training rows is set aside as a delayed
# (hold-out) sample and used to scan F1 over probability cut-offs
use_delayed = False
# File name of the submission written under `pth`
submission_name = 'submission_1.csv'
# Number of PCA components kept for the one-hot encoded vendor tags
n_comp = 5


# Fraction of the training rows moved to the delayed sample
frac_size = 0.2
# Random seed shared by numpy, PCA, sampling, the train/validation split and LightGBM
rs = 222
# Name of the prediction column in the submission
rr = 'target'

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import lightgbm as lgb
from sklearn.metrics import f1_score
from matplotlib import pyplot as plt
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
import gc
from sklearn.cluster import MiniBatchKMeans
import datetime
import re

In [None]:
# Seed numpy's global random generator with the notebook-wide seed
np.random.seed(rs)

In [None]:
def lgbm_val(X_train, X_val, y_train, y_val, cat_features):
    """Train a binary LightGBM model with early stopping on a validation set.

    Parameters
    ----------
    X_train, y_train : training features and labels.
    X_val, y_val : validation features and labels; this is the only set
        monitored for early stopping (metric: AUC).
    cat_features : column names LightGBM should treat as categorical.

    Returns
    -------
    (pred_train, pred_val, lgbm) : model predictions for X_train and X_val,
        and the trained booster.
    """
    # create dataset
    train = lgb.Dataset(X_train, label=y_train, categorical_feature=cat_features)
    valid = lgb.Dataset(X_val, label=y_val, categorical_feature=cat_features)

    # parameter setting
    # The original dict set both 'feature_fraction' (0.8) and its alias
    # 'colsample_bytree' (0.75). LightGBM keeps the main-name value and warns
    # that the alias is ignored, so only 'feature_fraction' is kept here.
    params = {
        'boosting_type': 'gbdt',
        'metric': 'AUC',
        'objective': 'binary',
        'is_unbalance': True,
        'feature_fraction': 0.8,
        'bagging_fraction': 0.75,
        'bagging_freq': 10,
        'learning_rate': 0.1,
        'max_bin': 510,
        'random_seed': rs,
        'lambda_l1': 1.2,
    }

    # NOTE(review): `early_stopping_rounds` / `verbose_eval` are keyword
    # arguments of the LightGBM release this notebook was run with; newer
    # releases expect callbacks instead - TODO confirm the target version.
    lgbm = lgb.train(params,
                     train,
                     num_boost_round=3000,
                     early_stopping_rounds=300,
                     valid_sets=[valid],
                     verbose_eval=False)

    pred_train = lgbm.predict(X_train)
    pred_val = lgbm.predict(X_val)

    return pred_train, pred_val, lgbm


def get_distance(lat1, long1, lat2, long2):
  """Straight-line (Euclidean) distance between two points.

  The result is in the same units as the inputs; no spherical correction
  is applied.
  """
  delta_long = long2 - long1
  delta_lat = lat2 - lat1
  return np.sqrt(delta_long**2 + delta_lat**2)


def apply_age_group(input_f):
  """Map a birth year to an age-group code.

  Age is computed relative to 2019. Returns 0 for a missing (NaN) year,
  1 for under 18, 2 for 18-24, 3 for 25-34, 4 for 35-44, 5 for 45-54
  and 6 for everything else.
  """
  if np.isnan(input_f):
    return 0

  age = 2019 - input_f
  # (exclusive upper age bound, group code), checked in ascending order
  for upper_bound, group_code in ((18, 1), (25, 2), (35, 3), (45, 4), (55, 5)):
    if age < upper_bound:
      return group_code
  return 6


def parse_opening(input_s):
  """Split an 'open-close' string into its two parts.

  Returns a tuple (open, close):
  * ('not_available', 'not_available') when the value is missing, i.e. its
    string form is 'nan';
  * the first two '-'-separated pieces otherwise;
  * (value, 'not_available') when the value contains no '-' at all; the
    previous implementation raised IndexError in that case.
  """
  text = str(input_s)
  if text == 'nan':
    return 'not_available', 'not_available'

  parts = text.split('-')
  if len(parts) < 2:
    # No separator: keep what we have instead of crashing the whole column
    return parts[0], 'not_available'
  return parts[0], parts[1]


def account_days(input_s, m):
  """Return how far `input_s` lies before the reference value `m` (m - input_s)."""
  return m - input_s

# Credits to:
# https://www.kdnuggets.com/2018/12/feature-building-techniques-tricks-kaggle.html
# for the ideas of how to process geo coordinates

def haversine_array(lat1, lng1, lat2, lng2):
    """Haversine (great-circle) distance in km between two points.

    Coordinates are given in degrees; an Earth radius of 6371 km is used.
    """
    earth_radius_km = 6371
    phi1, lam1, phi2, lam2 = (np.radians(angle) for angle in (lat1, lng1, lat2, lng2))
    d_phi = phi2 - phi1
    d_lam = lam2 - lam1
    hav = np.sin(d_phi * 0.5) ** 2 + np.cos(phi1) * np.cos(phi2) * np.sin(d_lam * 0.5) ** 2
    return 2 * earth_radius_km * np.arcsin(np.sqrt(hav))

def dummy_manhattan_distance(lat1, lng1, lat2, lng2):
    """Approximate Manhattan distance: sum of two haversine legs from point 1.

    One leg changes only the longitude (latitude held at lat1), the other
    changes only the latitude (longitude held at lng1).
    """
    longitude_leg = haversine_array(lat1, lng1, lat1, lng2)
    latitude_leg = haversine_array(lat1, lng1, lat2, lng1)
    return longitude_leg + latitude_leg

def bearing_array(lat1, lng1, lat2, lng2):
    """Bearing from point 1 towards point 2, in degrees.

    Inputs are coordinates in degrees. The result comes from arctan2, so it
    lies in [-180, 180] with 0 pointing north and 90 pointing east.
    """
    # Only the longitude difference and the two latitudes enter the formula;
    # the unused Earth-radius constant and the radian conversions of
    # lng1/lng2 from the previous version were dead code.
    lng_delta_rad = np.radians(lng2 - lng1)
    lat1, lat2 = np.radians(lat1), np.radians(lat2)
    y = np.sin(lng_delta_rad) * np.cos(lat2)
    x = np.cos(lat1) * np.sin(lat2) - np.sin(lat1) * np.cos(lat2) * np.cos(lng_delta_rad)
    return np.degrees(np.arctan2(y, x))


# Element-wise wrappers (suffix `_v`) around the scalar helpers above.
# np.vectorize is a convenience wrapper that loops in Python; it is not a
# performance optimisation.
dummy_manhattan_distance_v = np.vectorize(dummy_manhattan_distance)
haversine_array_v = np.vectorize(haversine_array)
bearing_array_v = np.vectorize(bearing_array)


# apply_age_group and parse_opening branch on a single value, so they have
# to be applied one element at a time.
get_distance_v = np.vectorize(get_distance)
apply_age_group_v = np.vectorize(apply_age_group)
parse_opening_v = np.vectorize(parse_opening)
account_days_v = np.vectorize(account_days)

In [None]:
# Loading the dataframes
# All input files live directly under `pth`.

# Train / test customer tables
train_customers = pd.read_csv(f'{pth}train_customers.csv')
test_customers = pd.read_csv(f'{pth}test_customers.csv')

# Train / test location tables
train_locations = pd.read_csv(f'{pth}train_locations.csv')
test_locations = pd.read_csv(f'{pth}test_locations.csv')

# Orders, vendors and the submission template
orders = pd.read_csv(f'{pth}orders.csv')
vendors = pd.read_csv(f'{pth}vendors.csv')
SampleSubmission = pd.read_csv(f'{pth}SampleSubmission.csv')

  interactivity=interactivity, compiler=compiler, result=result)


In [None]:
# Renaming columns in a proper way
# Every table gets a one-letter prefix so its columns stay identifiable
# after the merges (C_ customers, O_ orders, V_ vendors, L_ locations),
# while the join keys are mapped back to shared, un-prefixed names.

customer_keys = {'C_akeed_customer_id': 'customer_id'}
location_keys = {'L_customer_id': 'customer_id', 'L_location_number': 'location_number'}
order_keys = {'O_akeed_order_id': 'order_id', 'O_customer_id': 'customer_id', 'O_vendor_id': 'vendor_id',
              'O_LOCATION_NUMBER': 'location_number', 'O_CID X LOC_NUM X VENDOR': 'CID X LOC_NUM X VENDOR'}
vendor_keys = {'V_id': 'vendor_id'}

# customers
test_customers = test_customers.add_prefix('C_').rename(columns=customer_keys)
train_customers = train_customers.add_prefix('C_').rename(columns=customer_keys)

# orders
orders = orders.add_prefix('O_').rename(columns=order_keys)

# vendors
vendors = vendors.add_prefix('V_').rename(columns=vendor_keys)

# locations
test_locations = test_locations.add_prefix('L_').rename(columns=location_keys)
train_locations = train_locations.add_prefix('L_').rename(columns=location_keys)

In [None]:
# Some feature engineering
# Replace order-level measurements by their per-vendor mean; the names of
# the resulting '<column>_by_vendors' features are collected in `q`.
q = []
q_tmp = []

# Columns averaged per vendor as they are
q1 = ['O_grand_total', 'O_vendor_discount_amount', 'O_deliverydistance']
# Columns whose gaps are first filled with the vendor mean (kept in a
# temporary '<column>_tmp' column) and then averaged per vendor
q2 = ['O_vendor_rating', 'O_item_count', 'O_promo_code_discount_percentage', 'O_preparationtime']

for source_col in q1:
  feature_col = source_col + '_by_vendors'
  orders[feature_col] = orders.groupby(['vendor_id'])[source_col].transform('mean')
  q.append(feature_col)

for source_col in q2:
  feature_col = source_col + '_by_vendors'
  filled_col = source_col + '_tmp'
  vendor_mean = orders.groupby(['vendor_id'])[source_col].transform('mean')
  orders[filled_col] = orders[source_col].fillna(vendor_mean)
  orders[feature_col] = orders.groupby(['vendor_id'])[filled_col].transform('mean')
  q.append(feature_col)
  q_tmp.append(filled_col)

# Only the aggregated features are kept; raw and temporary columns go away
orders.drop(columns=q1 + q2 + q_tmp, inplace=True)


In [None]:
# Processing 'vendor_tag_name' field that contains text descriptions
# Collect every distinct comma-separated tag. A set is filled in place
# (the previous `y = y + tt` rebuilt the list on every vendor).
tag_pool = set()

for item in vendors['V_vendor_tag_name']:
  if str(item) != 'nan':
    tag_pool.update(item.split(','))

# sorted() fixes the tag order, so the numbering of the V_tag_<n> columns
# built later is reproducible between kernel restarts (plain set iteration
# order for strings is not).
unique_tags = sorted(tag_pool)

# Vendors without any tag get an explicit placeholder
vendors['V_vendor_tag_name'] = vendors['V_vendor_tag_name'].fillna('no_tag')

# Split 'V_OpeningTime' into separate open / close columns
vendors['V_open_time'], vendors['V_close_time'] = parse_opening_v(vendors['V_OpeningTime'])

In [None]:
# Deleting columns 'V_sunday_from_time1' and all similar
# (the per-weekday opening-hour columns are not used by this solution)

cols_to_keep = ['vendor_id', 'V_authentication_id', 'V_latitude', 'V_longitude',
       'V_vendor_category_en', 'V_vendor_category_id', 'V_delivery_charge',
       'V_serving_distance', 'V_is_open', 'V_OpeningTime', 'V_OpeningTime2',
       'V_prepration_time', 'V_commission', 'V_is_akeed_delivering',
       'V_discount_percentage', 'V_status', 'V_verified', 'V_rank',
       'V_language', 'V_vendor_rating', 'V_primary_tags', 'V_open_close_flags',
       'V_vendor_tag', 'V_vendor_tag_name', 'V_one_click_vendor',
       'V_country_id', 'V_city_id', 'V_created_at', 'V_updated_at',
       'V_device_type', 'V_display_orders', 'V_open_time', 'V_close_time']


# .copy() gives an independent frame: new columns are assigned to `vendors`
# afterwards, which on a plain slice triggers SettingWithCopyWarning.
vendors = vendors[cols_to_keep].copy()

In [None]:
# Parsing 'vendor_tag_name'
# One 0/1 indicator column per tag. Membership is tested against the
# comma-split tag list of each vendor, the same split that produced
# `unique_tags`. The previous `str.contains(item)` treated the tag as a
# regular expression and matched substrings, so a tag was also flagged for
# vendors that only carry a longer tag containing it.
tag_lists = vendors['V_vendor_tag_name'].str.split(',')

cols_tags = ['V_tag_' + str(idx) for idx in range(len(unique_tags))]

for curr_col, item in zip(cols_tags, unique_tags):
  vendors[curr_col] = tag_lists.apply(
      lambda vendor_tags, tag=item: float(isinstance(vendor_tags, list) and tag in vendor_tags))

# Index of the last tag column, kept because the original cell exposed it
cnt = len(unique_tags) - 1

In [None]:
# Indicator matrix of the vendor tags (one row per vendor, one column per tag)
tags = vendors[cols_tags]

In [None]:
# Performing PCA to reduce the number of columns;
# Applied to 'vendor_tag_name'
# (the result keeps the historical 'svd' naming although PCA is used)

pca = PCA(n_components=n_comp, svd_solver='full', random_state=rs)

svd_tags_cols = ['V_svd_tag_' + str(item) for item in range(0, n_comp)]

# Reuse the index of `tags`: the components are joined back to `vendors`
# by index, and a fresh 0..n-1 index would misalign the rows whenever
# `vendors` does not carry a default RangeIndex.
svd_tags = pd.DataFrame(pca.fit_transform(tags), index=tags.index, columns=svd_tags_cols)

In [None]:
# Swap the raw tag indicator columns for their PCA components
vendors = pd.concat([vendors.drop(columns=cols_tags), svd_tags], axis=1)

In [None]:
# The tag indicator matrix is no longer needed once PCA has been applied
del tags

In [None]:
# Mark the origin of every customer so that the combined table can be split
# back later: 1 = train, 2 = test
train_customers['C_sample'] = 1

test_customers['C_sample'] = 2

# pd.concat replaces DataFrame.append, which was deprecated in pandas 1.4
# and removed in pandas 2.0
customers = pd.concat([train_customers, test_customers], ignore_index=True)


In [None]:
def clean_text(text):
    """Normalise free text: lowercase, no digits, no punctuation.

    Leading/trailing whitespace runs and line breaks become single spaces,
    digits are removed, every non-word / non-space character is replaced by
    a space, and the result is stripped. Inner runs of spaces are kept.
    """
    # lowercase
    text = text.lower()

    # Substituting line breaks to spaces
    # (raw string: '\s' in a normal string literal is an invalid escape
    # sequence and raises a warning on current Python versions)
    text = re.sub(r"^\s+|\n|\r|\s+$", ' ', text)

    # Deleting numbers
    text = re.sub(r'\d+', '', text)

    # Substituting punctuation (then deleting it)
    text = re.sub(r'[^\w\s]', ' ', text)

    # Deleting extra spaces
    text = text.strip()

    return text


def process_gender(s):
  """Encode a free-text gender value: 1 = female, 2 = male, 99 = anything else."""
  cleaned = clean_text(str(s))

  # 'female' has to be tested first because it contains 'male'
  if 'female' in cleaned:
    return 1
  if 'male' in cleaned:
    return 2
  return 99

# Element-wise wrapper around process_gender
process_gender_v = np.vectorize(process_gender)

# Numeric gender code derived from the free-text 'C_gender' column
customers['C_gender_coded'] = process_gender_v(customers['C_gender'])

In [None]:
# Age-group code computed from 'C_dob' by apply_age_group (0 = missing)
customers['C_age_group'] = apply_age_group_v(customers['C_dob'])

In [None]:
# Account age in days, measured back from the most recent account creation
# date in the data.
customers['C_created_at'] = pd.to_datetime(customers['C_created_at'])
m = customers['C_created_at'].max()

# Direct datetime arithmetic on the Series replaces the element-wise
# np.vectorize round trip; dividing by one day turns the timedeltas into
# float days (missing dates give NaN).
customers['C_account_diff'] = (m - customers['C_created_at']) / np.timedelta64(1, 'D')

In [None]:
# Stack train and test locations into one table
# (pd.concat replaces DataFrame.append, which was removed in pandas 2.0)
locations = pd.concat([train_locations, test_locations], ignore_index=True)

In [None]:
# Lists of columns to use in the gradient boosting model
# Columns that are label-encoded and passed to LightGBM as categorical.
# 'vendor_id_x' only exists after the merge with `orders`.
cols_to_encode = ['L_location_type',
                  'V_vendor_category_en', 'V_vendor_category_id',
                  'C_gender_coded', 'C_age_group', 'V_serving_distance',
                  'V_delivery_charge', 'V_rank', 'V_language', 'C_verified', 'V_commission', 'V_status',
                  'V_open_time', 'V_close_time', 'vendor_id_x']

# Numeric features used as they are (the distance features are added later)
cols_float = ['V_prepration_time',
              'V_vendor_rating', 'V_discount_percentage', 'C_account_diff'] + svd_tags_cols

# Raw coordinates: only needed to derive the distance features
cols_tmp = ['L_latitude', 'L_longitude', 'V_latitude', 'V_longitude']

# Keys, target and bookkeeping columns (not model features)
cols_labels = ['CID X LOC_NUM X VENDOR', 'target', 'customer_id', 'vendor_id', 'order_id', 'location_number', 'C_sample']

# Everything outside this list is dropped from customers / orders / vendors
cols_keep = cols_to_encode + cols_float + cols_tmp + cols_labels + q

In [None]:
# Keep only the customer columns listed in `cols_keep`
cols_to_delete = [col for col in customers.columns if col not in cols_keep]
customers.drop(columns=cols_to_delete, inplace=True)

In [None]:
# Keep only the order columns listed in `cols_keep`
cols_to_delete = [col for col in orders.columns if col not in cols_keep]
orders.drop(columns=cols_to_delete, inplace=True)

In [None]:
# Keep only the vendor columns listed in `cols_keep`
cols_to_delete = [col for col in vendors.columns if col not in cols_keep]
vendors.drop(columns=cols_to_delete, inplace=True)

In [None]:
# Merges of the tables start

# One row per customer location, enriched with the customer attributes;
# the inner join drops locations without a matching customer and vice versa
locations_customers = locations.merge(customers, on='customer_id', how='inner')

In [None]:
# Constant helper key: merging on it pairs every row of one table with
# every row of the other (cross join)
locations_customers['key1'] = 0
vendors['key1'] = 0

In [None]:
# Every (customer location, vendor) combination becomes a candidate row
locations_customers_vendors = locations_customers.merge(vendors, how='inner', on='key1')

In [None]:
# Composite key '<customer_id> X <location_number> X <vendor_id>'; the same
# column name is used to join with `orders` and with the submission template
locations_customers_vendors['CID X LOC_NUM X VENDOR'] = locations_customers_vendors['customer_id'] + ' X ' + locations_customers_vendors['location_number'].astype(str) + ' X ' +  locations_customers_vendors['vendor_id'].astype(str)

In [None]:
# Every existing order is a positive example
orders['target'] = 1

In [None]:
# Left join: candidates without an order keep NaN in the order columns.
# NOTE(review): columns present on both sides (e.g. vendor_id) presumably
# receive the default '_x' / '_y' suffixes, which is where 'vendor_id_x'
# comes from - confirm. A key with several orders yields several rows.
locations_customers_vendors_orders = locations_customers_vendors.merge(orders, how='left', on='CID X LOC_NUM X VENDOR')

In [None]:
# Spread the per-vendor order aggregates to all rows of the same vendor:
# the first statement fills rows without an order with the vendor mean,
# the second overwrites every row with the vendor mean, so each vendor
# ends up with one constant value per feature.
for item in q:
  locations_customers_vendors_orders[item] = locations_customers_vendors_orders[item].fillna(locations_customers_vendors_orders.groupby(['vendor_id_x'])[item].transform('mean'))
  locations_customers_vendors_orders[item] = locations_customers_vendors_orders.groupby(['vendor_id_x'])[item].transform('mean')


In [None]:
# Candidates that never matched an order are negatives
locations_customers_vendors_orders['target'] = locations_customers_vendors_orders['target'].fillna(0)

In [None]:
# distance1: Euclidean distance between customer location and vendor.
# get_distance only uses array-friendly arithmetic, so it is called once on
# whole columns instead of through the np.vectorize wrapper, which loops
# over the rows in Python.
locations_customers_vendors_orders['distance1'] = get_distance(
    locations_customers_vendors_orders['L_latitude'].values,
    locations_customers_vendors_orders['L_longitude'].values,
    locations_customers_vendors_orders['V_latitude'].values,
    locations_customers_vendors_orders['V_longitude'].values)

In [None]:


# distance2: sum of the two haversine legs (Manhattan-style distance).
# The helper works on whole arrays, so the row-by-row np.vectorize wrapper
# is not needed.
locations_customers_vendors_orders['distance2'] = dummy_manhattan_distance(
    locations_customers_vendors_orders['L_latitude'].values,
    locations_customers_vendors_orders['L_longitude'].values,
    locations_customers_vendors_orders['V_latitude'].values,
    locations_customers_vendors_orders['V_longitude'].values)




In [None]:
# distance3: haversine distance in km.
# haversine_array is built from numpy ufuncs and accepts whole arrays, so
# it is called directly instead of through the np.vectorize wrapper.
locations_customers_vendors_orders['distance3'] = haversine_array(
    locations_customers_vendors_orders['L_latitude'].values,
    locations_customers_vendors_orders['L_longitude'].values,
    locations_customers_vendors_orders['V_latitude'].values,
    locations_customers_vendors_orders['V_longitude'].values)

In [None]:

# distance4: bearing (degrees) from the customer location to the vendor.
# bearing_array accepts whole arrays, so it is called directly instead of
# through the np.vectorize wrapper.
locations_customers_vendors_orders['distance4'] = bearing_array(
    locations_customers_vendors_orders['L_latitude'].values,
    locations_customers_vendors_orders['L_longitude'].values,
    locations_customers_vendors_orders['V_latitude'].values,
    locations_customers_vendors_orders['V_longitude'].values)

In [None]:
# Midpoint between the customer location and the vendor, per coordinate
for coord in ('latitude', 'longitude'):
  customer_coord = locations_customers_vendors_orders['L_' + coord].values
  vendor_coord = locations_customers_vendors_orders['V_' + coord].values
  locations_customers_vendors_orders['center_' + coord] = (customer_coord + vendor_coord) / 2

In [None]:
# Label-encode the categorical features: values -> integer codes, stored
# with 'category' dtype. Train and test rows are encoded together here.
for item in cols_to_encode:
  locations_customers_vendors_orders[item] = (
      locations_customers_vendors_orders[item]
      .astype('category')
      .cat.codes
      .astype('category'))



In [None]:
# Add the derived distance / midpoint features and the per-vendor order
# aggregates to the numeric feature list.
# NOTE: `cols_float` is extended from its previous value, so running this
# cell twice appends the names twice.
cols_float = cols_float + ['distance1', 'distance2', 'distance3', 'distance4', 'center_latitude', 'center_longitude'] + q

# Final feature set passed to the model
cols_X = cols_to_encode + cols_float

In [None]:
# Split the combined table back by origin: C_sample 1 = train, 2 = test
sample_train = locations_customers_vendors_orders[locations_customers_vendors_orders['C_sample']==1]
sample_test = locations_customers_vendors_orders[locations_customers_vendors_orders['C_sample']==2]


# Release the combined table before training
del locations_customers_vendors_orders
gc.collect()


0

In [None]:
# Creating a delayed sample, deleting it from 'sample_train'
# `w` holds a random `frac_size` share of the training rows; it only exists
# when `use_delayed` is True, and every later use is guarded by that flag.
if use_delayed:
  w = sample_train.sample(frac=frac_size, random_state=rs)
  w_index = w.index
  sample_train = sample_train.drop(w_index, axis=0)

In [None]:
# Feature matrix of the training sample
X = sample_train[cols_X]

In [None]:
# Binary target: 1 where an order exists for the candidate row, 0 otherwise
y = sample_train['target']

In [None]:
# Training the LightGBM classifier

# 80/20 split; the 20% part is the validation set that lgbm_val monitors
# for early stopping
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=rs)

# Timestamps before and after training show how long the fit took
print(datetime.datetime.now())

# a / b: model predictions for the train / validation parts
a, b, lgbm_model = lgbm_val(X_train, X_test, y_train, y_test, cols_to_encode)

print(datetime.datetime.now())


2020-08-03 20:35:12.657569




2020-08-03 21:35:33.152592


In [None]:
# Separate the true labels of the delayed sample from its feature columns
if use_delayed:
  w_true = np.array(w['target'])
  w = w[cols_X]

In [None]:
# Evaluating the F1 score at the delayed sample
if use_delayed:
  # The scores do not depend on the cut-off, so the model is run once
  # instead of once per threshold
  w_scores = lgbm_model.predict(w)

  # Candidate cut-offs 0.84, 0.85, ..., 0.92
  for elem in [item/100 for item in range(84, 93, 1)]:
    w_predictions = np.array([1 if item>=elem else 0 for item in w_scores])
    print(str(elem) + '...' + str(f1_score(w_true, w_predictions)))

In [None]:
# Saving the results
# Scores at or above this cut-off are submitted as 1, everything else as 0
cutoff_true_false = 0.88

submission_key = 'CID X LOC_NUM X VENDOR'

# `sample_test` is left untouched (the previous version overwrote it with
# its feature columns, so a second run of this cell failed on the key
# column); the features are selected on the fly instead.
test_scores = lgbm_model.predict(sample_test[cols_X])

predictions = [1 if item>=cutoff_true_false else 0 for item in test_scores]

predicted = sample_test[[submission_key]].copy()
predicted[rr] = predictions

# Replace the placeholder column of the template with the predictions.
# The column is addressed through `rr` everywhere instead of mixing `rr`
# with a hard-coded 'target'.
SampleSubmission = (SampleSubmission
                    .drop(columns=[rr])
                    .merge(predicted, on=submission_key, how='left'))

# Keys of the template without a prediction are submitted as 0
SampleSubmission[rr] = SampleSubmission[rr].fillna(0).astype('int')

SampleSubmission.to_csv(pth+submission_name, index=False, sep=',')

print('Sum of predictions:')
sum_of_predictions = predicted[rr].sum()
print(str(sum_of_predictions))

Sum of predictions:
37922
