In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
input_paths = (
    os.path.join(folder, file_name)
    for folder, _subdirs, file_names in os.walk('/kaggle/input')
    for file_name in file_names
)
for input_path in input_paths:
    print(input_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/restaurant-recommendation-challenge/train_locations.csv
/kaggle/input/restaurant-recommendation-challenge/test_full.csv
/kaggle/input/restaurant-recommendation-challenge/VariableDefinitions.txt
/kaggle/input/restaurant-recommendation-challenge/vendors.csv
/kaggle/input/restaurant-recommendation-challenge/SampleSubmission (1).csv
/kaggle/input/restaurant-recommendation-challenge/test_locations.csv
/kaggle/input/restaurant-recommendation-challenge/test_customers.csv
/kaggle/input/restaurant-recommendation-challenge/train_customers.csv
/kaggle/input/restaurant-recommendation-challenge/orders.csv
/kaggle/input/restaurant-recommendation-challenge/train_full.csv


### Installing pyspark

In [2]:
!pip3 install pyspark

Collecting pyspark
  Downloading pyspark-3.0.1.tar.gz (204.2 MB)
[K     |████████████████████████████████| 204.2 MB 29 kB/s s eta 0:00:01   |█▉                              | 11.5 MB 7.0 MB/s eta 0:00:28
[?25hCollecting py4j==0.10.9
  Downloading py4j-0.10.9-py2.py3-none-any.whl (198 kB)
[K     |████████████████████████████████| 198 kB 57.8 MB/s eta 0:00:01
[?25hBuilding wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25ldone
[?25h  Created wheel for pyspark: filename=pyspark-3.0.1-py2.py3-none-any.whl size=204612244 sha256=b5ef3724334bce94b540c52036a71e339570f246e1af91a6b8ff9c6035527b99
  Stored in directory: /root/.cache/pip/wheels/5e/34/fa/b37b5cef503fc5148b478b2495043ba61b079120b7ff379f9b
Successfully built pyspark
Installing collected packages: py4j, pyspark
Successfully installed py4j-0.10.9 pyspark-3.0.1
You should consider upgrading via the '/opt/conda/bin/python3.7 -m pip install --upgrade pip' command.[0m


# Import required libraries

In [3]:
# Start (or reuse) the Spark context and build the SQL entry point on top of it.
from pyspark import SparkContext
from pyspark import SQLContext

# getOrCreate() hands back the already-running context when this cell is
# executed again; a bare SparkContext() raises
# "Cannot run multiple SparkContexts at once" on a re-run.
sc = SparkContext.getOrCreate()
sqlContext = SQLContext(sc)

In [4]:
from tqdm.notebook import tqdm
from pyspark.sql.types import *
import pyspark.sql.functions as F
import pyspark.ml as ml
import pandas as pd

from pyspark.mllib.evaluation import MulticlassMetrics
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

import matplotlib.pyplot as plt

# Read data

In [5]:
# Load the three CSV inputs with a header row; no schema inference is requested,
# so later cells cast the columns they need.
data = sqlContext.read.csv(
    "../input/restaurant-recommendation-challenge/train_full.csv", header=True)
orders = sqlContext.read.csv(
    "../input/restaurant-recommendation-challenge/orders.csv", header=True)
train_cus = sqlContext.read.csv(
    "../input/restaurant-recommendation-challenge/train_customers.csv", header=True)

In [6]:
# Preview the first five rows of the training table.
data.show(n=5)

+-----------+------+--------+----------+-------------------+-------------------+---------------+-------------+----------+-----------+---+-----------------+----------+-----------+------------------+------------------+---------------+----------------+-------+---------------+------------+---------------+----------+-------------------+-------------------+--------+----------+----+--------+-------------+-----------------+---------------+-----------------+---------------+-----------------+---------------+-----------------+---------------+------------------+----------------+------------------+----------------+--------------------+------------------+--------------------+------------------+-------------------+-----------------+-------------------+-----------------+-----------------+---------------+-----------------+---------------+-------------------+-----------------+-------------------+-----------------+--------------------+----------------+--------------------+--------------------+-----------

# Feature engineering 

#### Dropping unnecessary features

In [7]:
# Day names used to spot the per-weekday columns by substring match.
weekdays = [
    'monday',
    'tuesday',
    'wednesday',
    'thursday',
    'friday',
    'saturday',
    'sunday',
]

In [8]:

# Drop every column whose name contains one of the weekday names.
weekday_columns = [
    column_name
    for column_name in data.columns
    if any(day in column_name for day in weekdays)
]
data = data.drop(*weekday_columns)

In [9]:
# Columns removed from the training table before modelling.
# ('CID X LOC_NUM X VENDOR' used to be listed twice; once is enough.)
# The list is no longer called `col`, a name other cells use for a single
# column-name string inside their loops.
unused_cols = [
    'commission', 'display_orders', 'country_id', 'CID X LOC_NUM X VENDOR',
    'city_id', 'vendor_category_en',
    'latitude_x', 'latitude_y', 'longitude_x', 'longitude_y',
    'akeed_order_id',
]
# drop() accepts several names at once and ignores names missing from the schema.
data = data.drop(*unused_cols)

In [10]:
# Remove the "language" column from the customer table before it is joined to the training data.
train_cus = train_cus.drop("language")

#### Converting datatype of numerical Features

In [11]:
# Columns treated as numeric model inputs (cast to double in the next cell).
# The spelling 'prepration_time' is kept as-is: it is used as a column name.
num_cols = [
    'delivery_charge',
    'serving_distance',
    'vendor_rating',
    'prepration_time',
    'discount_percentage',
    'verified_x',
    'is_open',
    'status_y',
    'verified_y',
    'rank',
    'open_close_flags',
    'location_number_obj',
]

In [12]:
# Cast every numeric input column to double, one column at a time.
for col in num_cols:
    data = data.withColumn(col, F.col(col).cast("double"))

# The label column is cast to double as well.
data = data.withColumn("target", F.col("target").cast("double"))

In [13]:
# Keep the first run of digits found in `primary_tags` and store it as a double
# (rows with no digits end up null after the cast).
# The cast has to read `primary_tags` itself: it previously read `data[col]`,
# where `col` was the leftover loop variable of the numeric-cast cell, so the
# extracted tag was silently replaced by a different column's values.
data = data.withColumn(
    'primary_tags',
    F.regexp_extract(data['primary_tags'], r"[0-9]+", 0).cast(DoubleType()))


#### Merging train_full.csv with train_customers.csv on customer_id

In [14]:
# Normalise the customer key to `customer_id` on both tables (the rename does
# nothing where `akeed_customer_id` is absent), then inner-join on that key.
data, train_cus = [
    frame.withColumnRenamed('akeed_customer_id', 'customer_id')
    for frame in (data, train_cus)
]
complete_data = data.join(train_cus, on='customer_id', how='inner')

In [15]:
# Remove the customer demographic columns in a single call.
complete_data = complete_data.drop('gender', 'language', 'dob')

In [16]:
# Replace missing `location_type` values with the placeholder category 'unknown'.
complete_data = complete_data.na.fill({'location_type': 'unknown'})

In [17]:
# Columns to discard; per the author's note, most of them contain null values
# on a large scale.
drop_cols = [
    'OpeningTime',
    'OpeningTime2',
    'language',
    'customer_id',
    'vendor_tag',
    'vendor_tag_name',
    'created_at',
    'updated_at',
    'id',
    'authentication_id',
    'id_obj',
    'is_akeed_delivering',
    'one_click_vendor',
    'created_at_x',
    'created_at_y',
    'updated_at_x',
    'updated_at_y',
]

In [18]:
# DataFrame.drop returns a new DataFrame instead of modifying the receiver, so
# the result must be rebound; the previous loop discarded it and dropped nothing.
# Names in `drop_cols` that are not in the schema are ignored by Spark.
complete_data = complete_data.drop(*drop_cols)

In [19]:
# Columns that are string-indexed and one-hot encoded before modelling.
categorical_features = [
    'location_number',
    'location_type',
    'status_x',
    'vendor_category_id',
    'device_type',
    'status',
    'verified',
]

In [20]:
# One-hot encode each categorical column in two steps: map the string values
# to indices (`<name>_ind`), then expand the index into a vector (`<name>_ohe`).
for cat_col in categorical_features:
    index_col = cat_col + "_ind"
    onehot_col = cat_col + "_ohe"

    index_model = ml.feature.StringIndexer(
        inputCol=cat_col, outputCol=index_col).fit(complete_data)
    complete_data = index_model.transform(complete_data)

    onehot_model = ml.feature.OneHotEncoder(
        inputCols=[index_col], outputCols=[onehot_col]).fit(complete_data)
    complete_data = onehot_model.transform(complete_data)
    

In [21]:
# Model inputs: the numeric columns followed by the one-hot vectors, assembled
# into a single vector column named "features".
onehot_columns = [f"{name}_ohe" for name in categorical_features]
columns_extracted = num_cols + onehot_columns

assembler = ml.feature.VectorAssembler(
    inputCols=columns_extracted,
    outputCol="features",
)
train_data = assembler.transform(complete_data)

#### Balancing data based on target feature

In [22]:
# Keep every positive row; de-duplicate the negative rows and down-sample them
# to roughly the number of positives.
balanced_data = train_data.filter(train_data.target == 1.0)
sampled_0 = train_data.filter(train_data.target == 0.0).distinct()

n_positive = balanced_data.count()
n_negative = sampled_0.count()

# Sampling without replacement needs a fraction in [0, 1]: clamp it in case the
# positives outnumber the negatives, and avoid dividing by zero when there are
# no negatives at all.
negative_fraction = min(1.0, n_positive / n_negative) if n_negative else 0.0

# A fixed seed (the same value the train/test split uses) makes the sample
# identical from run to run.
sampled_0 = sampled_0.sample(False, fraction=negative_fraction, seed=52)

In [23]:
# Append the down-sampled target==0 rows to the target==1 rows, matching columns by name.
# NOTE(review): `balanced_data` is rebound to a union of itself, so running this
# cell a second time without re-running the filtering cell appends `sampled_0` again.
balanced_data = balanced_data.unionByName(sampled_0)

In [24]:
# 80/20 random split of the balanced data, seeded for repeatability.
train, test = balanced_data.randomSplit(weights=[0.80, 0.20], seed=52)

# Models

## Logistic regression

In [25]:
# Fit logistic regression on the training split, then score the test split.
# `log_reg` ends up holding the fitted model.
log_reg = ml.classification.LogisticRegression(
    featuresCol='features',
    labelCol='target',
).fit(train)

predicted_log = log_reg.transform(test)

## Decision Tree Classifier

In [26]:
# Fit a decision-tree classifier on the training split, then score the test split.
# `tree` ends up holding the fitted model.
tree = ml.classification.DecisionTreeClassifier(
    featuresCol='features',
    labelCol='target',
).fit(train)

predict_tree = tree.transform(test)

#### Metrics

In [None]:
# Shared evaluator used for both models, with "target" as the label column.
# NOTE: the metric cells below reference `evaluator`, so this cell has to be
# executed before them; skipping it produces the NameError recorded below.
evaluator = MulticlassClassificationEvaluator(labelCol = "target")

In [27]:
# Score the logistic-regression predictions.
predictionAndLabels = predicted_log.select("prediction", "target")

# F1 score: evaluated with the evaluator's own metric setting (no override).
fscore_log = evaluator.evaluate(predictionAndLabels)

# Accuracy, weighted precision and weighted recall: same evaluator, with the
# metric name overridden per call.
accuracy_log, precision_log, recall_log = [
    evaluator.evaluate(predictionAndLabels, {evaluator.metricName: metric_name})
    for metric_name in ("accuracy", "weightedPrecision", "weightedRecall")
]

NameError: name 'evaluator' is not defined

In [None]:
# Score the decision-tree predictions.
predictionAndLabels = predict_tree.select("prediction", "target")

# F1 score: evaluated with the evaluator's own metric setting (no override).
fscore_tree = evaluator.evaluate(predictionAndLabels)

# Accuracy, weighted precision and weighted recall: same evaluator, with the
# metric name overridden per call.
accuracy_tree, precision_tree, recall_tree = [
    evaluator.evaluate(predictionAndLabels, {evaluator.metricName: metric_name})
    for metric_name in ("accuracy", "weightedPrecision", "weightedRecall")
]

## Plotting values

In [None]:
# Bar chart comparing the accuracy of the two classifiers.
plt.title('Accuracy Comparison')
plt.bar(['Logistic Regression','Decision Tree'],[accuracy_log,accuracy_tree],width=0.8,color='g')
plt.ylabel('Accuracy')
plt.xticks(rotation=90)
plt.show()
# Print the raw values too, in the same format the F1 / precision / recall cells use.
print("Logistic Regression : {}\nDecision Tree : {}".format(accuracy_log,accuracy_tree))


In [None]:
# F1 score of each classifier: bar chart, then the raw numbers.
model_names = ['Logistic Regression', 'Decision Tree']
f1_values = [fscore_log, fscore_tree]

fig, ax = plt.subplots()
ax.set_title('F1Score Comparison')
ax.bar(model_names, f1_values, width=0.8, color='g')
plt.xticks(rotation=90)
plt.show()
print("Logistic Regression : {}\nDecision Tree : {}".format(*f1_values))

In [None]:
# Weighted precision of each classifier: bar chart, then the raw numbers.
model_names = ['Logistic Regression', 'Decision Tree']
precision_values = [precision_log, precision_tree]

fig, ax = plt.subplots()
ax.set_title('Precision Comparison')
ax.bar(model_names, precision_values, width=0.8, color='g')
plt.xticks(rotation=90)
plt.show()
print("Logistic Regression : {}\nDecision Tree : {}".format(*precision_values))

In [None]:
# Weighted recall of each classifier: bar chart, then the raw numbers.
model_names = ['Logistic Regression', 'Decision Tree']
recall_values = [recall_log, recall_tree]

fig, ax = plt.subplots()
ax.set_title('Recall Comparison')
ax.bar(model_names, recall_values, width=0.8, color='g')
plt.xticks(rotation=90)
plt.show()
print("Logistic Regression : {}\nDecision Tree : {}".format(*recall_values))