# Interview challenge for PayPay.

Data: markplace web session log on data folder

Machine Learning Processing & Analytical goal:

    - Predict the number of unique URL visits by a given IP

In [1]:
# Libraries and setup
# Auto reload changes
%load_ext autoreload
%autoreload 2

import sys
sys.path.append("../") # go to parent dir

import numpy as np

# source dependencies, log_file_schema & data handler
from src.dependencies import *
from src.log_file_schema import schema
from src.data_handler import DataHandler

# plot lib
import matplotlib.pyplot as plt
plt.style.use('ggplot')

# forcasting lib and matrics
# https://xgboost.readthedocs.io/en/latest/
import xgboost
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score

In [2]:
#create or get spark session
spark = SparkSession.builder \
    .master("local[*]")\
    .appName("PayPayChallenge")\
    .getOrCreate()

In [3]:
def duration(start, end):
    """
    Retunr time duration in seconds

    @param start: timestamp
    @param end: timestamnp
    """
    try:
        num_of_seconds = (end - start).total_seconds()
    except:
        num_of_seconds = 0
    return num_of_seconds

get_duration = udf(duration, FloatType())

def preprocess_data(spark):
    """
    Processing the data 
    :param spark: spark session
    :return processed data 
    """
    df = spark.read.csv(log_file, schema=schema, sep=" ").repartition(num_partitions).cache()
    split_client = split(df["client:port"], ":")
    split_backend = split(df["backend:port"], ":")
    split_request = split(df["request"], " ")

    df=df.withColumn("ip", split_client.getItem(0)) \
                .withColumn("client_port", split_client.getItem(1)) \
                .withColumn("backend_ip", split_backend.getItem(0)) \
                .withColumn("backend_port", split_backend.getItem(1)) \
                .withColumn("request_action", split_request.getItem(0)) \
                .withColumn("request_url", split_request.getItem(1)) \
                .withColumn("request_protocol", split_request.getItem(2)) \
                .withColumn("current_timestamp", col("timestamp").cast("timestamp")) \
                .drop("client:port","backend:port","request").cache()

    df=df.select(["ip", "request_url"])
    df=df.na.drop(subset=["request_url"])
    df=df.na.drop(subset=["ip"])
    
    df = df.groupby("ip").agg(countDistinct("request_url").alias("count_unique_URLs"))
    df=df.na.drop(subset=["count_unique_URLs"])
    
    splitt2=split(df["ip"], "\\.")
    df=df.withColumn("octet0", splitt2.getItem(0))
    df=df.withColumn("octet1", splitt2.getItem(1))
    df=df.withColumn("octet2", splitt2.getItem(2))
    df=df.withColumn("octet3", splitt2.getItem(3))
    
    df=df.drop("ip")
    df=df.na.drop(subset=["octet0"])
    df=df.na.drop(subset=["octet1"])
    df=df.na.drop(subset=["octet2"])
    df=df.na.drop(subset=["octet3"])
    #print(df.dtypes);
    
    return df


def data_loader(spark):
    """
    Data loader
    
    @param spark: spark session
    """
    dataset3 = preprocess_data(spark).cache()
    dataset3.show()
    return dataset3.select("*").toPandas()

df=data_loader(spark)

+-----------------+------+------+------+------+
|count_unique_URLs|octet0|octet1|octet2|octet3|
+-----------------+------+------+------+------+
|               14|    59|   160|   110|   163|
|                6|    27|    63|   186|    72|
|                7|   120|    61|    47|    36|
|               85|   115|   112|   250|   108|
|               16|    61|    16|   142|   162|
|               10|   123|   136|   182|   137|
|               16|   117|   205|    39|   248|
|                9|   117|   247|   188|    13|
|               84|   113|   193|   114|    25|
|              108|    14|   139|    82|   134|
|              110|   202|    53|    89|   132|
|               34|   117|   241|   152|    20|
|                3|   117|   207|    97|   173|
|              112|    27|    34|   244|   251|
|                2|   117|   203|   181|   144|
|               88|   124|   125|    22|   218|
|               16|   202|   174|    92|    10|
|               37|   103|    42|    88|

In [4]:
df=df[['octet0', 'octet1', 'octet2', 'octet3', 'count_unique_URLs']]

print(df.head())
print(df.shape)

  octet0 octet1 octet2 octet3  count_unique_URLs
0     59    160    110    163                 14
1     27     63    186     72                  6
2    120     61     47     36                  7
3    115    112    250    108                 85
4     61     16    142    162                 16
(90544, 5)


In [5]:
#  lib and matrics
# https://xgboost.readthedocs.io/en/latest/

df = df.apply(pd.to_numeric)
X = df[['octet0', 'octet1', 'octet2', 'octet3']]
Y = df[['count_unique_URLs']]

model = xgboost.XGBRegressor(objective='reg:squarederror')
kfold = KFold(n_splits=10, random_state=7)
results = cross_val_score(model, X, Y, cv=kfold, scoring='neg_mean_squared_error')

print("RMSE:", np.mean(np.sqrt(np.abs(results))))

RMSE: 98.38995270596601


In [6]:
#  lib and matrics
# https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestRegressor.html
from sklearn.ensemble import RandomForestRegressor

model_rf = RandomForestRegressor()
model_rf.fit(X, Y)
kfold = KFold(n_splits=10, random_state=7)
results = cross_val_score(model_rf, X, Y, cv=kfold, scoring='neg_mean_squared_error')

print("RMSE Randomforest:", np.mean(np.sqrt(np.abs(results))))

RMSE Randomforest: 81.50367587573092
