# Interview challenge for PayPay.

Data: markplace web session log on data folder

Machine Learning Processing & Analytical goal:

    - Predict the session length for a given IP

In [1]:
# Libraries and setup
# Auto reload changes
%load_ext autoreload
%autoreload 2

import sys
sys.path.append("../") # go to parent dir

# source dependencies, log_file_schema & data handler
from src.dependencies import *
from src.log_file_schema import schema
from src.data_handler import DataHandler

# plot lib
import matplotlib.pyplot as plt
plt.style.use('ggplot')

# forcasting lib and matrics
# https://xgboost.readthedocs.io/en/latest/
import xgboost
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score

In [3]:
#create or get spark session
spark = SparkSession.builder \
    .master("local[*]")\
    .appName("PayPayChallenge")\
    .getOrCreate()

In [4]:
def duration(start, end):
    """
    Retunr time duration in seconds

    @param start: timestamp
    @param end: timestamnp
    """
    try:
        num_of_seconds = (end - start).total_seconds()
    except:
        num_of_seconds = 0
    return num_of_seconds

get_duration = udf(duration, FloatType())

def preprocess_data(spark):
    """
    Processing the data 
    :param spark: spark session
    :return processed data 
    """
    df = spark.read.csv(log_file, schema=schema, sep=" ").repartition(num_partitions).cache()
    split_client = split(df["client:port"], ":")
    split_backend = split(df["backend:port"], ":")
    split_request = split(df["request"], " ")

    df=df.withColumn("client_ip", split_client.getItem(0)) \
                .withColumn("client_port", split_client.getItem(1)) \
                .withColumn("backend_ip", split_backend.getItem(0)) \
                .withColumn("backend_port", split_backend.getItem(1)) \
                .withColumn("request_action", split_request.getItem(0)) \
                .withColumn("request_url", split_request.getItem(1)) \
                .withColumn("request_protocol", split_request.getItem(2)) \
                .withColumn("current_timestamp", col("timestamp").cast("timestamp")) \
                .drop("client:port","backend:port","request").cache()
    
    window_func_ip = Window.partitionBy("client_ip").orderBy("current_timestamp")
    df = df.withColumn("previous_timestamp",
                            lag(col("current_timestamp")).over(window_func_ip)) \
                .withColumn("session_duration",
                            get_duration(col("previous_timestamp"), col("current_timestamp"))) \
                .withColumn("is_new_session",
                            when((col("session_duration") > session_time), 1).otherwise(0)) \
                .withColumn("count_session",
                            sum(col("is_new_session")).over(window_func_ip)) \
                .withColumn("ip_session_count",
                            concat_ws("_", col("client_ip"), col("count_session")))

    df=df.select(["ip_session_count", "client_ip", "request_url", "previous_timestamp", 
                  "current_timestamp", "session_duration", "is_new_session", "count_session"]);
    
    window_func_session = Window.partitionBy("ip_session_count").orderBy("current_timestamp")
    df = df.withColumn("previous_timestamp_session",
                              lag(df["current_timestamp"]).over(window_func_session)) \
                  .withColumn("current_session_duration",
                              get_duration(col("previous_timestamp_session"), col("current_timestamp")))
    df = df.groupby("ip_session_count").agg(
            sum("current_session_duration").alias("session_length")).cache()
    
    splitt= split(df["ip_session_count"], "_")
    df=df.withColumn("ip", splitt.getItem(0))
    df=df.select(["ip", "session_length"])
    df=df.na.drop(subset=["session_length"])
    df=df.na.drop(subset=["ip"])
    
    splitt2=split(df["ip"], "\\.")
    df=df.withColumn("octet0", splitt2.getItem(0))
    df=df.withColumn("octet1", splitt2.getItem(1))
    df=df.withColumn("octet2", splitt2.getItem(2))
    df=df.withColumn("octet3", splitt2.getItem(3))
    
    df=df.drop("ip")
    df=df.na.drop(subset=["octet0"])
    df=df.na.drop(subset=["octet1"])
    df=df.na.drop(subset=["octet2"])
    df=df.na.drop(subset=["octet3"])
    
    return df


def data_loader(spark):
    """
    Data loader
    
    @param spark: spark session
    """
    dataset2 = preprocess_data(spark).cache()
    dataset2.show()
    return dataset2.select("*").toPandas()

df=data_loader(spark)

+------------------+------+------+------+------+
|    session_length|octet0|octet1|octet2|octet3|
+------------------+------+------+------+------+
| 69.81707191467285|     1|   186|    41|     1|
| 231.7906957184896|     1|   186|    76|    11|
| 33.04862296581268|     1|   187|   228|   210|
| 33.92300724051893|     1|   187|   228|    88|
| 59.14387809485197|     1|    23|   101|   102|
| 9.247098922729492|     1|    23|   226|    88|
| 210.9620418548584|     1|    38|    21|    65|
|48.736650466918945|     1|    38|    22|   121|
|23.680201530456543|     1|    38|    23|    24|
| 7.674998104572296|     1|    39|    14|   229|
| 67.07872497009816|     1|    39|    15|   172|
|3.8426599502563477|     1|    39|    32|    15|
|22.044874668121338|     1|    39|    35|   191|
|10.105690002441406|     1|    39|    35|   217|
|1.5804920196533203|     1|    39|    46|   200|
| 177.6636962890625|     1|    39|    60|   199|
|168.64113640785217|     1|    39|    60|    35|
| 9.330794131412404|

In [5]:
df = df[['octet0', 'octet1', 'octet2', 'octet3', 'session_length']]
print(df.head())
print(df.shape)

  octet0 octet1 octet2 octet3  session_length
0      1    186     41      1       69.817072
1      1    186     76     11      231.790696
2      1    187    228    210       33.048623
3      1    187    228     88       33.923007
4      1     23    101    102       59.143878
(88867, 5)


In [7]:
#  lib and matrics
# https://xgboost.readthedocs.io/en/latest/
import numpy as np

df = df.apply(pd.to_numeric)
X = df[['octet0', 'octet1', 'octet2', 'octet3']]
Y = df[['session_length']]

model = xgboost.XGBRegressor(objective='reg:squarederror')
kfold = KFold(n_splits=10, random_state=7)
results = cross_val_score(model, X, Y, cv=kfold, scoring='neg_mean_squared_error')
print("RMSE xgboost:", np.mean(np.sqrt(np.abs(results))))

RMSE xgboost: 252.1921026861234


In [8]:
#  lib and matrics
# https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestRegressor.html
from sklearn.ensemble import RandomForestRegressor

model_rf = RandomForestRegressor()
model_rf.fit(X, Y)
kfold = KFold(n_splits=10, random_state=7)
results = cross_val_score(model_rf, X, Y, cv=kfold, scoring='neg_mean_squared_error')
print("RMSE Randomforest:", np.mean(np.sqrt(np.abs(results))))


RMSE Randomforest: 278.25198155790395
