In [None]:
import numpy as np
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
from datetime import datetime

In [None]:
# Read the interactions CSV from the Kaggle input directory into a DataFrame.
interactions = pd.read_csv("/kaggle/input/olx-jobs-interactions/interactions.csv")

In [None]:
# Preview the first rows to check the columns and value formats.
interactions.head()

In [None]:
# Print the DataFrame summary produced by pandas for a quick schema check.
interactions.info()

# Basic statistics of the dataset

In [None]:
# Number of rows recorded for each distinct user and for each distinct item.
interactions_per_user = interactions.groupby("user").size()
interactions_per_item = interactions.groupby("item").size()

# Headline sizes: total rows and the number of distinct users / items.
n_interactions = len(interactions)
n_users = interactions["user"].nunique()
n_items = interactions["item"].nunique()

print(f"We have {n_users} users, {n_items} items and {n_interactions} interactions.")
# Sparsity = share of the (users x items) grid that has no interaction.
# NOTE(review): n_interactions counts every row, so if the same (user, item)
# pair can appear in several rows the printed sparsity is understated — confirm.
print(f"Data sparsity (% of missing entries) is {100 * (1- n_interactions / (n_users * n_items)):.2f}%.")
# ddof=0 -> population standard deviation (divide by N, not N - 1).
print(f"Average number of interactions per user is {interactions_per_user.mean():.2f} (standard deviation {interactions_per_user.std(ddof=0):.2f}).")
print(f"Average number of interactions per item is {interactions_per_item.mean():.2f} (standard deviation {interactions_per_item.std(ddof=0):.2f}).")

# Interactions distribution per user

In [None]:
def compute_quantiles(series, quantiles=[0.01, 0.1, 0.25, 0.5, 0.75, 0.9, 0.99]):
    """Tabulate selected quantiles of ``series``.

    Args:
        series: Object exposing ``.quantile(q)``, e.g. a pandas Series.
        quantiles: Iterable of quantile levels to evaluate; rows are emitted
            in this order. The default list is only read, never modified.

    Returns:
        A DataFrame with one row per requested level and two columns:
        ``"quantile"`` (the level) and ``"value"`` (``series.quantile(level)``).
    """
    rows = []
    for level in quantiles:
        rows.append([level, series.quantile(level)])
    return pd.DataFrame(rows, columns=["quantile", "value"])


def plot_interactions_distribution(series, aggregation="user", ylabel="Users", bins=30):
    """Draw a histogram of interaction counts with a logarithmic y-axis.

    Args:
        series: pandas Series of interaction counts (one value per entity).
        aggregation: Entity name interpolated into the plot title.
        ylabel: Label for the y-axis.
        bins: Number of histogram bins.

    Returns:
        None. The figure is created as a side effect.
    """
    # This changes the process-wide matplotlib font size, so figures drawn
    # after this call are affected as well.
    matplotlib.rcParams["font.size"] = 22

    # pandas creates a new 16x9 figure and hands back its Axes.
    ax = series.plot.hist(bins=bins, rwidth=0.9, logy=True, figsize=(16, 9))
    ax.set_title(f"Number of interactions per {aggregation}")
    ax.set_xlabel("Interactions")
    ax.set_ylabel(ylabel)
    ax.grid(axis="y", alpha=0.5)

In [None]:
# Quantile table followed by the histogram of interactions per user.
print("Interactions distribution per user:")
display(compute_quantiles(series=interactions_per_user))
plot_interactions_distribution(
    series=interactions_per_user,
    aggregation="user",
    ylabel="Users",
)

# Interactions distribution per item

In [None]:
# Quantile table followed by the histogram of interactions per item.
print("Interactions distribution per item:")
display(compute_quantiles(series=interactions_per_item))
plot_interactions_distribution(
    series=interactions_per_item,
    aggregation="item",
    ylabel="Items",
)

# Event distribution

In [None]:
# Share of all interaction rows that belongs to each event type.
# The Series is named explicitly before it becomes a DataFrame: pandas >= 2.0
# names the value_counts() result "count" (older versions used the source
# column name, "event"), so renaming an "event" column afterwards silently did
# nothing on newer pandas and the "frequency" lookup below raised KeyError.
event_frequency = (
    (interactions["event"].value_counts() / len(interactions))
    .rename("frequency")
    .to_frame()
)

# Render each share as a percentage string with two decimals, e.g. "12.34%".
event_frequency["frequency"] = event_frequency["frequency"].map(
    lambda share: f"{100 * share:.2f}%"
)
event_frequency

# Interactions over time

In [None]:
def unix_to_day(timestamps):
    """Map timestamps to 1-based day numbers relative to the earliest one.

    The earliest timestamp starts day 1; every further 86,400 units (the day
    length is expressed in seconds, so timestamps are assumed to be in
    seconds) advances the day by one. Day boundaries are therefore anchored
    to the first timestamp, not to calendar midnight.

    Args:
        timestamps: Array-like supporting ``.min()`` and element-wise
            arithmetic (e.g. a pandas Series or NumPy array).

    Returns:
        An object of the same kind as ``timestamps`` holding day numbers >= 1.
    """
    SECONDS_PER_DAY = 24 * 60 * 60
    elapsed = timestamps - timestamps.min()
    return elapsed // SECONDS_PER_DAY + 1

def plot_interactions_over_time(series):
    """Draw a bar chart of how many interactions fall on each day.

    Args:
        series: pandas Series with one day label per interaction; the
            occurrences of each distinct label are counted and plotted.

    Returns:
        None. The figure is created as a side effect.
    """
    per_day = series.value_counts()
    days = per_day.index
    # Bar heights are expressed in millions to match the y-axis label.
    millions = per_day.values / 10**6

    # This changes the process-wide matplotlib font size, so figures drawn
    # after this call are affected as well.
    matplotlib.rcParams["font.size"] = 22

    _, ax = plt.subplots(figsize=(16, 5))
    ax.bar(days, millions, align="center")
    # One tick per distinct day so every bar is labelled.
    ax.set_xticks(days)
    ax.set_title("Interactions by days")
    ax.set_xlabel("Day")
    ax.set_ylabel("Interactions [mln]")
    ax.grid(axis="y")

# Convert each interaction's timestamp to a 1-based day number, then plot the per-day counts.
plot_interactions_over_time(unix_to_day(interactions["timestamp"]))