# Probability of Item Loss

In [None]:
%load_ext sql
%config SqlMagic.autopandas=True
import pandas as pd
import matplotlib.pyplot as plt
from scipy import stats
import numpy as np
import datetime
import random

In [None]:
%sql mysql://root:password@localhost/nftdata

A statistical distribution will be fitted to the time the token remains in a wallet before it is stolen.

In [None]:
# Read the date where it was purchased and stolen
df_purchase = %sql SELECT purchase_date FROM nftdata.exp_var where stolen=1 and purchase_date is not null
df_theft = %sql SELECT theft_date FROM nftdata.exp_var where stolen=1 and purchase_date is not null
#df_purchase = %sql SELECT purchase_date FROM nftdata.exp_var where stolen=1 and (attack like '%ice phishing' or attack in ('Swap scam','API exploit')) and purchase_date is not null
#df_theft = %sql SELECT theft_date FROM nftdata.exp_var where stolen=1 and (attack like '%ice phishing' or attack in ('Swap scam','API exploit')) and purchase_date is not null

In [None]:
# Time each stolen token stayed in the wallet: theft date minus purchase date, row by row.
holding_time = df_theft.theft_date - df_purchase.purchase_date

# Express every difference in fractional days. Entries that are not
# datetime.timedelta instances (presumably missing values) are skipped.
claim_freq = []
for delta in holding_time:
    if not isinstance(delta, datetime.timedelta):
        continue
    claim_freq.append(delta.total_seconds()/3600/24)

In [None]:
# Histogram of the days elapsed between purchase and theft.
# Many observations are concentrated in two big spikes.
fig, ax = plt.subplots(figsize=(12, 6))
ax.hist(claim_freq, bins=120)

# Label the figure so it can be read on its own.
ax.set_xlabel('Days between purchase and theft')
ax.set_ylabel('Number of observations')
ax.set_title('Time in wallet before theft')

# Keep only horizontal grid lines and the bottom axis line.
ax.grid(False)
ax.grid(True, axis='y')
for side in ('top', 'left', 'right'):
    ax.spines[side].set_visible(False)
plt.show()

In [None]:
import warnings

# Spread out over-populated histogram bins by jittering their observations.
# While the fullest bin holds at least PEAK_COUNT observations, every value in
# that bin is shifted by a random amount drawn uniformly from
# [-JITTER_DAYS, JITTER_DAYS) and the histogram is recomputed.
# Note: the jitter can push values below zero.
N_BINS = 120
PEAK_COUNT = 10
JITTER_DAYS = 10
# Safety cap: with more than (PEAK_COUNT - 1) * N_BINS observations some bin
# always holds at least PEAK_COUNT values, so the loop could never finish.
MAX_JITTER_ROUNDS = 1000
rng = np.random.default_rng(42)  # fixed seed so that re-runs give the same jitter

values = np.array(claim_freq, dtype=float)  # work on a copy
hist, bins = np.histogram(values, bins=N_BINS)
peak = np.argmax(hist)
rounds = 0

while hist[peak] >= PEAK_COUNT and rounds < MAX_JITTER_ROUNDS:
    bin_start = bins[peak]
    bin_end = bins[peak + 1]

    # Select the data points within the bin with the highest count.
    # np.histogram bins are half-open [start, end) except the last one, which
    # also contains its right edge; without this a spike sitting exactly on
    # the maximum value would never be jittered.
    if peak == len(hist) - 1:
        in_peak = (values >= bin_start) & (values <= bin_end)
    else:
        in_peak = (values >= bin_start) & (values < bin_end)
    values[in_peak] += rng.uniform(-JITTER_DAYS, JITTER_DAYS, size=int(in_peak.sum()))  # Jittering is applied

    hist, bins = np.histogram(values, bins=N_BINS)
    peak = np.argmax(hist)
    rounds += 1

if hist[peak] >= PEAK_COUNT:
    warnings.warn(
        "Jittering stopped after %d rounds; the fullest bin still holds %d observations"
        % (rounds, hist[peak])
    )

claim_freq = values.tolist()

In [None]:
import seaborn as sns
from fitter import Fitter, get_common_distributions, get_distributions # Fitter will be used to find the best fit for the data
# Convert the holding times (in days) to a NumPy array; a no-op if they already are one.
claim_freq = np.asarray(claim_freq)

In [None]:
from scipy import stats

# Names of all attributes of scipy.stats that are distribution instances,
# whether continuous or discrete.
distribution_types = (stats.rv_continuous, stats.rv_discrete)
all_distributions = [
    name
    for name in dir(stats)
    if isinstance(getattr(stats, name), distribution_types)
]

In [None]:
# Distributions tried by Fitter. Use all_distributions instead of this list to
# fit every distribution supported by scipy (long processing time).
candidate_distributions = ['alpha','nct','invweibull','norminvgauss','t','genextreme']
f = Fitter(claim_freq, distributions=candidate_distributions)
f.fit()

In [None]:
import warnings

# Silence all warnings from here on (this affects later cells as well).
warnings.simplefilter("ignore")

# Plot the fitted distributions along with the criteria used to rank them.
fig, ax = plt.subplots(figsize=(12, 6))
f.summary()

In [None]:
# Parameters that Fitter found for the alpha distribution.
params=f.fitted_param["alpha"] # Shape, location and scale parameters
# Recorded parameter values, presumably from fits on hot-wallet and cold-wallet data respectively (TODO confirm how each subset was selected):
# alpha_hot(0.00027134742268094484, -27.218316239004885, 84.65056159497132)
# alpha_cold(0.0023841802141549885, -21.945890970672338, 82.51257794351142)

In [None]:
# Comparison of the shape of both distributions
from scipy.stats import alpha
from scipy.stats import nct
import numpy as np
import matplotlib.pyplot as plt

# Alpha distribution parameters (shape, loc, scale) for each wallet type
hot_params = (0.00027134742268094484, -27.218316239004885, 84.65056159497132)
cold_params = (0.0023841802141549885, -21.945890970672338, 82.51257794351142)

# Densities over the first 120 days
x = np.linspace(0, 120, 1000)
pdf1 = alpha.pdf(x, *hot_params)
pdf2 = alpha.pdf(x, *cold_params)

plt.figure(figsize=(12, 6))

# Outline of each density
for density, wallet, shade in ((pdf1, 'Hot', 'orange'), (pdf2, 'Cold', 'blue')):
    plt.plot(x, density, label=wallet, color=shade, alpha=0.5)

# Shaded area under each density, evaluated on a coarser grid
x_fill = np.linspace(0, 120, 100)
for wallet_params, shade in ((hot_params, 'orange'), (cold_params, 'blue')):
    plt.fill_between(x_fill, alpha.pdf(x_fill, *wallet_params), color=shade, alpha=0.3)

plt.xlabel('Days')
plt.ylabel('PDF')

plt.legend()
# Vertical marker at day 30
plt.axvline(x=30, linestyle='dotted', color='red')

# Horizontal grid lines only; hide every spine except the bottom one,
# which is anchored at y = 0.
ax = plt.gca()
ax.grid(True, axis='y')
for side in ('top', 'left', 'right'):
    ax.spines[side].set_visible(False)
ax.spines['bottom'].set_position('zero')
plt.show()

Because of the shape of the two distributions, there is a point at which the less risky group is perceived as riskier than the other. That point should be found, and the parameters should be adjusted from there on, to ensure a fair pricing mechanism.

In [None]:
# Alpha distribution parameters (shape, loc, scale) for each wallet type
hot_params = (0.00027134742268094484, -27.218316239004885, 84.65056159497132)
cold_params = (0.0023841802141549885, -21.945890970672338, 82.51257794351142)


def cdf_gap(day):
    """Return the hot-wallet CDF minus the cold-wallet CDF at the given day."""
    return alpha.cdf(day, *hot_params) - alpha.cdf(day, *cold_params)


# Scan days 0..299 for the day with the largest positive gap between the two
# CDFs. On ties the earliest day wins; if the gap is never positive, j stays 0.
b = 0  # largest gap found so far
j = 0  # day at which that gap occurs
for i in range(300):
    gap = cdf_gap(i)
    if gap > b:
        b, j = gap, i

print('Parameter adjustment should be made after: %s days'%j)

# CDF values of both groups at the selected day
x = alpha.cdf(j, *hot_params)
y = alpha.cdf(j, *cold_params)
print("The value of the CDFs at that point is: %s for hot wallets and %s for cold wallets" % (x, y))