# Source Code Analysis

## Initialization

In [1]:
import os
import sys

def add_path(path):
    """Make ``path`` importable by putting it at the front of ``sys.path``.

    The path is added only when it is not already listed, so calling this
    repeatedly with the same value leaves ``sys.path`` unchanged.

    Parameters
    ----------
    path : str
        Directory to add to the module search path.
    """
    if path not in sys.path:
        # A single entry at the front is enough: it already takes precedence
        # during import resolution. Appending the same path again at the end
        # only created a duplicate entry.
        sys.path.insert(0, path)
# add_path('/home/jjian03/anaconda3/lib/python3.7/site-packages')
# add_path(f'{os.path.abspath(os.path.join("."))}/lib')


### Load Data

In [2]:
import time, datetime, numpy as np, pandas as pd

# Seed NumPy's global random number generator.
seed = 77
np.random.seed(seed)

data_file = 'untrunc_data_cleaned_url.csv'

# Comma-separated file; the first column supplies the row labels.
# Rows containing any missing value are discarded right away.
raw_data = pd.read_csv(data_file, index_col=0).dropna()

# Column overview followed by the (rows, columns) shape.
raw_data.info()
print(f'raw_data: {raw_data.shape}')

<class 'pandas.core.frame.DataFrame'>
Int64Index: 58875 entries, 0 to 58911
Data columns (total 42 columns):
 #   Column                                  Non-Null Count  Dtype  
---  ------                                  --------------  -----  
 0   protocol_type                           58875 non-null  float64
 1   has_www                                 58875 non-null  float64
 2   has_iframe                              58875 non-null  float64
 3   int                                     58875 non-null  float64
 4   org                                     58875 non-null  float64
 5   gov                                     58875 non-null  float64
 6   in                                      58875 non-null  float64
 7   eu                                      58875 non-null  float64
 8   cn                                      58875 non-null  float64
 9   kr                                      58875 non-null  float64
 10  url_depth                               58875 non-null  fl

### Train Test Split

In [3]:
from sklearn.model_selection import train_test_split
from sklearn import preprocessing


# Feature matrix: everything except the target and the two other excluded columns.
x = raw_data.drop(columns=['label', 'first_appear', 'url'])

# Standardise the target. The fitted scaler is kept so predictions can be
# mapped back to the original scale later on.
y_scaler = preprocessing.StandardScaler()
label_column = raw_data['label'].values.reshape(-1, 1)
label_scaled = y_scaler.fit_transform(label_column)
# One-dimensional Series with a fresh 0..n-1 index and the name 0.
y = pd.Series(label_scaled[:, 0], name=0)


# Hold out 33% of the rows; the split is positional, so x and y stay paired.
X_train, X_test, y_train, y_test = train_test_split(
    x, y, random_state=seed, test_size=0.33)

In [4]:
import gc
import multiprocessing

import warnings
# Suppress every warning from this point on.
warnings.filterwarnings("ignore")


# Allocate all CPUs that multiprocessing reports for this machine.
allocated_cpu = cpu_cnt = multiprocessing.cpu_count()
print(f"Allocated {allocated_cpu} CPUs")
# Last expression of the cell: the number of unreachable objects collected.
gc.collect()

Allocated 16 CPUs


20

#### L-BFGS-B

In [5]:
from concurrent.futures.thread import ThreadPoolExecutor

from sklearn.model_selection import GridSearchCV

from Toolbox import ParameterGenerator
from tobit import *


start_time = time.time()

# Censoring bounds handed to the regressor: unbounded on the left, and the
# largest (standardised) target value on the right.
# `np.inf` replaces `math.inf` because `math` is not imported in the visible
# cells; it was only reachable as a side effect of `from tobit import *`.
p_censor_left = -np.inf
# Vectorised maximum, converted to a plain float like the built-in max() gave.
p_censor_right = float(y.max())


# Candidate hyper-parameter values.
# NOTE(review): this grid is not passed to any search in the visible cells;
# the regressor below is fitted with fixed C and alpha.
param_tr = {
    'C': [round(i, 2) for i in np.arange(0, 70, 0.4).tolist()],
    'alpha': [.55, 1],
}

tr = TobitRegressor(
    p_censor_left=p_censor_left,
    p_censor_right=p_censor_right,
    C=49.8,
    alpha=1,
    verbose=True,
)

# Last expression of the cell, so the fitted estimator is displayed.
tr.fit(X_train, y_train)


      fun: 17918.27135185433
 hess_inv: <41x41 LbfgsInvHessProduct with dtype=float64>
      jac: array([-8.23377256e+00,  3.14493278e+01, -8.40393370e-01, -1.13238087e+01,
        9.52095976e+00, -2.55298004e-01,  2.70690815e+00,  5.64253671e+01,
        5.18508752e+01, -4.43221465e+01, -5.07738684e+01, -3.45887184e+00,
        2.00241038e-02,  1.35317508e+00, -4.38884492e+00,  1.36754962e+00,
       -2.31780396e+01,  7.07145786e+00,  1.79325114e+01,  2.08855966e+01,
        1.89570750e+01,  1.56369325e+01,  6.94822093e+00,  6.34989952e+00,
       -3.26293075e+01,  4.66239422e+01,  6.35926847e+01,  7.27634021e+01,
       -3.87309743e+01, -4.01852202e-01,  2.83143473e+01, -1.18607041e+00,
        1.37064684e-01,  2.66583806e+00,  1.13703432e+00, -5.98335301e+00,
       -9.88503164e+00, -1.26277565e+01, -1.77231629e+01,  3.47348876e+00,
        1.54603380e+01])
  message: b'CONVERGENCE: REL_REDUCTION_OF_F_<=_FACTR*EPSMCH'
     nfev: 174
      nit: 138
   status: 0
  success: True
      

TobitRegressor(C=49.8, alpha=1, p_censor_left=-inf,
               p_censor_right=0.2627393823832124, verbose=True)

### Predict

In [6]:
# Rows whose label equals 30, reduced to the feature columns.
# NOTE(review): `x` is not read again in the visible cells — confirm it is still needed.
x = raw_data.loc[raw_data['label'] == 30].drop(
    columns=['label', 'first_appear', 'url'])

# Predictions for the held-out rows, on the standardised target scale the model was fitted on.
pred = tr.predict(X_test)
pred

def get_pseudo_r2(y_true, y_hat):
    """Return the squared Pearson correlation between ``y_true`` and ``y_hat``.

    Both arguments are one-dimensional sequences of equal length; the value is
    the square of the off-diagonal entry of their 2x2 correlation matrix.
    """
    return np.corrcoef(y_true, y_hat)[0, 1] ** 2

# Squared correlation between the held-out targets and the model's predictions
# for the same rows (both on the standardised scale).
get_pseudo_r2(y_test, pred)

0.12378856524451701

In [7]:
# Undo the target standardisation, then shift the result by +1990 and -2021
# (applied in that order, as two separate operations).
# NOTE(review): the two constants look like calendar years — confirm their meaning.
pred_unscaled = y_scaler.inverse_transform(pred)
y_pred = pred_unscaled + 1990 - 2021

In [8]:
# The predictions were produced for X_test, so they must carry X_test's row
# labels. With a default 0..n-1 index, the index joins below would pair each
# prediction with an unrelated row of raw_data.
death_count = pd.Series(np.ravel(y_pred), index=X_test.index, name='death_count')

# Column order of the result: death_count, url, then every other column.
dead_urls = pd.merge(death_count, raw_data.url, how='inner',
         left_index=True, right_index=True)

# Work on a copy without `url` instead of reassigning raw_data, so this cell
# can be re-run without failing on the already-removed column.
other_columns = raw_data.drop(columns='url')
dead_urls = pd.merge(dead_urls, other_columns, how='inner',
         left_index=True, right_index=True)

# Keep rows with a positive predicted value and label 30, excluding any URL
# that starts with http://orcid.org or https://orcid.org
# (str.match anchors the pattern at the start of the string).
is_orcid = dead_urls.url.str.match(r'https?://orcid\.org')
dead_urls = dead_urls[
    (dead_urls.death_count > 0)
    & (dead_urls.label == 30)
    & ~is_orcid
]
# dead_urls.to_csv('survival_prediction.csv')

In [9]:
# Order by predicted value, then keep only the three smallest and three largest rows.
dead_urls = dead_urls.sort_values(by="death_count")
# With six rows or fewer the two slices would overlap (or cover everything),
# so the frame is kept as is rather than writing duplicated rows.
if len(dead_urls) > 6:
    # DataFrame.append was removed in pandas 2.0; pd.concat is the replacement.
    dead_urls = pd.concat([dead_urls.iloc[:3], dead_urls.iloc[-3:]])

dead_urls.to_csv('survival_prediction.csv')