### Estimate VAR and PanelOLS

Get the data on python package downloads and run a VAR and a PanelOLS model

In [15]:
import statsmodels.api as sm
from statsmodels.tsa.stattools import grangercausalitytests
import pandas as pd

In [3]:
# Load the 50k data
#df = pd.read_csv("data/bq-results-20230603-222533-1685831182985.csv")
df = pd.read_csv("data/bq-results-20230603-234413-1685835893436.csv.zip")
df

Unnamed: 0,timestamp_date,file_project,installer_name,downloads
0,2023-06-03,0-orchestrator,pip,1
1,2023-06-03,0-orchestrator,Browser,1
2,2023-06-03,0-orchestrator,bandersnatch,26
3,2023-06-03,0rest,bandersnatch,2
4,2023-06-03,0rest,pip,2
...,...,...,...,...
1619282,2023-05-04,zombiedice,pip,1
1619283,2023-05-04,zwave-me-ws,pip,4
1619284,2023-05-04,zxcvbn,requests,43
1619285,2023-05-04,zxcvbn,,5


In [4]:
df['installer_name'].unique()

array(['pip', 'Browser', 'bandersnatch', nan, 'setuptools', 'Nexus',
       'requests', 'devpi', 'pdm', 'Homebrew', 'Artifactory', 'OS',
       'Bazel', 'pex', 'conda', 'chaquopy'], dtype=object)

In [5]:
df['bot_or_not'] = df['installer_name'].replace({'pip': 'human', 
                                                 'Browser': 'bot', 
                                                 'bandersnatch': 'bot',
                                                 'setuptools': 'human',
                                                 'Nexus': 'human',
                                                 'requests': 'bot',
                                                 'devpi': 'bot',
                                                 'pdm': 'human',
                                                 'Homebrew': 'human',
                                                 'Artifactory': 'human',
                                                 'OS': 'human',
                                                 'Bazel': 'human',
                                                 'pex': 'human',
                                                 'conda': 'human',
                                                 'chaquopy': 'human',
                                                })
df.drop(['installer_name'], axis = 1)
df['bot_or_not'] = df['bot_or_not'].fillna('bot')
df['bot_or_not'].unique()

array(['human', 'bot'], dtype=object)

In [6]:
# Convert the long dataset into a wide dataset by installer_name
wide_df = df.pivot_table(index=['timestamp_date', 'file_project'], columns='bot_or_not', values='downloads', fill_value=0)

# Reset the index to have a clean output
wide_df = wide_df.reset_index()
wide_df


bot_or_not,timestamp_date,file_project,bot,human
0,2023-05-04,3to2,0.0,61.0
1,2023-05-04,4chan-biz-mentions,1.0,0.0
2,2023-05-04,51degrees-mobile-detector-v3-wrapper,0.0,18.0
3,2023-05-04,a,16.0,6.0
4,2023-05-04,a-cv-imwrite-imread-plus,0.0,1.0
...,...,...,...,...
926108,2023-06-03,zzu-low2,1.0,0.0
926109,2023-06-03,zzx-deep-genome,6.0,0.0
926110,2023-06-03,zzz,2.0,1.0
926111,2023-06-03,zzz233,1.0,0.0


In [11]:
# Create the VAR model
model = sm.tsa.VAR(wide_df[['human', 'bot']])

selected_order = model.select_order(maxlags=10)
# Fit the VAR model with the selected lag order
results = model.fit(maxlags=selected_order.aic)
print(results.params)

                human        bot
const      342.288423  38.647960
L1.human     0.005393  -0.000257
L1.bot      -0.036437   0.001911
L2.human     0.001672  -0.000386
L2.bot       0.094440   0.009362
L3.human     0.008021  -0.000236
L3.bot      -0.027443   0.003151
L4.human     0.003158  -0.000122
L4.bot      -0.026132   0.000743
L5.human     0.008221  -0.000029
L5.bot      -0.059953   0.000120
L6.human     0.006190  -0.000122
L6.bot      -0.046532   0.000594
L7.human     0.009477  -0.000154
L7.bot      -0.064964   0.001152
L8.human     0.017327   0.000068
L8.bot      -0.119794  -0.000428
L9.human     0.004210   0.000027
L9.bot      -0.027368   0.000170
L10.human    0.006307  -0.000112
L10.bot     -0.046645   0.000566


In [18]:
lag_order = results.k_ar
causality_results = grangercausalitytests(wide_df[["human", "bot"]], lag_order)


Granger Causality
number of lags (no zero) 1
ssr based F test:         F=6.7630  , p=0.0093  , df_denom=926109, df_num=1
ssr based chi2 test:   chi2=6.7630  , p=0.0093  , df=1
likelihood ratio test: chi2=6.7630  , p=0.0093  , df=1
parameter F test:         F=6.7630  , p=0.0093  , df_denom=926109, df_num=1

Granger Causality
number of lags (no zero) 2
ssr based F test:         F=20.0907 , p=0.0000  , df_denom=926106, df_num=2
ssr based chi2 test:   chi2=40.1816 , p=0.0000  , df=2
likelihood ratio test: chi2=40.1807 , p=0.0000  , df=2
parameter F test:         F=20.0907 , p=0.0000  , df_denom=926106, df_num=2

Granger Causality
number of lags (no zero) 3
ssr based F test:         F=14.6439 , p=0.0000  , df_denom=926103, df_num=3
ssr based chi2 test:   chi2=43.9321 , p=0.0000  , df=3
likelihood ratio test: chi2=43.9311 , p=0.0000  , df=3
parameter F test:         F=14.6439 , p=0.0000  , df_denom=926103, df_num=3

Granger Causality
number of lags (no zero) 4
ssr based F test:         F=11

In [19]:
causality_results = grangercausalitytests(wide_df[["bot", "human"]], lag_order)


Granger Causality
number of lags (no zero) 1
ssr based F test:         F=1.1575  , p=0.2820  , df_denom=926109, df_num=1
ssr based chi2 test:   chi2=1.1575  , p=0.2820  , df=1
likelihood ratio test: chi2=1.1575  , p=0.2820  , df=1
parameter F test:         F=1.1575  , p=0.2820  , df_denom=926109, df_num=1

Granger Causality
number of lags (no zero) 2
ssr based F test:         F=1.8942  , p=0.1504  , df_denom=926106, df_num=2
ssr based chi2 test:   chi2=3.7885  , p=0.1504  , df=2
likelihood ratio test: chi2=3.7885  , p=0.1504  , df=2
parameter F test:         F=1.8942  , p=0.1504  , df_denom=926106, df_num=2

Granger Causality
number of lags (no zero) 3
ssr based F test:         F=1.6061  , p=0.1856  , df_denom=926103, df_num=3
ssr based chi2 test:   chi2=4.8184  , p=0.1856  , df=3
likelihood ratio test: chi2=4.8184  , p=0.1856  , df=3
parameter F test:         F=1.6061  , p=0.1856  , df_denom=926103, df_num=3

Granger Causality
number of lags (no zero) 4
ssr based F test:         F=1.

### Panel OLS
from linearmodels.panel import PanelOLS
from linearmodels import PooledOLS
import statsmodels.api as sm

# Set the index to the firm and year variables
df = wide_df.set_index(['timestamp_date', 'file_project'])
wide_df['bot_numeric'] = pd.to_numeric(wide_df['bot'], errors='coerce')

file_project_dummies = pd.get_dummies(wide_df['file_project'], prefix='file_project', drop_first=True)

# Add the file_project dummies to the DataFrame
data = pd.concat([wide_df, file_project_dummies], axis=1)

# Specify the dependent variable
y = wide_dm['human']

# Specify the independent variables (including lagged 'human')
X = sm.add_constant(wide_df[['bot', 'human']].shift(1))

# Perform the panel regression with fixed effects
model = sm.OLS(y, X)
fixed_effects_results = model.fit()

# Print the model summary
print(fixed_effects_results.summary())