In [1]:
from sys import path

import numpy as np
import pandas as pd 
from scipy.stats import norm
import matplotlib.pyplot as plt 
import seaborn as sns 
sns.set_theme()

%load_ext autoreload
%autoreload 2

# user-written 
import w8_estimation as est 
import w8_LinearModel as lm
import w8_probit as probit
import w8_logit as logit
import marginal_effects as me


In [2]:
# Name of the binary outcome column
y_lab = 'anyuseofforce_coded'

# Raw-data columns, grouped by what they describe
_subject_cols = [   # subject (civilian) characteristics
    'sblack', 'shisp', 'swhite', 'smale', 'sother',
    'sage', 'sempl', 'sincome', 'spop', 'sbehavior',
]
_officer_cols = [   # officer characteristics
    'omajblack', 'omajhisp', 'omajwhite', 'omajother', 'osplit',
]
_encounter_cols = [  # encounter characteristics
    'daytime', 'inctype_lin', 'year',
]

# Full column list: outcome first, then the three groups in order
rawdat_columns = [y_lab, *_subject_cols, *_officer_cols, *_encounter_cols]


In [3]:
# Final X-matrix variable labels.
# Entries that are commented out are candidate regressors currently switched off;
# only the uncommented names end up as columns of the regressor matrix.
# NOTE(review): the group headers say "white is reference", yet 'swhite' is the
# only active regressor here -- confirm which category is meant to be the baseline.
x_lab = [
    # Subject vars (white is reference)
    'swhite',
    #'shisp',
    #'sblack',
    #'smale',
    #'sage',
    #'sincome', 
    #'sempl',
    #'spop', 
    #'inctype_lin',
    #'sbehavior',

    # Officer vars (white is reference)
    #'omajblack', --> excluded because there are none (presumably no use-of-force
    #                 cases with this officer group -- TODO confirm)
    #'omajwhite',
    #'omajother',

    # Encounter vars
    #'daytime'
]

In [4]:
def make_desc(df):
    """Return the mean and standard deviation of every described column of ``df``.

    The result has one row per variable reported by ``DataFrame.describe``,
    the two columns ``'mean'`` and ``'std'``, and an index named ``'Variable'``.
    """
    stats = df.describe().loc[['mean', 'std']]
    # Variables go on the rows, statistics on the columns
    return stats.T.rename_axis('Variable')

In [5]:
# Load the dataset from 'ppcs_cc.csv'; the path is relative to the working directory.
dat = pd.read_csv('ppcs_cc.csv')

In [6]:
# Work on a copy that holds only the numeric columns
sub = dat.select_dtypes(include='number').copy()

# Panel A: every observation
panel_all = make_desc(sub)

# Panel B: only the rows where the outcome equals 1
panel_force = make_desc(sub.loc[sub[y_lab].eq(1)])

# Put the two panels side by side under a two-level column header
summary = pd.concat(
    [panel_all, panel_force],
    axis=1,
    keys=['All observations', 'Any use of force = 1'],
)

summary_rounded = summary.round(3)
print(summary_rounded)

                    All observations         Any use of force = 1        
                                mean     std                 mean     std
Variable                                                                 
sblack                         0.111   0.314                0.158   0.375
shisp                          0.102   0.302                0.316   0.478
swhite                         0.739   0.439                0.474   0.513
sother                         0.049   0.215                0.053   0.229
smale                          0.530   0.499                0.789   0.419
sage                          41.010  16.147               30.789  11.193
sempl                          0.695   0.460                0.474   0.513
sincome                        2.165   0.848                2.053   0.848
spop                           1.363   0.766                1.947   1.079
daytime                        0.666   0.472                0.474   0.513
inctype_lin                    1.958  

In [7]:
def summary_to_latex(df, filename=None, caption=None, label=None):
    """
    Render a DataFrame with two-level (panel, statistic) columns as a LaTeX table.

    The table is assembled by hand instead of calling ``DataFrame.to_latex``,
    so it also works when jinja2 is not installed.

    Parameters
    ----------
    df : pandas.DataFrame
        Table whose columns form a two-level MultiIndex: level 0 is the panel
        name, level 1 the statistic name. Every cell must be numeric, because
        cells are formatted with three decimals.
    filename : str or path-like, optional
        If given, the LaTeX source is additionally written to this file (UTF-8).
    caption : str, optional
        Inserted verbatim as the table caption, so it may contain LaTeX markup.
    label : str, optional
        Inserted verbatim as the table label.

    Returns
    -------
    str
        The complete ``table`` environment as a single newline-joined string.

    Notes
    -----
    Row labels, panel names and statistic names are escaped for LaTeX text
    mode; otherwise names such as ``inctype_lin`` stop the document compiling.
    """
    # LaTeX special characters and their text-mode replacements. A translation
    # table rewrites each character exactly once, so the order does not matter.
    escape_table = str.maketrans({
        "\\": r"\textbackslash{}",
        "&": r"\&",
        "%": r"\%",
        "$": r"\$",
        "#": r"\#",
        "_": r"\_",
        "{": r"\{",
        "}": r"\}",
        "~": r"\textasciitilde{}",
        "^": r"\textasciicircum{}",
    })

    def tex(text):
        # str() first so that non-string labels (e.g. integer indices) work too
        return str(text).translate(escape_table)

    level_0 = list(df.columns.get_level_values(0))
    level_1 = list(df.columns.get_level_values(1))

    # Group *consecutive* columns sharing a panel name, so every \multicolumn
    # spans exactly the columns beneath it even if a panel name reappears later.
    panels = []
    for name in level_0:
        if panels and panels[-1][0] == name:
            panels[-1][1] += 1
        else:
            panels.append([name, 1])

    lines = [
        r"\begin{table}[htbp]",
        r"\centering",
        r"\begin{tabular}{l" + "r" * len(df.columns) + "}",
        r"\hline",
    ]

    # First header row: an empty cell above the row labels, then the panel names
    header1 = [""]
    for name, count in panels:
        header1.append(r"\multicolumn{" + str(count) + r"}{c}{" + tex(name) + r"}")
    lines.append(" & ".join(header1) + r" \\")

    # Second header row: the statistic name of every column
    header2 = ["Variable"] + [tex(stat) for stat in level_1]
    lines.append(" & ".join(header2) + r" \\")
    lines.append(r"\hline")

    # Data rows: escaped row label followed by the values with three decimals
    for idx, row in df.iterrows():
        cells = [tex(idx)] + [f"{v:.3f}" for v in row]
        lines.append(" & ".join(cells) + r" \\")

    lines.append(r"\hline")
    lines.append(r"\end{tabular}")

    if caption:
        lines.append(r"\caption{" + caption + r"}")
    if label:
        lines.append(r"\label{" + label + r"}")

    lines.append(r"\end{table}")

    latex_str = "\n".join(lines)

    if filename is not None:
        # Explicit encoding keeps the file identical across platforms
        with open(filename, "w", encoding="utf-8") as f:
            f.write(latex_str)

    return latex_str


In [8]:
# Export the rounded descriptive table: the LaTeX source is written to
# 'descriptive_table.tex' and also kept in `latex_code`.
latex_code = summary_to_latex(
    summary_rounded,
    filename="descriptive_table.tex",
    caption="Descriptive statistics",
    label="tab:desc_stats"
)

# Print the same source so it can be inspected or copied from the notebook
print(latex_code)


\begin{table}[htbp]
\centering
\begin{tabular}{lrrrr}
\hline
 & \multicolumn{2}{c}{All observations} & \multicolumn{2}{c}{Any use of force = 1} \\
Variable & mean & std & mean & std \\
\hline
sblack & 0.111 & 0.314 & 0.158 & 0.375 \\
shisp & 0.102 & 0.302 & 0.316 & 0.478 \\
swhite & 0.739 & 0.439 & 0.474 & 0.513 \\
sother & 0.049 & 0.215 & 0.053 & 0.229 \\
smale & 0.530 & 0.499 & 0.789 & 0.419 \\
sage & 41.010 & 16.147 & 30.789 & 11.193 \\
sempl & 0.695 & 0.460 & 0.474 & 0.513 \\
sincome & 2.165 & 0.848 & 2.053 & 0.848 \\
spop & 1.363 & 0.766 & 1.947 & 1.079 \\
daytime & 0.666 & 0.472 & 0.474 & 0.513 \\
inctype_lin & 1.958 & 0.200 & 1.684 & 0.478 \\
omajblack & 0.061 & 0.239 & 0.000 & 0.000 \\
omajhisp & 0.024 & 0.153 & 0.053 & 0.229 \\
omajwhite & 0.904 & 0.295 & 0.947 & 0.229 \\
omajother & 0.012 & 0.107 & 0.000 & 0.000 \\
osplit & 0.000 & 0.000 & 0.000 & 0.000 \\
sbehavior & 0.065 & 0.247 & 0.526 & 0.513 \\
year & 2011.000 & 0.000 & 2011.000 & 0.000 \\
anyuseofforce_coded & 0.005 & 

In [9]:
# Re-read the raw file so this cell always starts from the full dataset,
# regardless of what earlier cells did with `dat`.
dat = pd.read_csv('ppcs_cc.csv')

# Number of rows in the raw file
N = dat.shape[0]

# Keep only the outcome and the selected regressors, outcome first
dat = dat[[y_lab] + x_lab].copy()

# The estimators below receive plain arrays, so stop here if anything is missing
assert dat.notnull().all(axis=1).all(), 'Missings in the dataset, take them out!'

# Preview as the last expression of the cell: a bare expression in the middle
# of a cell is evaluated and discarded, so it never showed up before.
dat.head(5)

In [10]:
# Outcome vector and regressor matrix as plain NumPy arrays
# NOTE(review): x holds only the columns listed in x_lab; no constant column is
# added in this notebook -- confirm whether the estimation modules add an
# intercept themselves, otherwise the models are fitted without one.
y = dat[y_lab].to_numpy()
x = dat[x_lab].to_numpy()

# Number of regressors = number of columns of x
K = x.shape[-1]

# Quick sanity check: the rank should equal the number of columns
print(f"Shape x: {x.shape}")
print(f"Rank x: {np.linalg.matrix_rank(x)}")
y.shape

Shape x: (3799, 1)
Rank x: 1


(3799,)

In [11]:
# OLS estimates of the linear probability model (LPM), requested with robust_se=True.
# NOTE(review): the saved output reports R2 = -0.004 and sigma2 = nan, which
# deserves a check -- possibly related to x having no constant column; TODO confirm.
ols_results = lm.estimate(y, x, robust_se=True)
# Build the results table, labelled with the outcome and regressor names
ols_tab = lm.print_table((y_lab, x_lab), ols_results, title='LPM results')
ols_tab

LPM results
Dependent variable: anyuseofforce_coded

R2 = -0.004
sigma2 = nan


Unnamed: 0,b_hat,se,t
swhite,0.0032,0.0011,3.0048


In [12]:
# Starting values for the probit optimisation
theta0 = probit.starting_values(y, x)
# Displays whether the starting values form a one-dimensional vector
theta0.ndim==1

True

In [13]:
# Evaluate the probit log-likelihood at the starting values
ll = probit.loglikelihood(theta0, y, x)
# NOTE(review): the hard-coded reference mean does not match on this data (the
# saved output is False); it looks like a leftover check for a different
# dataset or specification -- confirm, then update or remove.
np.isclose(np.mean(ll), -1.0411283428047824)

np.False_

In [14]:
#probit_results = est.estimate(probit.q, theta0, y, x)

In [15]:
#probit_tab = est.print_table(x_lab, probit_results, title=f'Probit, y = {y_lab}')
#probit_tab

In [16]:
# Starting values for the logit optimisation; this overwrites the probit `theta0`
theta0 = logit.starting_values(y, x)
# Display them for inspection
theta0 

array([0.01282051])

In [17]:
# Evaluate the logit log-likelihood at the starting values
ll = logit.loglikelihood(theta0, y, x)
# NOTE(review): the hard-coded reference mean does not match on this data (the
# saved output is False); it looks like a leftover check for a different
# dataset or specification -- confirm, then update or remove.
np.isclose(np.mean(ll),-0.9974267061091704)

np.False_

In [18]:
# Fit the logit model: est.estimate receives logit.q, the starting values and the
# data (y, x). The result is later read through its 'theta' and 'cov' entries.
logit_results = est.estimate(logit.q, theta0, y, x)

Optimization terminated successfully.
         Current function value: 0.196784
         Iterations: 11
         Function evaluations: 24
         Gradient evaluations: 12


In [19]:
# Tabulate the logit estimates with the regressor labels from x_lab;
# the bare `logit_tab` on the last line displays the table.
logit_tab = est.print_table(x_lab, logit_results, title=f'Logit, y = {y_lab}')
logit_tab

Optimizer succeeded after 11 iter. (24 func. evals.). Final criterion:   0.1968.
Logit, y = anyuseofforce_coded


Unnamed: 0,theta,se,t
swhite,-5.7376,0.3339,-17.1851


In [21]:
# Reference covariate profile used for marginal effects.
# The profile is hard-coded with a single entry (swhite = 1), so its length has
# to be kept in sync with x_lab by hand whenever regressors are added.
x_ref = np.array([1], dtype=float)
# One-row frame labelled with x_lab, shown as the cell output for inspection
pd.DataFrame(x_ref.reshape(1, -1), columns=x_lab, index=['x_ref'])


Unnamed: 0,swhite
x_ref,1.0


In [26]:
# Delta-method partial effects for the logit model at the reference profile
# x_ref, with the LPM coefficient of the same regressor shown for comparison.
b_lg = logit_results['theta']
cov_lg = logit_results['cov']

# Dummy regressors that get a discrete (0 -> 1) effect
binary_vars = ['swhite']

me_rows = []

for var in binary_vars:
    # Look up the column of *this* variable; a hard-coded name here would make
    # every entry of binary_vars reuse the same column.
    idx = x_lab.index(var)
    # Two copies of the reference profile that differ only in the dummy
    x0 = x_ref.copy()
    x1 = x_ref.copy()
    x0[idx] = 0.0
    x1[idx] = 1.0
    effect, se = me.discrete_effect_delta(b_lg, cov_lg, x0, x1, logit.G)
    me_rows.append({
        'Var': var,
        'Effect (LPM)': ols_results['b_hat'][idx],
        'Effect (Logit)': effect,
        's.e. (Logit)': se,
    })

# Continuous regressors are currently disabled (x_lab contains only 'swhite').
# Re-enable together with a `continuous_vars` list when such regressors are added:
#for var in continuous_vars:
#    idx = x_lab.index(var)
#    effect, se = me.continuous_effect_delta(b_lg, cov_lg, x_ref, idx, logit.Gprime)
#    me_rows.append({
#        'Var': var,
#        'Effect (LPM)': ols_results['b_hat'][idx],
#        'Effect (Logit)': effect,
#        's.e. (Logit)': se,
#    })

# One row per variable; the t-statistic is the effect over its standard error
me_tab = pd.DataFrame(me_rows).set_index('Var')
me_tab['t (Logit)'] = me_tab['Effect (Logit)'] / me_tab['s.e. (Logit)']


In [27]:
# Display the marginal-effects table rounded to four decimals
me_tab.round(4)


Unnamed: 0_level_0,Effect (LPM),Effect (Logit),s.e. (Logit),t (Logit)
Var,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
swhite,0.0032,-0.4968,0.0011,-464.7107


In [28]:
# Average partial effects (APE) from the fitted logit, based on the derivative
# formula P * (1 - P) * beta_k evaluated at every observation and then averaged.
# NOTE(review): 'swhite' is treated as a dummy elsewhere in this notebook; the
# derivative formula is only an approximation for a 0/1 regressor -- confirm
# that this is the intended definition.
X = dat[x_lab].to_numpy()
beta_hat = b_lg.copy()  # estimated logit coefficients, copied from b_lg

# Linear index and the implied logit probabilities
xb = X @ beta_hat
P = 1 / (1 + np.exp(-xb))

# Observation-by-regressor matrix of partial effects
PE = np.outer(P * (1 - P), beta_hat)

# Column means give one APE per regressor
APE = PE.mean(axis=0)

print("Average Partial Effects (APE):")
for k, name in enumerate(x_lab):
    print(f"{name:15s}: {APE[k]:.4f}")

Average Partial Effects (APE):
swhite         : -0.3878
