In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy.optimize import minimize

In [2]:
# Read the Stata panel into a DataFrame and render it with the notebook's rich display.
# NOTE(review): this is an absolute, machine-specific Windows path; the notebook
# will only run where this exact file location exists.
data_path = 'C:\\Users\\bayle\\Documents\\HW4.dta'
data = pd.read_stata(data_path)
display(data)

Unnamed: 0,person_ID,year,log_wage
0,4.0,1982,9.615806
1,5.0,1974,9.479604
2,5.0,1975,9.384294
3,5.0,1976,7.228388
4,5.0,1977,9.428673
...,...,...,...
33774,35355.0,1975,9.026418
33775,35355.0,1976,8.780787
33776,35355.0,1977,9.086702
33777,35356.0,1975,8.348064


In [3]:
# Collect the column labels of the loaded panel into a plain Python list and show them.
variables_list = list(data.columns)
print(variables_list)

['person_ID', 'year', 'log_wage']


In [3]:
# Cross-sectional variance of log wages within each year (kept for inspection).
variance_by_year = data.groupby('year')['log_wage'].var()

# Reshape to one row per person and one column per year, then take the
# year-by-year covariance matrix of log wages (pandas computes it pairwise,
# skipping missing person-year cells).
covariance_across_years = data.pivot_table(index='person_ID', columns='year', values='log_wage').cov()

# Stack the lower triangle (diagonal included) row by row into a column vector.
# The entries are selected by position rather than by filtering on `!= 0`, so a
# covariance that happens to be exactly zero is kept and the vector length is
# always n_years * (n_years + 1) / 2.
_cov_values = covariance_across_years.to_numpy()
data_moments = _cov_values[np.tril_indices_from(_cov_values)].reshape(-1, 1)

## VVD simulation

In [4]:
def simulate_income_data(p, num_individuals, num_periods, seed=4):
    """Simulate an income panel and return its stacked covariance moments.

    The simulated process is ``income[i, t] = alpha[i] + z[i, t] + epsilon[i, t]``
    with ``z[i, t] = rho * z[i, t-1] + eta[i, t]`` and ``z`` started at 0 before
    the first period. All shocks are mean-zero normal draws.

    Parameters
    ----------
    p : sequence of 4 floats
        ``(var_alpha, rho, var_eta, var_epsilon)``: variance of the individual
        fixed effect, AR(1) persistence, variance of the AR(1) shock and
        variance of the transitory shock.
    num_individuals : int
        Number of simulated individuals (rows of the panel).
    num_periods : int
        Number of simulated periods (columns of the panel).
    seed : int, optional
        Seed of the random stream. Defaults to 4, the value that was
        previously hard-coded.

    Returns
    -------
    numpy.ndarray
        Column vector of shape ``(num_periods * (num_periods + 1) / 2, 1)``
        holding the lower triangle (diagonal included) of the across-period
        sample covariance matrix, stacked row by row.
    """
    var_alpha, rho, var_eta, var_epsilon = p

    # A private legacy generator yields the same stream as np.random.seed(seed)
    # but leaves the global NumPy random state untouched.
    rng = np.random.RandomState(seed)

    # Individual-specific fixed effects are drawn first.
    individual_fixed_effects = rng.normal(scale=np.sqrt(var_alpha), size=num_individuals)

    # One block of standard normals laid out as [individual, period, (eta, epsilon)].
    # It is filled in C order, i.e. in the same order as drawing eta and then
    # epsilon inside a loop over individuals and periods.
    shocks = rng.normal(size=(num_individuals, num_periods, 2))
    eta = np.sqrt(var_eta) * shocks[:, :, 0]
    epsilon = np.sqrt(var_epsilon) * shocks[:, :, 1]

    incomes = np.empty((num_individuals, num_periods))
    z_process = np.zeros(num_individuals)
    for t in range(num_periods):
        # AR(1) update applied to every individual at once.
        z_process = rho * z_process + eta[:, t]
        incomes[:, t] = individual_fixed_effects + z_process + epsilon[:, t]

    # Periods are the variables, individuals the observations.
    cov_matrix = np.atleast_2d(np.cov(incomes.T))

    # Select the lower triangle by position; filtering on `!= 0` would drop
    # covariances that are exactly zero and change the length of the vector.
    return cov_matrix[np.tril_indices_from(cov_matrix)].reshape(-1, 1)

In [5]:
# Simulate income data
# Trial parameter vector; the simulator unpacks it as
# (var_alpha, rho, var_eta, var_epsilon).
params = [0.1, 0.8, 0.1, 0.1]

# Simulated moments for 1000 individuals observed over 19 periods.
simulated_moments = simulate_income_data(params, 1000, 19)

In [6]:
def metric(p, data_moments, num_individuals, num_periods):
    """Unweighted distance between simulated and empirical moments.

    Simulates moments at parameter vector ``p`` via ``simulate_income_data``
    and returns the sum of squared differences to ``data_moments``, computed
    as ``d.T . d``. For column-vector moments the result is a 1x1 array.
    """
    residual = simulate_income_data(p, num_individuals, num_periods) - data_moments

    # Inner product of the residual with itself (identity weighting).
    return residual.T.dot(residual)

In [7]:
# Objective value at the trial parameters (1000 simulated individuals, 19 periods).
dist = metric(params, data_moments, num_individuals=1000, num_periods=19)
print(dist)

[[2.21436166]]


In [8]:
def SMM(p, num_individuals, num_periods, data_moments):
    """Estimate the income-process parameters by simulated method of moments.

    Minimises ``metric`` over ``(var_alpha, rho, var_eta, var_epsilon)`` with
    bounded Nelder-Mead, starting from ``p``.

    Parameters
    ----------
    p : array-like of 4 floats
        Starting values for the optimiser.
    num_individuals, num_periods : int
        Size of the simulated panel, forwarded to ``metric``.
    data_moments : numpy.ndarray
        Empirical moments, forwarded to ``metric``.

    Returns
    -------
    numpy.ndarray
        The parameter vector at which the optimiser stopped.

    Warns
    -----
    RuntimeWarning
        If the optimiser reports that it did not converge; the last iterate
        is still returned.
    """
    import warnings

    # Every parameter is restricted to the unit interval.
    bounds = [(0, 1), (0, 1), (0, 1), (0, 1)]

    result = minimize(
        metric,
        p,
        args=(data_moments, num_individuals, num_periods),
        method='Nelder-Mead',
        bounds=bounds,
    )

    # Do not hand back an unconverged iterate silently.
    if not result.success:
        warnings.warn(
            f"SMM: Nelder-Mead did not converge ({result.message}); returning the last iterate.",
            RuntimeWarning,
            stacklevel=2,
        )

    return result.x

In [9]:
# Starting point for the optimiser, in the order the simulator unpacks it:
# (var_alpha, rho, var_eta, var_epsilon).
initial_params = [0.5, 0.9, 0.5, 0.5]
initial_params_array = np.array(initial_params)

In [244]:
# Run the estimator from the initial guess with 1000 simulated individuals and 19 periods.
coeffs = SMM(initial_params_array, num_individuals=1000, num_periods=19, data_moments=data_moments)
print(coeffs)

[0.13803823 0.91605702 0.05730137 0.20404534]


## Now that the code runs, let's run it using 11 different values for the seed (1 of which is the seed used above)

In [10]:
def simulate_income_data_seeded(p, num_individuals, num_periods, seed):
    """Simulate an income panel for a given seed and return its covariance moments.

    The simulated process is ``income[i, t] = alpha[i] + z[i, t] + epsilon[i, t]``
    with ``z[i, t] = rho * z[i, t-1] + eta[i, t]`` and ``z`` started at 0 before
    the first period. All shocks are mean-zero normal draws.

    Parameters
    ----------
    p : sequence of 4 floats
        ``(var_alpha, rho, var_eta, var_epsilon)``: variance of the individual
        fixed effect, AR(1) persistence, variance of the AR(1) shock and
        variance of the transitory shock.
    num_individuals : int
        Number of simulated individuals (rows of the panel).
    num_periods : int
        Number of simulated periods (columns of the panel).
    seed : int
        Seed of the random stream used for every draw.

    Returns
    -------
    numpy.ndarray
        Column vector of shape ``(num_periods * (num_periods + 1) / 2, 1)``
        holding the lower triangle (diagonal included) of the across-period
        sample covariance matrix, stacked row by row.
    """
    var_alpha, rho, var_eta, var_epsilon = p

    # A private legacy generator yields the same stream as np.random.seed(seed)
    # but leaves the global NumPy random state untouched.
    rng = np.random.RandomState(seed)

    # Individual-specific fixed effects are drawn first.
    individual_fixed_effects = rng.normal(scale=np.sqrt(var_alpha), size=num_individuals)

    # One block of standard normals laid out as [individual, period, (eta, epsilon)].
    # It is filled in C order, i.e. in the same order as drawing eta and then
    # epsilon inside a loop over individuals and periods.
    shocks = rng.normal(size=(num_individuals, num_periods, 2))
    eta = np.sqrt(var_eta) * shocks[:, :, 0]
    epsilon = np.sqrt(var_epsilon) * shocks[:, :, 1]

    incomes = np.empty((num_individuals, num_periods))
    z_process = np.zeros(num_individuals)
    for t in range(num_periods):
        # AR(1) update applied to every individual at once.
        z_process = rho * z_process + eta[:, t]
        incomes[:, t] = individual_fixed_effects + z_process + epsilon[:, t]

    # Periods are the variables, individuals the observations.
    cov_matrix = np.atleast_2d(np.cov(incomes.T))

    # Select the lower triangle by position; filtering on `!= 0` would drop
    # covariances that are exactly zero and change the length of the vector.
    return cov_matrix[np.tril_indices_from(cov_matrix)].reshape(-1, 1)

In [11]:
def metric_seeded(p, data_moments, num_individuals, num_periods, seed):
    """Unweighted distance between simulated and empirical moments for one seed.

    Simulates moments at parameter vector ``p`` via
    ``simulate_income_data_seeded`` using ``seed`` and returns the sum of
    squared differences to ``data_moments``, computed as ``d.T . d``. For
    column-vector moments the result is a 1x1 array.
    """
    residual = simulate_income_data_seeded(p, num_individuals, num_periods, seed) - data_moments

    # Inner product of the residual with itself (identity weighting).
    return residual.T.dot(residual)

In [12]:
def SMM_seeded(p, num_individuals, num_periods, data_moments, seed):
    """Estimate the income-process parameters by SMM for a given simulation seed.

    Minimises ``metric_seeded`` over ``(var_alpha, rho, var_eta, var_epsilon)``
    with bounded Nelder-Mead, starting from ``p``.

    Parameters
    ----------
    p : array-like of 4 floats
        Starting values for the optimiser.
    num_individuals, num_periods : int
        Size of the simulated panel, forwarded to ``metric_seeded``.
    data_moments : numpy.ndarray
        Empirical moments, forwarded to ``metric_seeded``.
    seed : int
        Simulation seed, forwarded to ``metric_seeded``.

    Returns
    -------
    numpy.ndarray
        The parameter vector at which the optimiser stopped.

    Warns
    -----
    RuntimeWarning
        If the optimiser reports that it did not converge; the last iterate
        is still returned.
    """
    import warnings

    # Every parameter is restricted to the unit interval.
    bounds = [(0, 1), (0, 1), (0, 1), (0, 1)]

    result = minimize(
        metric_seeded,
        p,
        args=(data_moments, num_individuals, num_periods, seed),
        method='Nelder-Mead',
        bounds=bounds,
    )

    # Do not hand back an unconverged iterate silently.
    if not result.success:
        warnings.warn(
            f"SMM_seeded (seed={seed}): Nelder-Mead did not converge ({result.message}); "
            "returning the last iterate.",
            RuntimeWarning,
            stacklevel=2,
        )

    return result.x

In [59]:
# Single estimation run with simulation seed 26 (1000 individuals, 19 periods).
coeffs_1 = SMM_seeded(initial_params_array, num_individuals=1000, num_periods=19, data_moments=data_moments, seed=26)
print(coeffs_1)

[0.14918316 0.90575493 0.05983584 0.2040758 ]


In [56]:
# Simulation seeds for the repeated estimation runs (11 values).
seeds = [1, 3, 4, 5, 8, 9, 10, 11, 20, 26, 66]

In [57]:
def seeds_loop(s, p, num_inds, num_pds, data):
    """Run ``SMM_seeded`` once per seed and stack the estimates.

    Parameters
    ----------
    s : sequence of int
        Simulation seeds; one estimation is run for each.
    p : sequence of floats
        Starting parameter vector passed to every run.
    num_inds, num_pds : int
        Size of the simulated panel.
    data : numpy.ndarray
        Empirical moments passed to every run.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(len(s), len(p))``; row ``i`` holds the estimates
        obtained with seed ``s[i]``.
    """
    seeds = np.atleast_1d(np.array(s))

    # Size the output from the inputs instead of assuming 11 seeds and 4
    # parameters, so any number of seeds works.
    values = np.zeros(shape=(len(seeds), len(p)))

    for row, seed in enumerate(seeds):
        values[row, :] = SMM_seeded(p, num_inds, num_pds, data, seed)

    return values

In [58]:
# One estimation per seed; each row of the result corresponds to one entry of `seeds`.
# NOTE(review): the optimiser is started from `params` here, whereas the single
# runs above start from `initial_params_array` - confirm this is intended.
vals = seeds_loop(seeds, params, num_inds=1000, num_pds=19, data=data_moments)
print(vals)

[[0.13899356 0.89841806 0.06233006 0.19873708]
 [0.15572182 0.87074566 0.07022673 0.18661908]
 [0.13805628 0.91609638 0.05726215 0.20407815]
 [0.12391966 0.88979949 0.06694634 0.1860586 ]
 [0.11830516 0.89061022 0.06577114 0.1942395 ]
 [0.13508864 0.87550667 0.06613267 0.19380673]
 [0.16287631 0.86786999 0.07039855 0.18847897]
 [0.12760293 0.87042895 0.06840463 0.18546852]
 [0.12284117 0.90647172 0.05680831 0.20327515]
 [0.14923021 0.90567062 0.05987165 0.20406751]
 [0.15563031 0.87970576 0.06778917 0.18906245]]


## Work in progress from here on: trying to get the weighting matrix working correctly

In [35]:
# Cross-sectional variance of log wages within each year (kept for inspection).
newer_variance_by_year = data.groupby('year')['log_wage'].var()

# Year-by-year covariance matrix of log wages from the person x year panel.
newer_covariance_across_years = data.pivot_table(index='person_ID', columns='year', values='log_wage').cov()

# Lower triangle of the covariance matrix computed just above (upper triangle
# zeroed), rather than of a matrix left over from an earlier cell.
newer_data_moments_raw = np.tril(newer_covariance_across_years.to_numpy())

# Lower-triangle moments (diagonal included) stacked row by row, selected by
# position so the count follows the data instead of a hard-coded 190.
newer_moment_vector = newer_data_moments_raw[np.tril_indices_from(newer_data_moments_raw)]

# Diagonal weighting matrix holding the reciprocal of each data moment.
# The reciprocal is only evaluated where the moment is non-zero (weight 0
# otherwise), which avoids the divide-by-zero warning raised when inverting a
# full matrix whose off-diagonal entries are all zero.
# NOTE(review): a negative covariance yields a negative weight - confirm this
# weighting is what is intended.
newer_inverse_moments = np.divide(
    1.0,
    newer_moment_vector,
    out=np.zeros_like(newer_moment_vector, dtype=float),
    where=newer_moment_vector != 0,
)
newer_weighting_matrix = np.diag(newer_inverse_moments)

  newer_weighting_matrix = np.where(newer_weighting_matrix != 0, 1. / newer_weighting_matrix, 0)


In [39]:
def simulate_income_data_newer(p, num_individuals, num_periods, seed):
    """Simulate an income panel for a given seed and return its covariance moments.

    The simulated process is ``income[i, t] = alpha[i] + z[i, t] + epsilon[i, t]``
    with ``z[i, t] = rho * z[i, t-1] + eta[i, t]`` and ``z`` started at 0 before
    the first period. All shocks are mean-zero normal draws.

    Parameters
    ----------
    p : sequence of 4 floats
        ``(var_alpha, rho, var_eta, var_epsilon)``: variance of the individual
        fixed effect, AR(1) persistence, variance of the AR(1) shock and
        variance of the transitory shock.
    num_individuals : int
        Number of simulated individuals (rows of the panel).
    num_periods : int
        Number of simulated periods (columns of the panel).
    seed : int
        Seed of the random stream used for every draw.

    Returns
    -------
    numpy.ndarray
        Column vector of shape ``(num_periods * (num_periods + 1) / 2, 1)``
        holding the lower triangle (diagonal included) of the across-period
        sample covariance matrix, stacked row by row.
    """
    var_alpha, rho, var_eta, var_epsilon = p

    # A private legacy generator yields the same stream as np.random.seed(seed)
    # but leaves the global NumPy random state untouched.
    rng = np.random.RandomState(seed)

    # Individual-specific fixed effects are drawn first.
    individual_fixed_effects = rng.normal(scale=np.sqrt(var_alpha), size=num_individuals)

    # One block of standard normals laid out as [individual, period, (eta, epsilon)].
    # It is filled in C order, i.e. in the same order as drawing eta and then
    # epsilon inside a loop over individuals and periods.
    shocks = rng.normal(size=(num_individuals, num_periods, 2))
    eta = np.sqrt(var_eta) * shocks[:, :, 0]
    epsilon = np.sqrt(var_epsilon) * shocks[:, :, 1]

    incomes = np.empty((num_individuals, num_periods))
    z_process = np.zeros(num_individuals)
    for t in range(num_periods):
        # AR(1) update applied to every individual at once.
        z_process = rho * z_process + eta[:, t]
        incomes[:, t] = individual_fixed_effects + z_process + epsilon[:, t]

    # Periods are the variables, individuals the observations.
    cov_matrix = np.atleast_2d(np.cov(incomes.T))

    # Select the lower triangle by position; filtering on `!= 0` would drop
    # covariances that are exactly zero and change the length of the vector.
    return cov_matrix[np.tril_indices_from(cov_matrix)].reshape(-1, 1)

In [49]:
def metric_newer(p, data_moments, num_individuals, num_periods, seed, newer_weighting_matrix):
    """Weighted distance between simulated and empirical moments.

    Simulates moments at parameter vector ``p`` with
    ``simulate_income_data_newer`` (the simulator defined for this weighted
    pipeline) and returns the quadratic form ``d.T W d``, where ``d`` is the
    difference between simulated and empirical moments and ``W`` is
    ``newer_weighting_matrix``. For column-vector moments the result is a
    1x1 array.

    Parameters
    ----------
    p : sequence of 4 floats
        Parameter vector passed to the simulator.
    data_moments : numpy.ndarray
        Empirical moments.
    num_individuals, num_periods : int
        Size of the simulated panel.
    seed : int
        Simulation seed.
    newer_weighting_matrix : numpy.ndarray
        Square weighting matrix with one row/column per moment.
    """
    simulated_moments = simulate_income_data_newer(p, num_individuals, num_periods, seed)

    distance = simulated_moments - data_moments

    # d' W d, evaluated left to right.
    weighted = np.matmul(distance.T, newer_weighting_matrix)

    return np.matmul(weighted, distance)

In [50]:
def SMM_newer(p, num_individuals, num_periods, data_moments, seed, newer_weighting_matrix):
    """Estimate the income-process parameters by weighted SMM.

    Minimises ``metric_newer`` over ``(var_alpha, rho, var_eta, var_epsilon)``
    with bounded Nelder-Mead, starting from ``p``.

    Parameters
    ----------
    p : array-like of 4 floats
        Starting values for the optimiser.
    num_individuals, num_periods : int
        Size of the simulated panel, forwarded to ``metric_newer``.
    data_moments : numpy.ndarray
        Empirical moments, forwarded to ``metric_newer``.
    seed : int
        Simulation seed, forwarded to ``metric_newer``.
    newer_weighting_matrix : numpy.ndarray
        Weighting matrix, forwarded to ``metric_newer``.

    Returns
    -------
    numpy.ndarray
        The parameter vector at which the optimiser stopped.

    Warns
    -----
    RuntimeWarning
        If the optimiser reports that it did not converge; the last iterate
        is still returned.
    """
    import warnings

    # Every parameter is restricted to the unit interval.
    bounds = [(0, 1), (0, 1), (0, 1), (0, 1)]

    result = minimize(
        metric_newer,
        p,
        args=(data_moments, num_individuals, num_periods, seed, newer_weighting_matrix),
        method='Nelder-Mead',
        bounds=bounds,
    )

    # Do not hand back an unconverged iterate silently.
    if not result.success:
        warnings.warn(
            f"SMM_newer (seed={seed}): Nelder-Mead did not converge ({result.message}); "
            "returning the last iterate.",
            RuntimeWarning,
            stacklevel=2,
        )

    return result.x

In [53]:
# Single weighted-SMM run with simulation seed 66 (1000 individuals, 19 periods).
coeffs_newer = SMM_newer(
    initial_params_array,
    num_individuals=1000,
    num_periods=19,
    data_moments=data_moments,
    seed=66,
    newer_weighting_matrix=newer_weighting_matrix,
)
print(coeffs_newer)

[0.15684932 0.87026719 0.07139512 0.16982923]


In [60]:
def seeds_loop_newer(s, p, num_inds, num_pds, data, newer_weighting_matrix):
    """Run ``SMM_newer`` once per seed and stack the estimates.

    Parameters
    ----------
    s : sequence of int
        Simulation seeds; one estimation is run for each.
    p : sequence of floats
        Starting parameter vector passed to every run.
    num_inds, num_pds : int
        Size of the simulated panel.
    data : numpy.ndarray
        Empirical moments passed to every run.
    newer_weighting_matrix : numpy.ndarray
        Weighting matrix passed to every run.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(len(s), len(p))``; row ``i`` holds the estimates
        obtained with seed ``s[i]``.
    """
    seeds = np.atleast_1d(np.array(s))

    # Size the output from the inputs instead of assuming 11 seeds and 4
    # parameters, so any number of seeds works.
    vals = np.zeros(shape=(len(seeds), len(p)))

    for row, seed in enumerate(seeds):
        vals[row, :] = SMM_newer(p, num_inds, num_pds, data, seed, newer_weighting_matrix)

    return vals

In [61]:
# One weighted-SMM estimation per seed; each row of the result corresponds to one entry of `seeds`.
# NOTE(review): the optimiser is started from `params` here, whereas the single
# weighted run above starts from `initial_params_array` - confirm this is intended.
vals_newer = seeds_loop_newer(
    seeds,
    params,
    num_inds=1000,
    num_pds=19,
    data=data_moments,
    newer_weighting_matrix=newer_weighting_matrix,
)
print(vals_newer)

[[0.1419799  0.89097232 0.06435482 0.18272868]
 [0.15446919 0.8683401  0.0712595  0.17374009]
 [0.14011806 0.91310592 0.05734853 0.19487085]
 [0.12234794 0.88782858 0.06804198 0.17524335]
 [0.12067007 0.8839199  0.06779949 0.1802715 ]
 [0.1338009  0.87303392 0.06721523 0.17912126]
 [0.1608566  0.86527094 0.07200634 0.17066166]
 [0.12822756 0.86511209 0.07029822 0.17198074]
 [0.1240573  0.90327049 0.05710609 0.19052547]
 [0.15215585 0.90115104 0.06034008 0.19278856]
 [0.15683588 0.87027028 0.07138808 0.16986306]]
