In [22]:
import pandas as pd
import plotly.express as px
import data_preprocessing.data_preprocess as dp
import numpy as np
from scipy.stats import gaussian_kde
import matplotlib.pyplot as plt
from sklearn.mixture import GaussianMixture
from scipy.spatial.distance import cdist
from scipy.stats import multivariate_normal
from scipy.spatial.distance import cdist

In [23]:
policy_vars = [
    "Military: Positive",
    "European Community/Union: Positive",
    "Freedom and Human Rights",
    "Democracy",
    "Political Corruption",
    "Environmental Protection",
    "Welfare State",
    "Right-left position",
    "Planned Economy",
    "Equality: Positive",
    "Opposition to Immigration"]

In [24]:
# Creating dataset on certain variables
party_scaled, voter_scaled = dp.get_scaled_party_voter_data(x_var='Opposition to Immigration', y_var='Welfare State',year=2021)


Columns (10) have mixed types. Specify dtype option on import or set low_memory=False.


errors='ignore' is deprecated and will raise in a future version. Use to_numeric without passing `errors` and catch exceptions explicitly instead


DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()`


Requested year 2021 not found in party data; falling back to most recent year '2021'.



In [25]:
party_scaled

Unnamed: 0,Country,Date,Calendar_Week,Party_Name,Opposition to Immigration,Welfare State,Opposition to Immigration Scaled,Welfare State Scaled,Opposition to Immigration Voters_Mean,Welfare State Voters_Mean,Opposition to Immigration Combined,Welfare State Combined,Label
0,Germany,26/09/2021,202109,90/Greens,0.076,19.342,-0.501716,0.150875,-0.728266,0.617838,-0.682956,0.524446,90/Greens
1,Germany,26/09/2021,202109,LINKE,0.021,28.427,-0.534461,1.429001,-0.592097,0.817171,-0.58057,0.939537,LINKE
2,Germany,26/09/2021,202109,SPD,0.0,22.414,-0.546963,0.58306,0.072262,0.027329,-0.051583,0.138475,SPD
3,Germany,26/09/2021,202109,FDP,0.222,14.222,-0.414794,-0.569434,0.244118,-0.618355,0.112336,-0.608571,FDP
4,Germany,26/09/2021,202109,CDU/CSU,1.009,10.703,0.053752,-1.064505,0.187892,-0.307672,0.161064,-0.459039,CDU/CSU
5,Germany,26/09/2021,202109,AfD,4.956,7.687,2.403628,-1.488812,1.158858,-0.700003,1.407812,-0.857765,AfD


In [26]:
voter_scaled

Unnamed: 0,Opposition to Immigration,Welfare State,who did you vote for:second vote,year of birth,"do you incline towards a party, if so which one",how strongly do you incline towards this party,Party_Name,party_choice,Opposition to Immigration Scaled,Welfare State Scaled,Label
0,9.0,1.0,4.0,1996,4.0,3.0,SPD,0,0.991786,-2.115339,Voter
1,6.0,6.0,5.0,2001,5.0,2.0,FDP,1,-0.107476,0.066448,Voter
2,3.0,8.0,6.0,1991,6.0,3.0,90/Greens,2,-1.206738,0.939163,Voter
3,4.0,9.0,4.0,1983,4.0,2.0,SPD,0,-0.840317,1.375520,Voter
4,8.0,5.0,1.0,1964,5.0,3.0,CDU/CSU,3,0.625365,-0.369909,Voter
...,...,...,...,...,...,...,...,...,...,...,...
3285,9.0,5.0,5.0,1958,5.0,3.0,FDP,1,0.991786,-0.369909,Voter
3286,11.0,6.0,4.0,1940,5.0,2.0,SPD,0,1.724627,0.066448,Voter
3287,7.0,5.0,5.0,1944,1.0,2.0,FDP,1,0.258945,-0.369909,Voter
3288,11.0,2.0,4.0,1949,4.0,3.0,SPD,0,1.724627,-1.678981,Voter


In [27]:
import plotly.io as pio
# pio.renderers.default = 'iframe'

fig = px.scatter(
    pd.concat([
        voter_scaled.assign(Type="Voter", Size=5, Color="Voter"),
        party_scaled.assign(Type="Party", Size=15, Color=party_scaled["Party_Name"])
    ]),
    x="Opposition to Immigration",
    y="Welfare State",
    color="Color",
    symbol="Type",
    size="Size",
    title="Unscaled Voter and Party Positions"
)
fig.update_traces(textposition="top center")
fig.show()

In [28]:
concatenated_df = pd.concat([voter_scaled, party_scaled], ignore_index=True)

fig = px.scatter(
    concatenated_df,
    x='Opposition to Immigration Scaled',
    y='Welfare State Scaled',
    color='Label',
    symbol='Label')
fig.update_traces(marker=dict(size=10))
fig.update_layout(title='Scaled Voter and Party Positions')
fig.show()

In [29]:
from scipy.stats import gaussian_kde
import numpy as np

x_var = "Opposition to Immigration"
y_var = "Welfare State"

x = voter_scaled[f"{x_var} Scaled"].values
y = voter_scaled[f"{y_var} Scaled"].values

data = np.vstack([x, y])

kde = gaussian_kde(data, bw_method='scott')

density_at_5_5 = kde([5, 5])


In [30]:
def voter_density(x_input, y_input):

    xy = np.vstack([np.ravel(x_input), np.ravel(y_input)])
    density_vals = kde(xy)
    return density_vals.reshape(np.shape(x_input))

In [31]:
voter_density(5, 5)

X, Y = np.meshgrid(np.linspace(0, 10, 100), np.linspace(0, 10, 100))
Z = voter_density(X, Y)

In [32]:
from sklearn.mixture import GaussianMixture
import numpy as np

X = voter_scaled[[f"{x_var} Scaled", f"{y_var} Scaled"]].values

# GMM
gmm = GaussianMixture(n_components=3, covariance_type='full', random_state=0)
gmm.fit(X)

In [33]:
from scipy.stats import multivariate_normal

def gmm_density(x_input, y_input):

    x_flat = np.ravel(x_input)
    y_flat = np.ravel(y_input)
    points = np.column_stack([x_flat, y_flat])
    
    density_vals = np.zeros(len(points))
    for weight, mean, cov in zip(gmm.weights_, gmm.means_, gmm.covariances_):
        rv = multivariate_normal(mean=mean, cov=cov)
        density_vals += weight * rv.pdf(points)
    
    return density_vals.reshape(np.shape(x_input))

In [34]:
Xgrid, Ygrid = np.meshgrid(np.linspace(0, 10, 100), np.linspace(0, 10, 100))
Z = gmm_density(Xgrid, Ygrid)

In [35]:
print("Weights:", gmm.weights_)
print("Means:\n", gmm.means_)
print("Covariances:\n", gmm.covariances_)

Weights: [0.52664387 0.20426783 0.2690883 ]
Means:
 [[-0.2274056  -0.14139101]
 [-1.03038878  1.04282053]
 [ 1.22724415 -0.51489408]]
Covariances:
 [[[ 0.40336256  0.04266264]
  [ 0.04266264  0.52845943]]

 [[ 0.37798564 -0.08458569]
  [-0.08458569  0.54685684]]

 [[ 0.22659824 -0.06396446]
  [-0.06396446  1.13710551]]]


In [36]:
from scipy.stats import multivariate_normal
import numpy as np

def gmm_indefinite_integral(x, y):
    total_cdf = 0
    point = np.array([x, y])
    for w, mu, cov in zip(gmm.weights_, gmm.means_, gmm.covariances_):
        total_cdf += w * multivariate_normal.cdf(point, mean=mu, cov=cov)
    return total_cdf

In [37]:
from scipy.stats import multivariate_normal

def gmm_density_and_loggrad(x_input, y_input, gmm):
    x_flat = np.ravel(x_input)
    y_flat = np.ravel(y_input)
    points = np.column_stack([x_flat, y_flat])
    N = len(points)

    density_vals = np.zeros(N)
    grad = np.zeros_like(points)

    for weight, mean, cov in zip(gmm.weights_, gmm.means_, gmm.covariances_):
        rv = multivariate_normal(mean=mean, cov=cov, allow_singular=True)
        pdf_vals = rv.pdf(points)
        diff = points - mean
        inv_cov = np.linalg.pinv(cov) 
        grad_comp = -pdf_vals[:, None] * (diff @ inv_cov.T)

        density_vals += weight * pdf_vals
        grad += weight * grad_comp

    eps = 1e-9
    grad_log_density = grad / (density_vals[:, None] + eps)

    return grad_log_density

In [38]:
def reflect(val, low, high):
    range_size = high - low
    val_shifted = (val - low) % (2 * range_size)
    reflected = np.where(val_shifted < range_size, val_shifted, 2 * range_size - val_shifted)
    return reflected + low

In [39]:
import numpy as np
from scipy.spatial.distance import cdist
from sklearn.mixture import GaussianMixture

def run_simulation(data, T, sigma_noise, gmm_components, alpha, beta, gamma, random_seed=42):
    
    np.random.seed(random_seed)
    
    D, N = data.shape
    history = [data.copy()]

    for t in range(T):
        X_t = history[-1]

        X_t_noisy = X_t.T + np.random.normal(scale=1e-6, size=(N, D))

        gmm = GaussianMixture(n_components=gmm_components, covariance_type='full', reg_covar=1e-2)
        gmm.fit(X_t_noisy)

        distances = cdist(X_t_noisy, X_t_noisy, metric='euclidean')
        W = np.exp(-distances ** 2)
        W /= W.sum(axis=1, keepdims=True)

        weighted_sum = W @ X_t_noisy 
        F_x = gmm_density_and_loggrad(X_t[0, :], X_t[1, :], gmm) 

        noise = np.random.normal(0, sigma_noise, size=(N, D))

        X_next = alpha * weighted_sum - beta * F_x + gamma * noise

        X_next = np.clip(X_next, -3, 3)

        for dim in range(D):
            mask_low = X_next[:, dim] <= -3
            X_next[mask_low, dim] = -3 + (-3 - X_next[mask_low, dim])
            mask_high = X_next[:, dim] >= 3
            X_next[mask_high, dim] = 3 - (X_next[mask_high, dim] - 3)

        history.append(X_next.T)

    final_positions = history[-1]
    return final_positions


In [40]:
import numpy as np
import plotly.express as px

def plot_with_simulation_separate(concatenated_df, simulation_points):

    print("Data ranges and checks:")
    print("Opposition to Immigration Scaled min/max:", concatenated_df['Opposition to Immigration Scaled'].min(), concatenated_df['Opposition to Immigration Scaled'].max())
    print("Welfare State Scaled min/max:", concatenated_df['Welfare State Scaled'].min(), concatenated_df['Welfare State Scaled'].max())
    
    sim_x = np.array(simulation_points[0])
    sim_y = np.array(simulation_points[1])
    
    print("Simulation X min/max:", np.min(sim_x), np.max(sim_x))
    print("Simulation Y min/max:", np.min(sim_y), np.max(sim_y))
    
    print("Any NaNs or infs in simulation X?", np.isnan(sim_x).any(), np.isinf(sim_x).any())
    print("Any NaNs or infs in simulation Y?", np.isnan(sim_y).any(), np.isinf(sim_y).any())
    
    def clip_data(arr, min_val=-1e3, max_val=1e3):
        arr = np.clip(arr, min_val, max_val)
        return arr
    
    sim_x = clip_data(sim_x)
    sim_y = clip_data(sim_y)
    
    fig = px.scatter(
        concatenated_df,
        x='Opposition to Immigration Scaled',
        y='Welfare State Scaled',
        color='Label',
        symbol='Label'
    )
    
    fig.add_scatter(
        x=sim_x,
        y=sim_y,
        mode='markers',
        marker=dict(
            color='rgba(0,0,0,0.2)',
            size=4,
            symbol='circle'
        ),
        name='Simulation Points'
    )
    
    xmin = min(concatenated_df['Opposition to Immigration Scaled'].min(), np.min(sim_x))
    xmax = max(concatenated_df['Opposition to Immigration Scaled'].max(), np.max(sim_x))
    ymin = min(concatenated_df['Welfare State Scaled'].min(), np.min(sim_y))
    ymax = max(concatenated_df['Welfare State Scaled'].max(), np.max(sim_y))
    
    padding_x = (xmax - xmin) * 0.1
    padding_y = (ymax - ymin) * 0.1
    
    fig.update_layout(
        title='Scaled Positions with Simulation Overlay',
        xaxis=dict(range=[xmin - padding_x, xmax + padding_x]),
        yaxis=dict(range=[ymin - padding_y, ymax + padding_y]),
    )
    
    return fig


In [41]:
N = x.__len__()             
D = 2                
T = 500               
sigma_noise = 0.1
gmm_components = 3

In [42]:
import plotly.io as pio
# pio.renderers.default = 'iframe'

for i in range(10):
    sim=run_simulation(data,i,sigma_noise,gmm_components,0.01,1,0.1, 42)
    fig = plot_with_simulation_separate(concatenated_df,sim)
    fig.show()

Data ranges and checks:
Opposition to Immigration Scaled min/max: -1.9395789200332214 2.403628361755519
Welfare State Scaled min/max: -2.115338518534899 2.2482352014348908
Simulation X min/max: -1.9395789200332214 1.7246270213444983
Simulation Y min/max: -2.115338518534899 2.2482352014348908
Any NaNs or infs in simulation X? False False
Any NaNs or infs in simulation Y? False False


Data ranges and checks:
Opposition to Immigration Scaled min/max: -1.9395789200332214 2.403628361755519
Welfare State Scaled min/max: -2.115338518534899 2.2482352014348908
Simulation X min/max: -3.0 2.8131577583348366
Simulation Y min/max: -3.0 3.0
Any NaNs or infs in simulation X? False False
Any NaNs or infs in simulation Y? False False


Data ranges and checks:
Opposition to Immigration Scaled min/max: -1.9395789200332214 2.403628361755519
Welfare State Scaled min/max: -2.115338518534899 2.2482352014348908
Simulation X min/max: -3.0 3.0
Simulation Y min/max: -3.0 3.0
Any NaNs or infs in simulation X? False False
Any NaNs or infs in simulation Y? False False


Data ranges and checks:
Opposition to Immigration Scaled min/max: -1.9395789200332214 2.403628361755519
Welfare State Scaled min/max: -2.115338518534899 2.2482352014348908
Simulation X min/max: -3.0 3.0
Simulation Y min/max: -3.0 3.0
Any NaNs or infs in simulation X? False False
Any NaNs or infs in simulation Y? False False


Data ranges and checks:
Opposition to Immigration Scaled min/max: -1.9395789200332214 2.403628361755519
Welfare State Scaled min/max: -2.115338518534899 2.2482352014348908
Simulation X min/max: -3.0 3.0
Simulation Y min/max: -2.7471221960670404 3.0
Any NaNs or infs in simulation X? False False
Any NaNs or infs in simulation Y? False False


Data ranges and checks:
Opposition to Immigration Scaled min/max: -1.9395789200332214 2.403628361755519
Welfare State Scaled min/max: -2.115338518534899 2.2482352014348908
Simulation X min/max: -3.0 3.0
Simulation Y min/max: -3.0 3.0
Any NaNs or infs in simulation X? False False
Any NaNs or infs in simulation Y? False False


Data ranges and checks:
Opposition to Immigration Scaled min/max: -1.9395789200332214 2.403628361755519
Welfare State Scaled min/max: -2.115338518534899 2.2482352014348908
Simulation X min/max: -2.6927828185242237 2.685812100582024
Simulation Y min/max: -2.0867908960698234 2.7369471886262215
Any NaNs or infs in simulation X? False False
Any NaNs or infs in simulation Y? False False


Data ranges and checks:
Opposition to Immigration Scaled min/max: -1.9395789200332214 2.403628361755519
Welfare State Scaled min/max: -2.115338518534899 2.2482352014348908
Simulation X min/max: -3.0 3.0
Simulation Y min/max: -3.0 3.0
Any NaNs or infs in simulation X? False False
Any NaNs or infs in simulation Y? False False


Data ranges and checks:
Opposition to Immigration Scaled min/max: -1.9395789200332214 2.403628361755519
Welfare State Scaled min/max: -2.115338518534899 2.2482352014348908
Simulation X min/max: -3.0 3.0
Simulation Y min/max: -2.6492035237120706 2.7661323111004963
Any NaNs or infs in simulation X? False False
Any NaNs or infs in simulation Y? False False


Data ranges and checks:
Opposition to Immigration Scaled min/max: -1.9395789200332214 2.403628361755519
Welfare State Scaled min/max: -2.115338518534899 2.2482352014348908
Simulation X min/max: -3.0 3.0
Simulation Y min/max: -3.0 3.0
Any NaNs or infs in simulation X? False False
Any NaNs or infs in simulation Y? False False


In [43]:
years = ["2009", "2013", "2017", "2021"]
voter_data_by_year = {}

for year in years:
    party_scaled, voter_scaled = dp.get_scaled_party_voter_data(
        x_var='Opposition to Immigration',
        y_var='Welfare State',
        year=year
    )

    voter_coords = voter_scaled[['Opposition to Immigration Scaled', 'Welfare State Scaled']].to_numpy().T
    
    voter_data_by_year[year] = voter_coords

sorted_years = sorted(voter_data_by_year.keys())
yearly_voter_data = [voter_data_by_year[year] for year in sorted_years]


Columns (10) have mixed types. Specify dtype option on import or set low_memory=False.


errors='ignore' is deprecated and will raise in a future version. Use to_numeric without passing `errors` and catch exceptions explicitly instead


DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()`



KeyboardInterrupt: 

In [None]:
from scipy.optimize import minimize
from scipy.spatial import cKDTree
from scipy.stats import wasserstein_distance

def avg_nearest_neighbor(X1, X2):
    tree = cKDTree(X2)
    dists, _ = tree.query(X1, k=1)
    return np.mean(dists)

def print_progress(xk):
    print(f"🔄 Current Params — Alpha: {xk[0]:.4f}, Beta: {xk[1]:.4f}, Gamma: {xk[2]:.4f}")

def objective(params, yearly_data, T_guess, sigma_noise, gmm_components, use_wasserstein):
    alpha, beta, gamma = params
    print(f"🔍 Evaluating Objective: alpha={alpha:.4f}, beta={beta:.4f}, gamma={gamma:.4f}")
    total_divergence = 0.0

    for i in range(len(yearly_data) - 1):
        X_start = yearly_data[i]
        X_real = yearly_data[i + 1]

        X_sim = run_simulation(
            data=X_start,
            T=T_guess,
            sigma_noise=sigma_noise,
            gmm_components=gmm_components,
            alpha=alpha,
            beta=beta,
            gamma=gamma,
            random_seed=42
        )

        if use_wasserstein:

            wd_x = wasserstein_distance(X_real[0], X_sim[0])
            wd_y = wasserstein_distance(X_real[1], X_sim[1])
            divergence = wd_x + wd_y
        else:

            gmm_sim = GaussianMixture(n_components=gmm_components, covariance_type='full', reg_covar=1e-2).fit(X_sim.T)
            log_likelihood = gmm_sim.score(X_real.T)
            divergence = -log_likelihood

        total_divergence += divergence

    print(f"🔍 Evaluating Objective: alpha={alpha:.4f}, beta={beta:.4f}, gamma={gamma:.4f} | 🧮 Divergence: {total_divergence:.4f}")
    return total_divergence


D, N = yearly_voter_data[0].shape
T_guess = 50
sigma_noise = 0.1
gmm_components = 3

initial_params = [0, 0, 0]

use_wasserstein = True

result = minimize(
    objective,
    initial_params,
    args=(yearly_voter_data, T_guess, sigma_noise, gmm_components, use_wasserstein),
    method='L-BFGS-B',
    bounds=[(0.0, 10.0), (0.0, 10.0), (0.0, 2.0)],
    callback=print_progress,
    options={'disp': True, 'maxiter': 200}
)

alpha_fit, beta_fit, gamma_fit = result.x
print("\n✅ Fitted Parameters:")
print(f"  Alpha: {alpha_fit:.4f}")
print(f"  Beta:  {beta_fit:.4f}")
print(f"  Gamma: {gamma_fit:.4f}")

🔍 Evaluating Objective: alpha=0.0000, beta=0.0000, gamma=0.0000
🔍 Evaluating Objective: alpha=0.0000, beta=0.0000, gamma=0.0000 | 🧮 Divergence: 4.7325
🔍 Evaluating Objective: alpha=0.0000, beta=0.0000, gamma=0.0000


In [None]:
import plotly.io as pio
pio.renderers.default = 'iframe'

sim=run_simulation(data,10,sigma_noise,gmm_components,-0.0842,-0.6219,2, 42)
fig = plot_with_simulation_separate(concatenated_df,sim)
fig.show()

Data ranges and checks:
Opposition to Immigration Scaled min/max: -1.9395789200332214 2.403628361755519
Welfare State Scaled min/max: -2.115338518534899 2.2482352014348908
Simulation X min/max: -3.0 3.0
Simulation Y min/max: -3.0 2.154018924782811
Any NaNs or infs in simulation X? False False
Any NaNs or infs in simulation Y? False False
