In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import r2_score
import statsmodels.api as sm

In [77]:
df = pd.read_csv('mep_votes_json_final.csv', header=1)
df.head()

Unnamed: 0,mep_id,Position,Party,vote_id,Time_stamp,Procedure_type
0,256810,DID_NOT_VOTE,EPP,176731,2025-05-22T11:38:51,INI
1,257043,FOR,EPP,176731,2025-05-22T11:38:51,INI
2,197490,FOR,EPP,176731,2025-05-22T11:38:51,INI
3,256820,FOR,EPP,176731,2025-05-22T11:38:51,INI
4,256987,AGAINST,Renew,176731,2025-05-22T11:38:51,INI


In [78]:
filtered_df = df[['mep_id', 'Position', 'Party', 'Time_stamp', 'vote_id', 'Procedure_type']]
filtered_df.rename(columns={'Position': 'position', 'Party' : 'party', 'Time_stamp' : 'timestamp', 'Procedure_type' : 'procedure_type'}, inplace=True)
filtered_df.head()

Unnamed: 0,mep_id,position,party,timestamp,vote_id,procedure_type
0,256810,DID_NOT_VOTE,EPP,2025-05-22T11:38:51,176731,INI
1,257043,FOR,EPP,2025-05-22T11:38:51,176731,INI
2,197490,FOR,EPP,2025-05-22T11:38:51,176731,INI
3,256820,FOR,EPP,2025-05-22T11:38:51,176731,INI
4,256987,AGAINST,Renew,2025-05-22T11:38:51,176731,INI


### We are trying to make another dataframe where each row represents a party's aggregate stats on a particular vote id.

We will have the columns: party, vote_id, timestamp, num_for, num_against, num_abstention, num_no_votes, total_votes (does not include no votes), majority_col (string: for, against, or abstain (or no_votes)), majority_votes (number of votes for that party that AGREE w the majority), percent_dissent (percent of for, against, and abstain votes that agree with the MAJORITY)

We have to drop rows where the majority_col is no_votes before calculating percent dissenters

In [79]:
# filter only relevant vote positions
valid_positions = ['FOR', 'AGAINST', 'ABSTENTION', 'DID_NOT_VOTE']
filtered_df = filtered_df[filtered_df['position'].isin(valid_positions)]

# group and count positions by party and vote
vote_counts = (filtered_df.groupby(['vote_id', 'timestamp', 'party', 'procedure_type', 'position']).size().unstack(fill_value=0).reset_index())

# rename vote columns for clarity
vote_counts = vote_counts.rename(columns={'FOR': 'num_for', 'AGAINST': 'num_against', 
                                          'ABSTENTION': 'num_abstention', 'DID_NOT_VOTE': 'num_no_votes'})

# ensure all columns exist
for col in ['num_for', 'num_against', 'num_abstention', 'num_no_votes']:
    if col not in vote_counts.columns:
        vote_counts[col] = 0

# compute total votes (excluding no votes)
vote_counts['total_votes'] = (vote_counts['num_for'] + vote_counts['num_against'] + vote_counts['num_abstention'])

# get relevant vote cols
vote_subset = vote_counts[['num_for', 'num_against', 'num_abstention']]

# get the column with the highest value per row
vote_counts['majority_col'] = vote_subset.idxmax(axis=1).str.replace('num_', '').str.upper()

# get the max value of votes
vote_counts['majority_votes'] = vote_subset.max(axis=1)

# merge majority info back
#df_final = vote_counts.merge(majority_col, on=['vote_id', 'timestamp'])
df_final = vote_counts

# drop rows where majority_col == 'DID_NOT_VOTE'
df_final = df_final[df_final['majority_col'] != 'DID_NOT_VOTE']

# compute majority_votes for this party
def get_majority_votes(row):
    return row[f'num_{row["majority_col"].lower()}']

df_final['majority_votes'] = df_final.apply(get_majority_votes, axis=1)

# compute percent dissenters
df_final['percent_dissent'] = 1 - (df_final['majority_votes'] / df_final['total_votes'])

# optional final ordering
df_final = df_final[[
    'party', 'vote_id', 'procedure_type', 'timestamp', 'num_for', 'num_against', 'num_abstention', 'num_no_votes',
    'total_votes', 'majority_col', 'majority_votes', 'percent_dissent']]

df_final

position,party,vote_id,procedure_type,timestamp,num_for,num_against,num_abstention,num_no_votes,total_votes,majority_col,majority_votes,percent_dissent
0,ECR,135091,RSO,2021-07-08T15:53:14.480000,62,0,0,1,62,FOR,62,0.000000
1,EPP,135091,RSO,2021-07-08T15:53:14.480000,173,0,0,5,173,FOR,173,0.000000
2,GUE/NGL,135091,RSO,2021-07-08T15:53:14.480000,39,0,0,0,39,FOR,39,0.000000
3,Greens/EFA,135091,RSO,2021-07-08T15:53:14.480000,71,0,0,2,71,FOR,71,0.000000
4,ID,135091,RSO,2021-07-08T15:53:14.480000,67,3,0,1,70,FOR,67,0.042857
...,...,...,...,...,...,...,...,...,...,...,...,...
11485,Greens/EFA,176873,COD,2025-05-22T11:09:51,45,0,0,8,45,FOR,45,0.000000
11486,Non-attached,176873,COD,2025-05-22T11:09:51,18,3,2,6,23,FOR,18,0.217391
11487,Patriots for Europe,176873,COD,2025-05-22T11:09:51,58,6,6,15,70,FOR,58,0.171429
11488,Renew,176873,COD,2025-05-22T11:09:51,64,0,0,11,64,FOR,64,0.000000


In [None]:
# sort by party (primary) then timestamp within that?

# do i filter out rows with 0% dissent? or change them to 0.001 or smth? bc i don't know if that affects the func

In [81]:
set(df_final['party'])

{'ECR',
 'EPP',
 'ESN',
 'GUE/NGL',
 'Greens/EFA',
 'ID',
 'Non-attached',
 'Patriots for Europe',
 'Renew',
 'S&D'}

In [None]:
"""sns.set_theme(style="darkgrid")
sns.color_palette("Set2")

df_renew = df_final[df_final['party'] == 'Renew']

plt.figure(figsize=(12, 6))
sns.lineplot(data=df_renew, x='timestamp', y='percent_dissent', label='party')

plt.xlabel('Date')
plt.ylabel('Percent Dissenters')
plt.title('Party Cohesion Over Time')
plt.legend()
plt.show()"""

In [83]:
df_final.to_csv('time_series_data_final.csv', index=False) 

In [84]:
type(df_final['timestamp'].iloc[0])

str

In [85]:
df_final['timestamp'] = pd.to_datetime(df_final['timestamp'], format='ISO8601')
type(df_final['timestamp'].iloc[0])

pandas._libs.tslibs.timestamps.Timestamp

In [97]:
df_time = df_final
df_time['year'] = df_time['timestamp'].dt.year
df_time['month'] = df_time['timestamp'].dt.month
df_time['weekday'] = df_time['timestamp'].dt.weekday  # 0 = Monday
df_time['weekofyear'] = df_time['timestamp'].dt.isocalendar().week

In [100]:
X_np = sm.add_constant(pd.get_dummies(df_time[['party', 'procedure_type', 'month', 'weekday']], drop_first=True)).astype(float).values
y_np = (df_time['percent_dissent'])

# w shape must match number of features
w = np.random.randn(X_np.shape[1])


# write up a function that does this for a specified number of iterations
def logistic_regression(X, y, w, alpha=1, max_iter=None):
    runalg = True
    i = 0
    iter = 0
    
    while runalg:
        x = X[i, :]
        yhat = 1 / (1 + np.exp(-np.dot(x, w)))

        w = w - alpha * (yhat - y[i]) * x
        i += 1

        if i == len(y):
            i = 0
            iter += 1

        if max_iter is not None and iter == max_iter:
            runalg = False
            print(f"Algorithm reached max_iter, final w = {w}")
            print(f"Total number of iterations = {iter}")
    
    return w

run = logistic_regression(X_np, y_np, w, alpha=0.007, max_iter=2000)

X_np = sm.add_constant(pd.get_dummies(df_time[['party', 'procedure_type', 'month', 'weekday']], drop_first=True)).astype(float).values
y_np = df_time['percent_dissent']

w = np.random.randn(X_np.shape[1])
w_final = logistic_regression(X_np, y_np, w, alpha=0.007, max_iter=2000)

y_pred = 1 / (1 + np.exp(-np.dot(X_np, w_final)))
y_pred_percent = y_pred * 100
y_pred_percent

Algorithm reached max_iter, final w = [-2.04275899  0.03553143  0.03336404 -1.24287643 -0.7438415  -0.08601317
 -2.05345352  0.50734912  0.80336152 -0.05918224 -1.39672958 -1.95358257
  0.78263244  0.15433497  0.99611821  0.34678079  0.41114779  0.73765169
  0.10247695  0.76446201  0.86786987  0.25717012  0.04695013  0.75210026
 -0.32601003  0.55572063]
Total number of iterations = 2000
Algorithm reached max_iter, final w = [-2.06443168  0.03550863  0.03344011 -1.24287051 -0.74385914 -0.08600516
 -2.05344858  0.5073683   0.8033721  -0.059202   -1.39672331 -1.95357607
  0.80467248  0.17595751  1.01747407  0.36844981  0.43274434  0.75935265
  0.1240881   0.78601986  0.88955137  0.27875116  0.06884284  0.77400434
 -0.30340095  0.57717924]
Total number of iterations = 2000


array([11.72274122,  3.69043824, 10.86159559, ..., 19.57572384,
        6.00557557,  3.53181987])

In [101]:
print(len(y_pred_percent), len(y_np))
resids = y_pred_percent - (y_np * 100)
resids.mean()

11490 11490


3.0013925308001563

In [102]:
r2 = r2_score((y_np * 100), y_pred_percent)
r2

0.33838587773287576

In [103]:
y_pred_percent

array([11.72274122,  3.69043824, 10.86159559, ..., 19.57572384,
        6.00557557,  3.53181987])