# Anomalies in RU2020 vote

Good map of regions:
https://regnum.ru/uploads/pictures/news/2019/03/11/regnum_picture_15523183991313502_normal.png

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [None]:
# Load the per-polling-station table and derive the two ratios used throughout.
df = pd.read_csv('/kaggle/input/2020-russian-constitutional-referendum/ru_vote_2020.csv')
# Denominators are floored at 1 so that a zero count never causes a division by zero.
df = df.assign(
    turnout=df['given'] / df['nominal'].clip(lower=1),
    support=df['yes'] / df['given'].clip(lower=1),
)
df.head()

# Part 1

In [None]:
# Scatter of support vs. turnout with marginal histograms on the top and right.
limits = [0.2, 0.4]  # lower axis bounds: [turnout, support]
f = plt.figure(figsize=(10,10), facecolor='white')
gs = f.add_gridspec(6, 6)

# Top strip: marginal histogram of turnout.
ax = f.add_subplot(gs[0, :-1])
turnout_counts, turnout_edges = np.histogram(df['turnout'].values, bins=1000, range=(0,1))
# The very last bin is full of 100% values from tiny polling points, so it is left out.
ax.plot(turnout_edges[:-2], turnout_counts[:-1])
ax.axis('off')
ax.set_xlim(limits[0], 1)

# Right strip: marginal histogram of support, drawn sideways (all bins are kept here).
ax = f.add_subplot(gs[1:, -1])
support_counts, support_edges = np.histogram(df['support'].values, bins=1000, range=(0,1))
ax.plot(support_counts, support_edges[:-1])
ax.axis('off')
ax.set_ylim(limits[1], 1)

# Main panel: one faint dot per polling station.
ax = f.add_subplot(gs[1:, :-1])
ax.plot(df['turnout'], df['support'], '.', markersize=1, alpha=0.2)
ax.set_xlabel('Turnout')
ax.set_ylabel('Share of pro-Party votes')
ax.set_xlim(limits[0], 1)
ax.set_ylim(limits[1], 1)

In [None]:
# Lower-case Cyrillic letters and their Latin renderings, matched position by position.
_CYRILLIC_LOWER = 'абвгдеёжзиклмнопрстуфхцчшщэюяьъый'
_LATIN_LOWER = "a b v g d e yo zh z i k l m n o p r s t u f h ts ch sh sch e yu ya ' ' y y".split(' ')

# Built once instead of on every call. Upper-case letters map to the fully
# upper-cased rendering (e.g. 'Ч' -> 'CH').
_TRANSLIT = dict(zip(_CYRILLIC_LOWER, _LATIN_LOWER))
_TRANSLIT.update(zip(_CYRILLIC_LOWER.upper(), (latin.upper() for latin in _LATIN_LOWER)))


def latinize(s):
    """Transliterate Cyrillic text into Latin characters.

    Each Cyrillic letter (either case) is replaced by its Latin rendering;
    soft and hard signs become an apostrophe. Any character that is not in
    the table (digits, spaces, punctuation, Latin letters) is kept as is.

    Args:
        s: String (or any iterable of single-character strings) to convert.

    Returns:
        The transliterated string.
    """
    return ''.join(_TRANSLIT.get(c, c) for c in s)

In [None]:
# Distributions by region: one small normalised turnout histogram per region.
regions = np.unique(df['region'].values)

# Flip to True to order the panels from lowest to highest mean turnout
# instead of alphabetically.
sort_by_turnout = False
if sort_by_turnout:
    mean_turnout = df.groupby('region')['turnout'].mean()
    # Reindex so the means line up with `regions` before sorting positions.
    regions = regions[np.argsort(mean_turnout.reindex(regions).values)]

# Keep the original 10x10 grid, but add rows if there are more than 100 regions
# so that plt.subplot never receives an out-of-range panel number.
n_cols = 10
n_rows = max(10, -(-len(regions) // n_cols))

plt.figure(figsize=(20,22))
for i, region in enumerate(regions):
    counts, edges = np.histogram(df.loc[df['region'] == region, 'turnout'].values,
                                 bins=200, range=(0,1))
    counts = counts[:-1]  # The last bin (values at 100%) is dropped
    plt.subplot(n_rows, n_cols, i+1)
    # Scale to the tallest bin; max(1, ...) avoids 0/0 when all kept bins are empty.
    plt.plot(edges[:-2], counts/max(1, np.max(counts)))
    plt.xticks([])
    plt.yticks([])
    plt.xlim(0.1)
    plt.title(latinize(region))

> Above is the main plot (after Klimek 2012). As both turnout and support were faked, we see a corner-blob emerge from the natural blob in the middle. In the Russian case, it is also striated, as cheaters go for round percent values. Some by-region histograms are shown below (see the peaks at 85%, 90% & 95%).

> Also note the difference between cities that had active anti-Putin opposition plant themselves at voting stations & document everything (Moscow), and cities where it didn't quite happen (St. Petersburg).

In [None]:
# Compare the distribution of one ratio ('support' or 'turnout') between two cities.
regions = ['Москва', 'Санкт-Петербург']

category = 'support'

plt.figure(figsize=(6.5, 6), facecolor='white')
ax = plt.subplot(111)
for region_name in regions:
    df1 = df[df.region == region_name]
    counts, edges = np.histogram(df1[category].values, bins=300, range=(0,1))
    # Normalise to fractions so regions of different size are comparable.
    plt.plot(edges[:-1], counts/np.sum(counts), label=latinize(region_name))

# Axis decoration does not depend on the region, so it is done once after the loop.
ax.set_xticks(np.arange(0,1.1,0.1))
ax.set_xticks(np.arange(0,1,0.05), minor=True)
plt.grid(which='minor', alpha=0.2)
plt.grid(which='major', alpha=0.5)
plt.xlabel('Pro-Party support' if category=='support' else 'Turnout')
plt.ylabel('Frequency')

plt.xlim(0.4, 0.99)
plt.legend(loc='upper left')

> Some other interesting regions include Tatarstan (a semi-independent republic in Central Russia speaking a Turkic language: what's the deal with this super-blot in the middle?), and Chechnya (well, you know this one! :)

In [None]:
# Selected regions only: each panel highlights one region on top of the full cloud.

regions = ['Татарстан', 'Чеченская']

limits = [0.2, 0.4]  # lower axis bounds: [turnout, support]
f = plt.figure(figsize=(12,5.5), facecolor='white')

for panel, region in enumerate(regions, start=1):
    plt.subplot(1, len(regions), panel)
    highlighted = df[df.region == region]
    # Grey background: every polling station in the data set.
    plt.plot(df.turnout, df.support, '.', markersize=1, color='gray', alpha=0.1)
    # Blue foreground: polling stations of the current region.
    plt.plot(highlighted.turnout, highlighted.support, '.', markersize=1, color='blue', alpha=0.5)
    plt.xlabel('Turnout')
    plt.ylabel('Share of pro-Party votes')
    plt.xlim(limits[0], 1)
    plt.ylim(limits[1], 1)
    plt.title(latinize(region))

# Part 2

In [None]:
# An index of how peculiar a count is: 1 when the count is exactly what a
# whole-percent share would produce, 0 otherwise.
def find_funny(df):
    """Flag counts that correspond to a whole-number percentage.

    For a count and its base (``given`` out of ``nominal``, ``yes`` out of
    ``given``) the percentage is rounded up to the next whole percent ``p``;
    the count is flagged when it equals ``floor(p/100 * base)``, i.e. when it
    is the largest count that does not exceed ``p`` percent of the base.

    Integers are multiplied before dividing, so exact cases are not lost to
    floating-point rounding (57 of 100 computed as ``57/100*100`` gives
    56.99999999999999 and would be missed).

    Args:
        df: DataFrame with ``nominal``, ``given`` and ``yes`` columns.
            It is modified in place.

    Returns:
        The same DataFrame, with 0/1 columns ``funny_turnout`` and
        ``funny_support`` added. Rows with a zero base are flagged 0.
    """
    nominal, given, yes = df['nominal'], df['given'], df['yes']

    turnout_pct = np.ceil(given*100/nominal)
    df['funny_turnout'] = (np.floor(turnout_pct*nominal/100)==given)*1

    support_pct = np.ceil(yes*100/given)
    df['funny_support'] = (np.floor(support_pct*given/100)==yes)*1
    return df

# Add the round-percent flags, then show stations with a flagged support count
# and more than 200 in the 'received' column.
df = find_funny(df)
df[(df['funny_support'] == 1) & (df['received'] > 200)]

In [None]:
# Estimate the prevalence of weird numbers: the observed share of round-percent
# counts per region, minus a baseline obtained from slightly noisified data.
dfsum = df.groupby('region').agg({'funny_support': 'mean',
                                  'funny_turnout': 'mean',
                                  'support': 'mean'})
dfsum.columns = ['fun_support', 'fun_turnout', 'support']
dfsum['base_fun_support'] = 0.0
dfsum['base_fun_turnout'] = 0.0

nexp = 100     # Number of noisified replicates averaged into the baseline
sigma = 0.005  # Noisification coefficient
for iexp in range(nexp):
    temp = df.copy()
    # Re-draw the ballots given from a jittered turnout, clipped to [0, nominal].
    temp['given'] = np.round(temp['nominal']*(temp['turnout']+np.random.normal(size=len(df))*sigma))
    temp['given'] = np.maximum(0, np.minimum(temp['nominal'], temp['given'])).astype(int)
    # Re-draw the yes votes from a jittered *support* (not turnout), clipped to [0, given].
    temp['yes'] = np.round(temp['given']*(temp['support']+np.random.normal(size=len(df))*sigma))
    temp['yes'] = np.maximum(0, np.minimum(temp['given'], temp['yes'])).astype(int)
    temp = find_funny(temp)
    tempsum = temp.groupby('region')[['funny_support', 'funny_turnout']].mean()
    # Running mean over replicates; pandas aligns the two frames on the region index.
    dfsum['base_fun_turnout'] = dfsum['base_fun_turnout'] + tempsum['funny_turnout']/nexp
    dfsum['base_fun_support'] = dfsum['base_fun_support'] + tempsum['funny_support']/nexp

plt.figure(figsize=(13,4))
plt.subplot(131)
plt.plot(dfsum['fun_turnout']-dfsum['base_fun_turnout'], dfsum['fun_support']-dfsum['base_fun_support'], '.')
plt.xlabel('Round %% in turnout')
plt.ylabel('Round %% in support')
plt.subplot(132)
plt.plot(dfsum['fun_turnout']-dfsum['base_fun_turnout'], dfsum['support'], '.')
plt.xlabel('Round %% in turnout')
plt.ylabel('Party support')
plt.subplot(133)
plt.plot(dfsum['fun_support']-dfsum['base_fun_support'], dfsum['support'], '.')
plt.xlabel('Round %% in support')
plt.ylabel('Party support')

> I also tried to find the biggest cheaters by measuring the combness of histograms (comparing the frequency of percentages slightly above and slightly below a whole % number), and the prevalence of "lucky" ballot counts that yield a round %. (H0: percentages were noisified with σ=0.005, n=50).

In [None]:
# Average excess of round-percent counts over the noisified baseline,
# taken over turnout and support.
dfsum['weirdness'] = (dfsum.fun_support-dfsum.base_fun_support + dfsum.fun_turnout - dfsum.base_fun_turnout)/2

plt.figure(figsize=(6, 6), facecolor='white')
plt.plot(dfsum.weirdness, dfsum.support, '.')
plt.xlabel('Lucky numbers (both turnout and support) that yield a round %% number')
plt.ylabel('Pro-party support')
label_these = ['Тыва', 'Чеченская', 'Башкортостан', 'Краснодарский', 'Татарстан',
               'Дагестан', 'Крым', 'Москва', 'Липецкая', 'Ставропольский']
plt.vlines(0, 0, 1, alpha=0.4)  # Zero line: no excess over the baseline
plt.ylim(0.4, 1)
# Walk index and values together: dfsum is indexed by region name, so integer
# lookups such as dfsum.weirdness[i] rely on deprecated positional fallback.
for region_name, x_pos, y_pos in zip(dfsum.index, dfsum['weirdness'], dfsum['support']):
    if region_name in label_these:
        plt.text(x_pos, y_pos, latinize(region_name))

# Part 3

In [None]:
# One region of interest: turnout histogram with a fine grid to spot round values.

df1 = df[df.region == 'Краснодарский']
plt.figure(figsize=(12, 6))
ax = plt.subplot(111)
bin_counts, bin_edges = np.histogram(df1['turnout'].values, bins=300, range=(0,1))
ax.plot(bin_edges[:-1], bin_counts)
# Major ticks every 10%, minor ticks every 5%.
ax.set_xticks(np.arange(0,1.1,0.1))
ax.set_xticks(np.arange(0,1,0.05), minor=True)
ax.grid(which='minor', alpha=0.5)
ax.grid(which='major', alpha=1)

In [None]:
# Noisificator from a diff notebook, not yet functional

def noisify(df1, sigma=0.005):
    """Return a copy of the data with counts re-drawn from jittered ratios.

    Gaussian noise is added to each row's turnout and support; the ballot
    counts are recomputed from the noisy ratios, rounded and clipped to valid
    ranges, and the ratios are then recomputed from those counts. This blurs
    any preference for round percentages while keeping each station close to
    its original values.

    Args:
        df1: DataFrame with ``nominal``, ``turnout`` and ``support`` columns.
            It is not modified.
        sigma: Standard deviation of the noise added to each ratio
            (noisification coefficient).

    Returns:
        A new DataFrame with ``given``, ``yes``, ``turnout`` and ``support``
        replaced by their noisified versions.
    """
    temp = df1.copy()
    n_rows = len(temp)

    # Ballots given: jittered turnout times list size, clipped to [0, nominal].
    noisy_turnout = temp['turnout'] + np.random.normal(size=n_rows)*sigma
    temp['given'] = np.round(temp['nominal']*noisy_turnout)
    temp['given'] = np.maximum(0, np.minimum(temp['nominal'], temp['given'])).astype(int)

    # Yes votes: jittered *support* (not turnout) times ballots given, clipped to [0, given].
    noisy_support = temp['support'] + np.random.normal(size=n_rows)*sigma
    temp['yes'] = np.round(temp['given']*noisy_support)
    temp['yes'] = np.maximum(0, np.minimum(temp['given'], temp['yes'])).astype(int)

    # Same zero-safe denominators as used when the ratios are first derived.
    temp['turnout'] = temp.given / np.maximum(1, temp.nominal)
    temp['support'] = temp.yes / np.maximum(1, temp.given)
    return temp

# Turnout histogram of the noisified copy of df1, with the same 300 bins over [0, 1].
plt.figure(figsize=(12, 6))
noisy_counts, noisy_edges = np.histogram(noisify(df1)['turnout'].values, bins=300, range=(0,1))
plt.plot(noisy_edges[:-1], noisy_counts)

In [None]:
def measure_comb(vals):
    """For values between 0 and 1, measure how comb-like they are.

    The values are binned in 0.5%-wide bins. Between 50% and 100%, the count
    in bins that sit on a whole percent is divided by the count in the bins
    in between. A ratio well above 1 means values pile up at whole percents.

    Args:
        vals: Array-like of ratios (e.g. turnout or support).

    Returns:
        Whole-percent count divided by in-between count. The denominator is
        floored at 1, so the result stays finite when no value falls in
        between (and is 0 for empty input).
    """
    # This small offset of 0.002 is to account for rounding errors around
    # 1%-level values: each whole percent m lands inside [m-0.2%, m+0.3%).
    edges = np.arange(0, 1.005, 0.005)-0.002
    # No `range` argument: numpy ignores it when explicit bin edges are given.
    counts = np.histogram(vals, bins=edges)[0]
    on_whole_percent = np.sum(counts[np.arange(50,100)*2])
    in_between = np.sum(counts[np.arange(50,100)*2 + 1])
    return on_whole_percent/max(1, in_between)

In [None]:
# Per-region combness of turnout and support, each expressed relative to the
# mean combness of noisified copies of the same region.
ds = df.groupby('region').agg({'turnout': np.mean, 'support': np.mean})
ds.columns = ['turnout', 'support']

n_replicates = 50  # Noisified copies averaged for each baseline
regions = np.unique(df.region)
for region in regions:
    temp = df[df.region==region]
    turnout_baseline = np.mean([measure_comb(noisify(temp).turnout) for _ in range(n_replicates)])
    ds.loc[region, 'turn_comb'] = measure_comb(temp.turnout) / turnout_baseline
    support_baseline = np.mean([measure_comb(noisify(temp).support) for _ in range(n_replicates)])
    ds.loc[region, 'supp_comb'] = measure_comb(temp.support) / support_baseline
    print('.', end='')  # Progress: one dot per finished region

In [None]:
# Turnout combness vs. support combness, with every region labelled.
plt.figure(figsize=(10, 10))
plt.plot(ds.turn_comb, ds.supp_comb, '.')
plt.grid(alpha = 0.5)
for region_name, x_pos, y_pos in zip(ds.index.tolist(), ds.turn_comb.values, ds.supp_comb.values):
    plt.text(x_pos, y_pos, latinize(region_name))
plt.xlabel('Turnout combness')
plt.ylabel('Support combness')

# Part 4

In [None]:
def digihist(s):
    """Count how often each decimal digit occurs in a string.

    Args:
        s: String to scan; characters that are not digits are skipped.

    Returns:
        Dict with the keys 0..9 (in that order) mapped to their counts.
    """
    tally = dict.fromkeys(range(10), 0)
    for ch in s:
        if ch not in '1234567890':
            continue
        tally[int(ch)] += 1
    return tally
    
def plot_dict(h):
    """Plot the values of a dict as fractions of their total.

    Values are divided by the sum over all entries; the first entry is then
    left out of the plot (for a digit histogram that is the digit 0).

    Args:
        h: Dict mapping x positions to counts, in plotting order.
    """
    keys, counts = zip(*h.items())
    shares = np.array(counts)/np.sum(counts)
    plt.plot(keys[1:], shares[1:], '.-')
    
# Digit frequencies (all digits of every number) for three count columns,
# drawn together with the log10(1 + 1/d) reference curve.
plt.figure(figsize=(8,6))
for column in ('nominal', 'given', 'yes'):
    all_digits = ''.join(map(str, df[column].values))
    plot_dict(digihist(all_digits))
reference_digits = np.arange(1,10)
plt.plot(reference_digits, np.log(1+1/reference_digits)/np.log(10))
plt.xlabel('Digit')
plt.ylabel('Frequency')
plt.legend(labels=['Voting point size', 'Turnout', 'Support', 'Ideal'])

# Part 5

> Not surprisingly, the higher the Putin-support, the closer the percentages are to a nice round %. (Or rather: the closer they are to the roundest % one can get by dividing two integers, for a given voter list. Say, for 1111 voters, you can't get exactly 95%, but you can get 1055/1111 ≈ 0.9496.)

In [None]:
ind = (df.nominal > 1000) # Only large polling stations
# Filter first, then copy: only the selected rows are duplicated, and df1 is an
# independent frame that can safely receive a new column.
df1 = df[ind].copy()
#df1 = noisify(df)[ind]  # Alternative input: the noisified data, for comparison

# Number of ballots that would correspond to the nearest whole-percent turnout.
df1['closest_achievable_turnout'] = np.round(df1.given/df1.nominal*100)/100*df1.nominal
plt.figure(figsize=(12,5.5), facecolor='white')
plt.subplot(121)
# Left: signed distance (in ballots) from that round value, against support.
plt.plot(df1.given-df1.closest_achievable_turnout, df1.support, '.', markersize=1, alpha=0.1)
plt.ylim(0.4, 1)
plt.xlim(-15, 15)
plt.xlabel('Turnout: distance to nearest round %')
plt.ylabel('Pro-party vote')
plt.subplot(122)
# Right: the same distance relative to the ballots given, as a normalised histogram.
counts, edges = np.histogram((df1.given-df1.closest_achievable_turnout)/df1.given*100, bins=100)
plt.plot(edges[:-1], counts/np.sum(counts))
plt.grid()
plt.xlabel('Turnout: Distance (in %) to nearest round %')
plt.ylabel('Frequency')

In [None]:
def measure_comb(vals):
    """For values between 0 and 1, measure how comb-like they are.

    Values are binned in 0.5%-wide bins; between 50% and 100%, the count in
    bins sitting on a whole percent is divided by the count in the bins in
    between. The denominator is floored at 1 so the ratio is always finite.
    """
    # This small offset of 0.002 is to account for rounding errors around 1%-level values
    edges = np.arange(0, 1.005, 0.005)-0.002
    counts = np.histogram(vals, bins=edges, range=(0,1))[0]
    on_whole_percent = counts[100:200:2].sum()  # even bins 100, 102, ..., 198
    in_between = counts[101:200:2].sum()        # odd bins 101, 103, ..., 199
    return on_whole_percent/max(1, in_between)

# Combness on a 10x10 grid: station-size decile vs. decile of yes votes per listed voter.
df['size_percentile'] = np.argsort(np.argsort(df.nominal))/len(df)  # rank of list size, scaled to [0, 1)
df['yes_turnout'] = df.yes / np.maximum(1, df.nominal)

combiness = np.zeros((10,10))  # cells without any stations keep the value 0
for i in range(10):
    # Bins are open on the left, closed on the right: (i/10, (i+1)/10]
    in_size_bin = (df.size_percentile>i/10) & (df.size_percentile<=(i+1)/10)
    for j in range(10):
        in_yes_bin = (df.yes_turnout>j/10) & (df.yes_turnout<=(j+1)/10)
        cell = df[in_size_bin & in_yes_bin]
        if len(cell) >= 1:
            combiness[i,j] = (measure_comb(cell.support)+measure_comb(cell.turnout))/2

#print((combiness*10).astype(int))
plt.imshow(np.rot90(combiness))
plt.colorbar()
plt.xlabel('Size percentile')
plt.ylabel('Pro-party turnout')
plt.title('Combness')

> Author: https://github.com/khakhalin/Sketches/tree/master/ru_vote_2020