In [1]:
# Colab-only setup: mount Google Drive at /content/gdrive so the input files can be read from it.
from google.colab import files
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [2]:
import os 
import pandas as pd 
import numpy as np
import copy
import re
import string
import seaborn as sns
# Folder holding the processed input CSVs. The path is relative, so it only points into the
# Drive mounted at /content/gdrive when the working directory is /content — TODO confirm.
path = 'gdrive/MyDrive/2020_21_2/Temalabor_1/processed_popdata/'

# Function Def

In [54]:
# Function1.: Trimming  all leading and trailing whitespaces, apply str to the whole dfs

def trim_all_columns(df):
    """Return a copy of ``df`` in which every cell is a whitespace-trimmed string.

    String cells lose their leading and trailing whitespace; every other cell
    is converted with ``str()`` (so a missing value becomes the text ``'nan'``).
    The input frame is not modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to clean.

    Returns
    -------
    pandas.DataFrame
        New frame with the same shape and labels, containing only strings.
    """

    def _strip_or_stringify(cell):
        # One pass does both jobs: strip text cells, stringify everything else.
        return cell.strip() if isinstance(cell, str) else str(cell)

    # DataFrame.applymap is deprecated in favour of DataFrame.map on recent
    # pandas. The method is looked up on the class (not on ``df``) so that a
    # column that happens to be called "map" cannot shadow it on older versions.
    elementwise = getattr(pd.DataFrame, "map", None) or pd.DataFrame.applymap
    return elementwise(df, _strip_or_stringify)


In [55]:
def clustermaker(num, df, clusterCutoffValsLst):
  """Split ``df`` into consecutive clusters by upper cut-off of its ``value`` column.

  The cut-offs are sorted ascending. The first cluster holds the rows with
  ``value <= cutoff[0]``; each following cluster holds the rows not yet
  assigned whose ``value`` is ``<=`` the next cut-off.

  Parameters
  ----------
  num : int
      Expected number of cut-off values; must equal ``len(clusterCutoffValsLst)``.
  df : pandas.DataFrame
      Frame with a numeric ``value`` column. It is not modified.
  clusterCutoffValsLst : list
      Upper cut-off values. The last element (as passed in, before sorting)
      must equal the largest ``value`` in ``df``.

  Returns
  -------
  (dict, numpy.ndarray) or str
      On success: a dict mapping each cut-off (as a string key) to its cluster
      frame, plus the sorted cut-offs as an array of strings. The first cluster
      keeps the input's index labels; later clusters carry a fresh 0-based index.
      If either validation fails, the error message is printed and returned
      as a plain string instead of the tuple.
  """

  if num != len(clusterCutoffValsLst):
    message = 'Please provide the right number of cluster cutotoff values as your first argument'
    print(message)
    return message

  # NOTE: this check deliberately looks at the last element as given, i.e. before sorting.
  if clusterCutoffValsLst[-1] != max(df.value):
    message = "Your election year table's maximum population is not inline with the max cluster cutoff values that you added"
    print(message)
    return message

  sortedCutoffs = sorted(clusterCutoffValsLst)
  cutoffKeys = np.array(sortedCutoffs).astype(str)

  clusterDict = dict.fromkeys(cutoffKeys)

  remaining = df.copy()

  # Compare against the numeric cut-off itself rather than int(key), so that
  # non-integer cut-offs do not fail on the str -> int round trip.
  for key, upperBound in zip(cutoffKeys, sortedCutoffs):
    inCluster = remaining.value <= upperBound
    clusterDict[key] = remaining[inCluster]
    # Select the leftovers with a boolean mask: dropping by index label would
    # also delete unrelated rows whenever the input index has duplicate labels.
    remaining = remaining[~inCluster].reset_index(drop = True)

  return clusterDict, cutoffKeys

In [56]:
def solo_or_multiparty(elecDict, year, common_list_dict, partylist):
    """Select the rows of one election year that belong to the requested parties.

    When several parties are requested and two or more of them appear in the
    same common (joint) list for that year, they are collapsed to a single
    representative so the joint list is selected only once. Every matching
    common list is applied, not just the last one.

    Parameters
    ----------
    elecDict : dict
        Maps a year key to a DataFrame that has a ``partyname`` column.
    year : hashable
        Key into ``elecDict`` (and ``common_list_dict``).
    common_list_dict : dict
        Maps a year key to an iterable of party-name tuples. An empty, ``0``,
        ``None`` or missing entry means there were no common lists that year.
    partylist : list of str
        Requested party names. It is not modified.

    Returns
    -------
    pandas.DataFrame or list
        The selected rows (a new frame, in the order the parties were
        requested), or an empty list ``[]`` when nothing matches.
    """
    year_df = elecDict[year]

    # De-duplicate while keeping the caller's order, so a repeated name cannot
    # select (and later double-count) the same rows twice.
    parties = list(dict.fromkeys(partylist))

    if len(parties) > 1:
        for common_list in (common_list_dict.get(year) or ()):
            members = [party for party in common_list if party in parties]
            if len(members) < 2:
                continue  # zero or one requested member: nothing to collapse

            # Deterministic representative: the first member in the common
            # list's own order. NOTE(review): this presumes the joint list's
            # rows exist under each member's name — confirm against the data.
            representative = members[0]
            parties = [party for party in parties
                       if party == representative or party not in members]

    party_frames = []
    for party in parties:
        selected_party_df = year_df[year_df['partyname'] == party]
        if len(selected_party_df) > 0:
            party_frames.append(selected_party_df)

    if not party_frames:
        return []

    # concat returns a new frame, so the caller never holds a view of elecDict.
    return pd.concat(party_frames)
  


In [57]:
def filtering_cols_modifier(df):
  """Aggregate the selected parties' votes per settlement.

  Keeps the settlement, population and vote columns, then sums ``vote_party``
  and ``vote_rate_party`` over rows that share the same settlement,
  settlementid, datatypeid, value and sumvote. The input frame is not modified.

  Parameters
  ----------
  df : pandas.DataFrame
      Frame containing at least the seven columns named below.

  Returns
  -------
  pandas.DataFrame
      One row per group, sorted by ``value`` and then ``settlement``.
  """
  group_keys = ['settlement', 'settlementid', 'datatypeid', 'value', 'sumvote']
  summed_cols = ['vote_party', 'vote_rate_party']
  sort_keys = ['value', 'settlement']

  # sort_values returns a new frame. Sorting the column subset in place is what
  # raised pandas' SettingWithCopyWarning on every call.
  selected = df[group_keys + summed_cols].sort_values(by = sort_keys)

  groupby_vote_rate_party_df = selected.groupby(group_keys).sum().reset_index()

  return groupby_vote_rate_party_df.sort_values(by = sort_keys)
  


In [71]:
def Corr_VoteRate_gen(num, elecDict, elecYearKeyLst, clusterCutoffValsLst, common_list_dict, partylist, isTruncated = False):
  """Main entry point: per-year vote rates by population cluster, plus a correlation.

  For every requested year the rows of the requested parties are selected
  (``solo_or_multiparty``), aggregated per settlement
  (``filtering_cols_modifier``) and split into population clusters
  (``clustermaker``). For each cluster the vote rate is
  ``100 * sum(vote_party) / sum(sumvote)``. In addition the correlation
  between ``vote_rate_party * 100`` and ``log10(value)`` is computed per year.

  Parameters
  ----------
  num : int
      Number of cut-off values; must equal ``len(clusterCutoffValsLst)``.
  elecDict : dict
      Maps a year key to that year's election DataFrame.
  elecYearKeyLst : iterable
      Year keys to process. Years with no matching rows are skipped.
  clusterCutoffValsLst : list
      Upper cut-offs of the clusters. The list is not modified.
  common_list_dict : dict
      Common-list definition per year, passed on to ``solo_or_multiparty``.
  partylist : list of str
      Requested party names.
  isTruncated : bool, default False
      If False, an extra 'Max' cluster reaching up to the year's largest
      ``value`` is appended. If True, rows above the largest cut-off are
      discarded and the last cluster ends at the largest remaining ``value``.

  Returns
  -------
  (pandas.DataFrame, pandas.DataFrame) or str
      ``VoteRateDf`` (columns: 'Year', the cut-offs, 'Max') and ``corrDf``
      (columns: 'Year', 'Corr'), one row per processed year. If ``num`` does
      not match, the error message is printed and returned as a string.
  """

  print(partylist)

  if num != len(clusterCutoffValsLst):
    print('Please provide the right number of cluster cutotoff values as your first argument')
    return 'Please provide the right number of cluster cutotoff values as your first argument'

  # Work on a sorted copy: the caller's list is neither reordered nor
  # overwritten, and the column labels line up with the sorted clusters.
  baseCutoffs = sorted(clusterCutoffValsLst)

  if isTruncated:
    VoteRate_columns = ['Year'] + baseCutoffs[:-1] + ['Max']
  else:
    VoteRate_columns = ['Year'] + baseCutoffs + ['Max']

  corrDf_columns = ['Year', 'Corr']

  voteRateRows = []
  corrRows = []

  for year in elecYearKeyLst:
    final_parties_df = solo_or_multiparty(elecDict, year, common_list_dict, partylist)

    if len(final_parties_df) == 0:
      continue

    final_parties_df = filtering_cols_modifier(final_parties_df)

    if isTruncated:
      num_app = num
      # Always truncate at the caller's largest cut-off. The per-year cut-off
      # list is rebuilt from baseCutoffs, so one year's maximum can never
      # leak into the next year or into a later call.
      final_parties_df = final_parties_df[final_parties_df.value <= baseCutoffs[-1]]
      if len(final_parties_df) == 0:
        continue  # nothing left below the largest cut-off for this year
      maxCutOff = max(final_parties_df.value)
      print(year, '----', 'Max end interval:', maxCutOff)
      yearCutoffs = sorted(baseCutoffs[:-1] + [maxCutOff])

    else:
      num_app = num + 1
      maxCutOff = max(final_parties_df.value)
      print(year, '----', 'Max end interval:', maxCutOff)
      yearCutoffs = baseCutoffs + [maxCutOff]

    clusterDict, clusterKeys = clustermaker(num_app, final_parties_df, yearCutoffs)

    rowListVoteRate = [year]

    for cutoff in clusterKeys:
      cluster_df = clusterDict[cutoff]
      totalVotes = cluster_df['sumvote'].sum()
      if totalVotes == 0:
        # Empty cluster (or no votes at all): report 0 instead of dividing by zero.
        voteRate = 0
      else:
        voteRate = ( cluster_df['vote_party'].sum() / totalVotes ) * 100
      rowListVoteRate.append(round(voteRate,2))

    voteRateRows.append(rowListVoteRate)

    corr_for_that_year = (final_parties_df['vote_rate_party'] * 100).corr(np.log10(final_parties_df['value']))
    corrRows.append([year, round(corr_for_that_year,4)])

  # Build each result frame once instead of concatenating row by row.
  # NOTE: np.array coerces the mixed str/number rows to strings, so every cell
  # of the returned frames is a string; existing consumers see that format.
  if voteRateRows:
    VoteRateDf = pd.DataFrame(np.array(voteRateRows), columns = VoteRate_columns)
    corrDf = pd.DataFrame(np.array(corrRows), columns = corrDf_columns)
  else:
    VoteRateDf = pd.DataFrame(columns = VoteRate_columns)
    corrDf = pd.DataFrame(columns = corrDf_columns)

  VoteRateDf = VoteRateDf.reset_index(drop = True)
  corrDf = corrDf.reset_index(drop = True)

  return VoteRateDf, corrDf


# Main

In [59]:
# Per election year: the groups of parties treated as one common list
# (presumably parties that ran on a joint list that year — confirm).
common_list_dict = {
    '1990': [],
    '1994': [],
    '1998': [],
    '2002': [('Fidesz', 'MDF')],
    '2006': [('Fidesz', 'KDNP'), ('Jobbik', 'MIÉP')],
    '2010': [('Fidesz', 'KDNP')],
    '2014': [('Fidesz', 'KDNP'), ('MSZP', 'DK', 'Együtt', 'PM', 'MLP')],
    '2018': [('Fidesz', 'KDNP'), ('MSZP', 'PM')],
}


In [60]:
# Read every column as text, then strip surrounding whitespace from all cells.
electionVotes_csv = path + 'electionListVotes_Single.csv'
electionVotes = trim_all_columns(pd.read_csv(electionVotes_csv, dtype=str))

In [61]:
# Election years 1990, 1994, ..., 2018 as string keys.
elecYearKeyLst = list(map(str, range(1990, 2022, 4)))

# Three independent per-year containers, each filled in later; every year starts as None.
elecYearDict = {yearKey: None for yearKey in elecYearKeyLst}
elecYearDict_Bp = {yearKey: None for yearKey in elecYearKeyLst}
elecYearDict_BpDistrict = {yearKey: None for yearKey in elecYearKeyLst}


In [62]:
elecYearDict_Bp

{'1990': None,
 '1994': None,
 '1998': None,
 '2002': None,
 '2006': None,
 '2010': None,
 '2014': None,
 '2018': None}

In [63]:
numVarInt = ['value', 'votePop_elig', 'sumvote', 'vote_NOTparty', 'vote_party'] 
numVarFloat = ['vote_rate_NOTparty', 'vote_rate_party', 'vote_rate_NOTparty_per_party']

# Target dtype of every numeric column; electionVotes holds them as strings.
numericDtypes = {**dict.fromkeys(numVarInt, np.int64), **dict.fromkeys(numVarFloat, np.float64)}


def _typed_and_sorted(frame):
  """Return ``frame`` with numeric dtypes applied, sorted by population ``value``, index reset."""
  return frame.astype(numericDtypes).sort_values(by = ['value']).reset_index(drop = True)


for key in elecYearDict:
  yearVotes = electionVotes[electionVotes.year == key]

  # Row whose settlement is exactly 'Budapest' (the city-wide entry).
  isBudapestTotal = yearVotes.settlement.str.fullmatch('Budapest')
  # Rows whose settlement name ends in 'kerület' ("district"). Raw string: '\Z' is a
  # regex end-of-string anchor, not a Python string escape.
  isDistrict = yearVotes.settlement.str.contains(r'kerület\Z')

  elecYearDict[key] = _typed_and_sorted(yearVotes)                               # All data
  elecYearDict_BpDistrict[key] = _typed_and_sorted(yearVotes[~isBudapestTotal])  # Everything except the 'Budapest' row (district rows kept)
  elecYearDict_Bp[key] = _typed_and_sorted(yearVotes[~isDistrict])               # Everything except the district rows ('Budapest' row kept)

# Budapest-wide Data w/o Districts 

# Only Budapest District Data --- elecYearDict_BpDistrict dictionary 

In [76]:
# Full cut-off ladder. For a truncated run use e.g. clusterCutoffValsLst = [100, 500, 1000, 2000]
# together with isTruncated = True.
clusterCutoffValsLst = [100, 500, 1000, 2000, 5000, 10000, 20000, 50000, 100000]
VoteRateDf, corrDf = Corr_VoteRate_gen(
    num = len(clusterCutoffValsLst),
    elecDict = elecYearDict_Bp,
    elecYearKeyLst = ['1998', '2002'],
    clusterCutoffValsLst = clusterCutoffValsLst,
    common_list_dict = common_list_dict,
    partylist = ['Fidesz', 'FKGP', 'MDF'],
)

['Fidesz', 'FKGP', 'MDF']
1998 ---- Max end interval: 1801483
2002 ---- Max end interval: 1726872


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [77]:
VoteRateDf

Unnamed: 0,Year,100,500,1000,2000,5000,10000,20000,50000,100000,Max
0,1998,57.65,51.79,50.8,49.41,48.97,48.32,46.21,44.61,43.73,41.43
1,2002,59.59,54.9,51.92,49.06,46.83,44.41,42.93,40.29,39.25,35.29


In [78]:
corrDf

Unnamed: 0,Year,Corr
0,1998,-0.1723
1,2002,-0.3238


In [79]:
# Same years and cut-offs as before, for FKGP alone; show the correlation table.
VoteRateDf, corrDf = Corr_VoteRate_gen(
    num = len(clusterCutoffValsLst),
    elecDict = elecYearDict_Bp,
    elecYearKeyLst = ['1998', '2002'],
    clusterCutoffValsLst = clusterCutoffValsLst,
    common_list_dict = common_list_dict,
    partylist = ['FKGP'],
)
corrDf

['FKGP']
1998 ---- Max end interval: 1801483
2002 ---- Max end interval: 1726872


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


Unnamed: 0,Year,Corr
0,1998,-0.3075
1,2002,-0.1139


In [None]:
# Truncated run: only settlements up to the largest cut-off are kept.
# (Full ladder for reference: [100, 500, 1000, 2000, 5000, 10000, 20000, 50000, 100000].)
clusterCutoffValsLst = [100, 500, 1000, 2000]
isTruncated = True
VoteRateDf, corrDf = Corr_VoteRate_gen(
    num = len(clusterCutoffValsLst),
    elecDict = elecYearDict_BpDistrict,
    elecYearKeyLst = ['2018'],
    clusterCutoffValsLst = clusterCutoffValsLst,
    common_list_dict = common_list_dict,
    partylist = ['Fidesz', 'KDNP'],
    isTruncated = isTruncated,
)

['Fidesz', 'KDNP']
2018 ---- Max end interval: 1991


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [None]:
VoteRateDf

Unnamed: 0,Year,100,500,1000,Max
0,2018,61.89,61.49,58.45,56.61


In [None]:
# Pass elecYearKeyLst instead of the explicit year list if every election year is needed.
clusterCutoffValsLst = [100, 500, 1000, 2000, 5000, 10000, 20000, 50000, 100000]
VoteRateDf, corrDf = Corr_VoteRate_gen(
    num = len(clusterCutoffValsLst),
    elecDict = elecYearDict_BpDistrict,
    elecYearKeyLst = ['2014', '2018'],
    clusterCutoffValsLst = clusterCutoffValsLst,
    common_list_dict = common_list_dict,
    partylist = ['MSZP', 'DK'],
)

['MSZP', 'DK']
2014 ---- Max end interval: 204867
2018 ---- Max end interval: 202402


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


# Testing Functions

### Szakadat Klaszter

In [None]:
# Alternative cut-off ladder; presumably the "Szakadat" clustering named in the heading above — confirm.
clusterCutoffValsLst = [1000, 2000, 4000, 10000, 20000, 40000, 100000, 400000]
