In [12]:
# Import required libraries
from surprise import Reader, Dataset
from surprise.model_selection import train_test_split, cross_validate, LeaveOneOut
from surprise import KNNBasic, KNNWithMeans,SVD
from surprise import accuracy

import sklearn.metrics as metrics
from sklearn.preprocessing import OneHotEncoder, LabelEncoder

import pandas as pd

In [2]:
# Function to focus on a political party and create a dataframe with the agreement score of each deputy
def voteType(df):
    
    # Count the number of times each voting orientation occurred
    voting = df["new"].value_counts()
  
    ## Calculate the percentage of deputies who vote according to the party sum of all times 
    ## they agree divided by the number of times they voted => percentage
    percent = (df.groupby('deputado_nome')['new'].sum())/df['deputado_nome'].value_counts()
    pdAcord = pd.DataFrame({'deputado_nome':df['deputado_nome'].unique().tolist(),'score_accordingly': round(percent*100,2)})
    
    # Merge the calculated agreement score with the original dataframe
    dfPL = pd.merge(df,pdAcord, on ='deputado_nome')
    
    # Create encoder and label encoder objects
    encoder = OneHotEncoder() 
    label_encoder = LabelEncoder()
    
    # Encode the deputy names, birthplace, and voting orientation
    encoded_deputados = label_encoder.fit_transform(dfPL['deputado_nome'])
    encoded_gender = label_encoder.fit_transform(dfPL['siglaSexo'])
    encoded_municipios = label_encoder.fit_transform(dfPL['municipioNascimento'])
    encoded_orientacao = label_encoder.fit_transform(dfPL['orientacao'])
    encoded_vote = label_encoder.fit_transform(dfPL['voto'])

    
    
    # Add the encoded values to the dataframe
    dfPL['deputado_enc'] = encoded_deputados
    dfPL['orientacao_enc'] = encoded_orientacao
    dfPL['gender_enc'] = encoded_gender
    dfPL['vote_enc'] = encoded_vote
    
    
    return dfPL

In [3]:
##calculates and returns the RMSE and MAE
def evaluateRM(data, algo, n_folds):
    #define cross-validation method to use
    cv = LeaveOneOut()
    
    # Split the data into n_folds for cross-validation
    cross_validate_results = cross_validate(algo, data, measures=['RMSE', 'MAE'],  cv=cv)
    
    # Extract the mean RMSE and MAE values from the cross-validation results
    rmse = cross_validate_results['test_rmse'].mean()
    mae = cross_validate_results['test_mae'].mean()
    
    print("RMSE: ", rmse)
    print("MAE: ", mae)
    
    return rmse, mae

In [4]:
# Load the data into a pandas dataframe
df = pd.read_csv('..\data\preprocessed\dataVoting.csv')
df

Unnamed: 0.1,Unnamed: 0,idVotacao,siglaOrgao,idEvento,aprovacao,uriVotacao,voto,deputado_id,deputado_nome,deputado_siglaPartido,...,idLegislaturaFinal,siglaSexo,municipioNascimento,idade,new,score_accordingly,deputado_enc,orientacao_enc,gender_enc,vote_enc
0,0,2312874-9,PLEN,64653,1.0,https://dadosabertos.camara.leg.br/api/v2/vota...,Sim,204554,Abílio Santana,PL,...,56,M,Salvador,58.0,0,72.56,2,1,1,4
1,1,46249-294,PLEN,64653,0.0,https://dadosabertos.camara.leg.br/api/v2/vota...,Não,204554,Abílio Santana,PL,...,56,M,Salvador,58.0,1,72.56,2,1,1,2
2,2,46249-297,PLEN,64653,1.0,https://dadosabertos.camara.leg.br/api/v2/vota...,Sim,204554,Abílio Santana,PL,...,56,M,Salvador,58.0,1,72.56,2,3,1,4
3,3,46249-312,PLEN,64653,2.0,https://dadosabertos.camara.leg.br/api/v2/vota...,Sim,204554,Abílio Santana,PL,...,56,M,Salvador,58.0,0,72.56,2,1,1,4
4,4,46249-316,PLEN,64653,0.0,https://dadosabertos.camara.leg.br/api/v2/vota...,Não,204554,Abílio Santana,PL,...,56,M,Salvador,58.0,1,72.56,2,1,1,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
112151,112151,2323617-49,PLEN,66394,2.0,https://dadosabertos.camara.leg.br/api/v2/vota...,Sim,122466,Ronaldo Martins,REPUBLICANOS,...,56,M,São Paulo,45.0,0,68.00,317,0,1,4
112152,112152,2323617-56,PLEN,66394,0.0,https://dadosabertos.camara.leg.br/api/v2/vota...,Não,122466,Ronaldo Martins,REPUBLICANOS,...,56,M,São Paulo,45.0,1,68.00,317,1,1,2
112153,112153,2314962-33,PLEN,66455,1.0,https://dadosabertos.camara.leg.br/api/v2/vota...,Sim,122466,Ronaldo Martins,REPUBLICANOS,...,56,M,São Paulo,45.0,1,68.00,317,3,1,4
112154,112154,2326278-38,PLEN,66439,0.0,https://dadosabertos.camara.leg.br/api/v2/vota...,Não,205535,Patricia Ferraz,PODE,...,56,F,Belo Horizonte,43.0,1,40.76,277,1,0,2


In [5]:
dfP = voteType(df)

In [6]:
# Create the reader
reader = Reader(rating_scale=(0, 3))
# Load the data into a dataset
data = Dataset.load_from_df(dfP[['deputado_enc', 'idVotacao', 'vote_enc']], reader)
# Split the data into a training set and a test set
trainset, testset = train_test_split(data, test_size=0.25, random_state=42)

In [7]:
# Train a KNN model
model_knn = KNNBasic(k=40, min_k=1, sim_options={'name': 'pearson_baseline', 'user_based': False})
# model_knn.fit(trainset)

In [8]:
evaluateRM(data, model_knn, 40)

Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
RMSE:  0.8474083952891291
MAE:  0.6984456017184948


(0.8474083952891291, 0.6984456017184948)

In [9]:
# Train a SVD model
model_svd = SVD(n_factors=50, n_epochs=20, lr_all=0.005, reg_all=0.4)
model_svd.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x1b5443855b0>

In [10]:
evaluateRM(data, model_svd, 50)

RMSE:  0.8790145947391739
MAE:  0.8379117462128589


(0.8790145947391739, 0.8379117462128589)

In [11]:
# Use the trained model to make predictions
deputy_id = 277
vote_id = '2314962-33'

prediction = model_knn.predict(deputy_id, vote_id, verbose=True)

user: 277        item: 2314962-33 r_ui = None   est = 3.00   {'actual_k': 1, 'was_impossible': False}


#### With orientation, gender, age and city 

In [None]:
# Convert the data into a Surprise-compatible dataset format
reader = Reader(rating_scale=(0, 3))
data_spec = Dataset.load_from_df(dfP[['deputado_enc', 'idVotacao','vote_enc', 'orientacao_enc','gender_enc','idade']] reader)

In [None]:
# Split the data into training and test sets
trainset_spec = data_spec.build_full_trainset()
testset_spec = trainset_spec.build_anti_testset()

In [None]:
# Train the KNNWithMeans algorithm on the training set
algo = KNNWithMeans(k=50, sim_options={'user_based': False})
algo.fit(trainset_spec)

In [None]:
# Make predictions on the test set
predictions_spec = algo.test(testset_spec)

In [None]:
evaluateRM(data, algo, 50)