In [7]:
# Import required libraries
from surprise import Reader, Dataset
from surprise.model_selection import train_test_split, cross_validate, LeaveOneOut
from surprise import KNNBasic, KNNWithMeans, SVD
from surprise import accuracy

import sklearn.metrics as metrics
from sklearn.preprocessing import OneHotEncoder, LabelEncoder

import pandas as pd

In [2]:
# Function to focus on a political party and create a dataframe with the agreement score of each deputy
def voteType(df):
    """Add a per-deputy agreement score and label-encoded columns to the voting data.

    Parameters
    ----------
    df : pandas.DataFrame
        One row per recorded vote. Must contain the columns ``deputado_nome``,
        ``new``, ``siglaSexo``, ``municipioNascimento``, ``orientacao`` and
        ``voto``. ``new`` is summed per deputy, so it is expected to be a 0/1
        flag marking the rows where the deputy voted as oriented.

    Returns
    -------
    pandas.DataFrame
        A new frame (``df`` itself is not modified) holding the original
        columns plus:

        * ``score_accordingly`` - percentage (rounded to 2 decimals) of the
          deputy's rows whose ``new`` flag is set;
        * ``deputado_enc``, ``orientacao_enc``, ``gender_enc``, ``vote_enc``,
          ``municipio_enc`` - integer codes produced by ``LabelEncoder`` for
          ``deputado_nome``, ``orientacao``, ``siglaSexo``, ``voto`` and
          ``municipioNascimento`` respectively.
    """
    # Agreement rate per deputy: rows where they agreed divided by all of
    # their rows. Both series come from the same groupby, so they share one
    # index and divide element by element without any re-alignment.
    votes_by_deputy = df.groupby('deputado_nome')['new']
    percent = votes_by_deputy.sum() / votes_by_deputy.size()

    # Build the score table from the grouped result itself so every name stays
    # attached to its own score. Pairing the grouped values with a separately
    # ordered list of names (e.g. ``unique()``, which follows appearance
    # order) assigns scores to the wrong deputies.
    pdAcord = (percent * 100).round(2).rename('score_accordingly').reset_index()

    # Merge the calculated agreement score with the original dataframe
    dfPL = pd.merge(df, pdAcord, on='deputado_nome')

    # Encode each categorical column with its own freshly fitted LabelEncoder.
    # (source column, new encoded column)
    columns_to_encode = (
        ('deputado_nome', 'deputado_enc'),
        ('orientacao', 'orientacao_enc'),
        ('siglaSexo', 'gender_enc'),
        ('voto', 'vote_enc'),
        # The birthplace was encoded before but never stored on the frame.
        ('municipioNascimento', 'municipio_enc'),
    )
    for source_column, encoded_column in columns_to_encode:
        dfPL[encoded_column] = LabelEncoder().fit_transform(dfPL[source_column])

    return dfPL

In [8]:
##calculates and returns the RMSE and MAE
def evaluateRM(data, algo, n_folds):
    """Cross-validate ``algo`` on ``data`` and report the mean RMSE and MAE.

    Parameters
    ----------
    data : surprise.Dataset
        Dataset to cross-validate on.
    algo : surprise.AlgoBase
        Algorithm to evaluate; ``cross_validate`` fits it on every fold.
    n_folds : int
        Number of leave-one-out splits to run.

    Returns
    -------
    tuple
        ``(rmse, mae)``, each averaged over the ``n_folds`` test folds. Both
        values are also printed.
    """
    # Leave-one-out iterator: every user contributes exactly one rating to the
    # test fold. ``n_folds`` must be passed explicitly, otherwise the iterator
    # silently falls back to its own default number of splits.
    cv = LeaveOneOut(n_splits=n_folds)

    # Run the cross-validation, collecting RMSE and MAE for each fold
    cross_validate_results = cross_validate(algo, data, measures=['RMSE', 'MAE'], cv=cv)

    # Extract the mean RMSE and MAE values from the cross-validation results
    rmse = cross_validate_results['test_rmse'].mean()
    mae = cross_validate_results['test_mae'].mean()

    print("RMSE: ", rmse)
    print("MAE: ", mae)

    return rmse, mae

In [4]:
# Load the data into a pandas dataframe
# ('dataVoting.csv' is a relative path, so it is resolved against the
# notebook's current working directory)
df = pd.read_csv('dataVoting.csv')
# Leaving `df` as the last expression makes the notebook render a preview of it
df

Unnamed: 0.1,Unnamed: 0,idVotacao,siglaOrgao,idEvento,aprovacao,uriVotacao,voto,deputado_nome,deputado_siglaPartido,orientacao,idLegislaturaInicial,idLegislaturaFinal,siglaSexo,municipioNascimento,idade,new
0,2,2312874-9,PLEN,64653,1.0,https://dadosabertos.camara.leg.br/api/v2/vota...,Sim,Abílio Santana,PL,Não,56,56,M,Salvador,57.0,0
1,3,2312874-9,PLEN,64653,1.0,https://dadosabertos.camara.leg.br/api/v2/vota...,Sim,Abílio Santana,PL,Liberado,56,56,M,Salvador,57.0,0
2,4,2312874-9,PLEN,64653,1.0,https://dadosabertos.camara.leg.br/api/v2/vota...,Sim,Abílio Santana,PL,Sim,56,56,M,Salvador,57.0,1
3,7,2312874-9,PLEN,64653,1.0,https://dadosabertos.camara.leg.br/api/v2/vota...,Sim,Abílio Santana,PL,Obstrução,56,56,M,Salvador,57.0,0
4,8,2312874-9,PLEN,64653,1.0,https://dadosabertos.camara.leg.br/api/v2/vota...,Sim,Abílio Santana,PL,Sim,56,56,M,Salvador,57.0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2499418,3006451,2314962-33,PLEN,66455,1.0,https://dadosabertos.camara.leg.br/api/v2/vota...,Sim,Patricia Ferraz,PODE,Sim,56,56,F,Belo Horizonte,43.0,1
2499419,3006452,2314962-33,PLEN,66455,1.0,https://dadosabertos.camara.leg.br/api/v2/vota...,Sim,Patricia Ferraz,PODE,Sim,56,56,F,Belo Horizonte,43.0,1
2499420,3006453,2314962-33,PLEN,66455,1.0,https://dadosabertos.camara.leg.br/api/v2/vota...,Sim,Patricia Ferraz,PODE,Sim,56,56,F,Belo Horizonte,43.0,1
2499421,3006454,2314962-33,PLEN,66455,1.0,https://dadosabertos.camara.leg.br/api/v2/vota...,Sim,Patricia Ferraz,PODE,Sim,56,56,F,Belo Horizonte,43.0,1


In [5]:
# Add the per-deputy agreement score and the label-encoded columns to the raw data
dfP = voteType(df)

In [6]:
# The source table can hold the same (deputy, vote, encoded vote) triple on
# several rows that differ only in other columns such as `orientacao`. Keep
# one row per triple so identical ratings cannot end up in both the training
# and the test split.
ratings_df = dfP[['deputado_enc', 'idVotacao', 'vote_enc']].drop_duplicates()

# Create the reader. The scale is taken from the encoded votes themselves
# rather than hard-coded, because the number of codes depends on how many
# distinct `voto` labels the data contains.
reader = Reader(rating_scale=(int(ratings_df['vote_enc'].min()), int(ratings_df['vote_enc'].max())))
# Load the data into a dataset (columns must be user id, item id, rating - in this order)
data = Dataset.load_from_df(ratings_df, reader)
# Split the data into a training set and a test set
trainset, testset = train_test_split(data, test_size=0.25, random_state=42)

In [10]:
# Configure a KNN model: up to 40 neighbours (at least 1), similarities
# computed with the 'pearson_baseline' measure. 'user_based': False makes
# Surprise compare items with each other instead of users.
model_knn = KNNBasic(k=40, min_k=1, sim_options={'name': 'pearson_baseline', 'user_based': False})
# The model is not fitted here; it is handed to evaluateRM, whose
# cross_validate call fits it on each fold.

In [None]:
# Cross-validate the KNN model on the full dataset; prints and returns the mean RMSE and MAE
evaluateRM(data, model_knn, 40)

Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Estimating biases using als...


In [None]:
# Train a SVD model
# 50 latent factors, 20 training epochs, one learning rate (lr_all) and one
# regularisation term (reg_all) shared by all parameters.
model_svd = SVD(n_factors=50, n_epochs=20, lr_all=0.005, reg_all=0.4)
# Fit on the 75% training split created earlier
model_svd.fit(trainset)

In [None]:
# Cross-validate the SVD model on the full dataset; prints and returns the mean RMSE and MAE
evaluateRM(data, model_svd, 50)

In [None]:
# Use the trained model to make predictions
deputy_id = 277            # raw user id: a value of the 'deputado_enc' column
vote_id = '2314962-33'     # raw item id: a value of the 'idVotacao' column

# Predict with the SVD model fitted above. `algo` is only assigned further
# down in the notebook, so referring to it here fails on a fresh kernel.
prediction = model_svd.predict(deputy_id, vote_id, verbose=True)

#### With orientation, gender, age and city 

In [None]:
# Convert the data into a Surprise-compatible dataset format
reader = Reader(rating_scale=(0, 3))
# Dataset.load_from_df accepts exactly three columns, in the order
# (user id, item id, rating); a wider frame cannot be loaded.
# NOTE(review): 'orientacao_enc', 'gender_enc' and 'idade' therefore cannot be
# passed to Surprise as side features. Using them needs a different approach
# (e.g. folding them into the user/item ids or a feature-aware library).
data_spec = Dataset.load_from_df(dfP[['deputado_enc', 'idVotacao', 'vote_enc']], reader)

In [None]:
# Build the training and prediction sets (this is not a hold-out split):
# the trainset is built from every rating in data_spec ...
trainset_spec = data_spec.build_full_trainset()
# ... and the "anti-testset" lists the (user, item) pairs that have no rating
# in that trainset, i.e. the votes the model still has to predict.
testset_spec = trainset_spec.build_anti_testset()

In [None]:
# Train the KNNWithMeans algorithm on the training set
# k=50 neighbours; 'user_based': False makes Surprise compute similarities
# between items rather than between users.
algo = KNNWithMeans(k=50, sim_options={'user_based': False})
algo.fit(trainset_spec)

In [None]:
# Make predictions for every pair in testset_spec (the anti-testset built
# from the full trainset, i.e. pairs without a known rating)
predictions_spec = algo.test(testset_spec)

In [None]:
# Cross-validate the KNNWithMeans model on the dataset built for this section
# (data_spec) rather than on the baseline `data` object.
evaluateRM(data_spec, algo, 50)