In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
import plotly.graph_objs as go
import plotly.express as px
from plotly.offline import iplot
import plotly.figure_factory as ff

In [None]:
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.preprocessing import OrdinalEncoder
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score,confusion_matrix
from sklearn.metrics import classification_report

# Take a Quick Look at the Data Structure

In [None]:
# Read the congressional voting records and preview the first rows.
VOTES_CSV = '../input/congressional-voting-records/house-votes-84.csv'
df = pd.read_csv(VOTES_CSV)
df.head(n=5)

In [None]:
class Info:
    """Summarise the votes recorded in every column of a voting-records frame.

    On construction the 'Class Name' column is dropped and, for each remaining
    column, the number of 'n', 'y' and '?' entries is counted. The counts and a
    null-value report are collected in ``self.df`` (one row per column), and a
    short textual summary is printed.
    """

    def __init__(self,dataframe):
        """Build the summary for ``dataframe``.

        Parameters
        ----------
        dataframe : pandas.DataFrame
            Frame holding a 'Class Name' column plus the vote columns.

        Raises
        ------
        KeyError
            If ``dataframe`` has no 'Class Name' column.
        """
        # Remember the shape of the frame the caller passed in so the report
        # does not have to read a notebook-level global.
        self._source_shape = dataframe.shape
        self.dataframe = dataframe.drop(['Class Name'],axis=1)
        self.columns = list(self.dataframe.columns)
        self.df = pd.DataFrame()
        self.listY = []
        self.listN = []
        self.listQ = []
        self.main()

    def _count(self,value):
        """Return, for every column, how many entries equal ``value``."""
        return [(self.dataframe[col] == value).sum() for col in self.columns]

    def countY(self,x='y'):
        """Store the per-column count of ``x`` in ``listY`` and ``df[x]``."""
        # Rebuilt on every call so that calling twice cannot grow the list.
        self.listY = self._count(x)
        self.df[x] = self.listY

    def countN(self,x='n'):
        """Store the per-column count of ``x`` in ``listN`` and ``df[x]``."""
        self.listN = self._count(x)
        self.df[x] = self.listN

    def countQ(self,x='?'):
        """Store the per-column count of ``x`` in ``listQ`` and ``df[x]``."""
        self.listQ = self._count(x)
        self.df[x] = self.listQ

    def missing_zero_values_table(self):
        """Append null counts, null percentages and dtypes to ``self.df``.

        Only real nulls (as reported by ``isnull``) are counted here; '?'
        entries are ordinary strings and are tallied by ``countQ`` instead.
        Also prints the shape of the frame given to the constructor and the
        number of columns containing nulls.
        """
        null_mask = self.dataframe.isnull()
        mis_val = null_mask.sum()
        mis_val_percent = round(null_mask.mean().mul(100),2)
        mz_table = pd.concat([mis_val,mis_val_percent],axis=1)
        mz_table = mz_table.rename(columns = {0:'Missing Values',1:'% of Total Values'})
        mz_table['Data_type'] = self.dataframe.dtypes
        n_rows, n_cols = self._source_shape
        # Column 1 is the percentage; non-zero means the column has nulls.
        n_with_missing = mz_table[mz_table.iloc[:,1] != 0].shape[0]
        print("Your selected dataframe has ** "+str(n_cols)+" ** columns and ** "+str(n_rows)+" ** Rows.\n"
                 "There are ** "+str(n_with_missing)+
                  " ** columns that have missing values. \n")
        self.df = pd.concat([self.df,mz_table],axis=1)

    def main(self):
        """Run every summary step and print the overall vote totals."""
        # Start from a fresh frame so the summary can be rebuilt safely.
        self.df = pd.DataFrame({'col': self.columns})
        self.countN()
        self.countY()
        self.countQ()
        self.df.set_index('col',inplace=True)
        self.missing_zero_values_table()
        total_n, total_y, total_q = sum(self.listN), sum(self.listY), sum(self.listQ)
        print('Sum No : ',total_n,
              ', Sum Yes : ',total_y,
              ', Sum ? : ',total_q,
             ', Total : ',total_n+total_y+total_q)

In [None]:
# Build the per-column vote summary and shade larger values darker red.
a = Info(dataframe=df)
a.df.style.background_gradient(cmap='Reds')

In [None]:
# Stacked bars of 'n' / 'y' / '?' counts for every column of the summary.
px.bar(
    a.df,
    x=a.df.index,
    y=['n', 'y', '?'],
    title="Votes related to laws",
)

# Create a Test Set

In [None]:
# Count the members of each party.
x = df['Class Name'].value_counts()
# Take the labels from the counts themselves: value_counts() is ordered by
# frequency while unique() is ordered by first appearance, so mixing the two
# can attach a count to the wrong party.
labels = x.index.tolist()

In [None]:
# Pie chart of party membership: slice text shows the raw count,
# hovering shows the label and its percentage.
className = go.Pie(
    labels=labels,
    values=x,
    textinfo='value',
    hoverinfo='label+percent',
)
iplot([className])

The test set must be **representative** of the overall party distribution, so the split is stratified on `Class Name`.

In [None]:
# A single stratified 80/20 shuffle split keyed on the party label.
split = StratifiedShuffleSplit(n_splits=1,test_size=0.2,random_state=42)
# n_splits=1, so the generator yields exactly one (train, test) pair.
train_index,test_index = next(split.split(df,df['Class Name']))
# split() yields positional indices, so rows are selected by position
# (.iloc) rather than by index label (.loc).
strat_train_set = df.iloc[train_index]
strat_test_set = df.iloc[test_index]

In [None]:
# Share of each party in the test set.
test_class_counts = strat_test_set['Class Name'].value_counts()
test_class_counts.div(len(strat_test_set))

Good job: the stratified split keeps the party proportions of the test set in line with the full dataset.

# Discover and visualize the data to gain insights

In [None]:
# Most frequent entry of every column after the first one.
resVoting = {
    col: df[col].value_counts().idxmax()
    for col in df.columns.tolist()[1:]
}

In [None]:
# Turn the {column: most frequent value} mapping into a two-column frame.
resVoting = pd.DataFrame({
    'columns': list(resVoting.keys()),
    'FinalRes': list(resVoting.values()),
})
resVoting

In [None]:
# For every column after the first, find the (vote, party) pair that occurs
# most often.
maximumVote = {}
for col in df.columns.tolist()[1:]:
    # size() counts the rows of each group directly; this avoids counting the
    # non-null cells of an unrelated column and then picking the first result
    # with a positional [0] lookup on a label-indexed Series.
    maximumVote[col] = df.groupby([col,'Class Name']).size().idxmax()

In [None]:
# Turn the {column: (vote, party)} mapping into a two-column frame.
maximumVote = pd.DataFrame({
    'columns': list(maximumVote.keys()),
    'TupleVotes': list(maximumVote.values()),
})
maximumVote

In [None]:
# Split the (vote, party) tuples into two parallel sequences.
Res, className = zip(*maximumVote['TupleVotes'].tolist())

In [None]:
# Store the unpacked vote and party as separate columns and discard the
# original tuple column.
maximumVote = (
    maximumVote
    .assign(Res=Res, className=className)
    .drop(columns=['TupleVotes'])
)

In [None]:
# Left-join the dominant (vote, party) pair onto the per-column result.
resVoting = pd.merge(resVoting, maximumVote, how='left', on='columns')

In [None]:
# Keep the rows where the dominant (vote, party) pair agrees with the overall
# result, then count how often each party appears among them.
agrees_with_result = resVoting['FinalRes'] == resVoting['Res']
winnerParty = resVoting.loc[agrees_with_result, 'className'].value_counts()
px.bar(
    x=winnerParty.index,
    y=winnerParty,
    title='The number of victories in passing a law in different parties',
    labels={'y':'Number of successes '},
)

It seems the Democratic party was the stronger one in these votes.

In [None]:
# Count each overall result once and reuse the counts for values and labels.
x = resVoting['FinalRes'].value_counts()
labels = x.index
ResultVoting = go.Pie(
    labels=labels,
    values=x,
    hoverinfo='label+percent',
    textinfo='value',
    title='Voting results',
)
iplot([ResultVoting])

Most of the measures were approved.

# Prepare the Data for Machine Learning Algorithms

In [None]:
# Separate the training labels (party) from the training features (votes).
voting_labels = strat_train_set['Class Name'].copy()
voting = strat_train_set.drop(columns=['Class Name'])

**Handling Categorical Attributes**

In [None]:
# Learn the category-to-number mapping on the training features, then apply it.
ordinal_encoder = OrdinalEncoder()
ordinal_encoder.fit(voting)
voting_cat_encoded = ordinal_encoder.transform(voting)
voting_cat_encoded

# Select and Train a Model

In [None]:
# Fit a Gaussian Naive Bayes classifier on the encoded training votes.
clf = GaussianNB()
clf.fit(X=voting_cat_encoded, y=voting_labels)

In [None]:
# Accuracy (in percent) on the data the model was trained on.
pred = clf.predict(voting_cat_encoded)
# Arguments follow the (y_true, y_pred) order, matching the
# confusion_matrix call in this notebook.
accuracy_score(voting_labels,pred)*100

In [None]:
# Heatmap of the confusion matrix. The same label list is given to
# confusion_matrix and to both axes so the tick labels always match the
# row/column order of the matrix.
class_names = ['democrat', 'republican']
px.imshow(confusion_matrix(voting_labels, pred, labels=class_names),
         labels=dict(y="True Label", x="Predicted Label", color="Count"),
                x=class_names,
                y=class_names)

The model appears to be neither overfitting nor underfitting (the test-set evaluation below is the real check).  
OK, fine.

I know there are other great algorithms, such as SVM, but I chose to implement this with Gaussian Naive Bayes.

# Evaluate our Model on the Test Set

In [None]:
# Separate the test labels (party) from the test features (votes).
voting_labels = strat_test_set['Class Name'].copy()
voting = strat_test_set.drop(columns=['Class Name'])

In [None]:
# Encode the test features with the encoder that was fitted on the training
# set. Fitting a new encoder here would learn the mapping from the test data,
# which can assign different codes than the ones the model was trained on.
# transform() raises ValueError if the test set holds a category that was
# never seen during fitting.
voting_cat_encoded = ordinal_encoder.transform(voting)
voting_cat_encoded

In [None]:
# Accuracy (in percent) on the held-out test set.
pred = clf.predict(voting_cat_encoded)
# Arguments follow the (y_true, y_pred) order, matching the
# confusion_matrix call in this notebook.
accuracy_score(voting_labels,pred)*100

Work in progress