In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re
import plotly.express as px
import matplotlib.pyplot as plt
import plotly.graph_objects as go

from plotly.subplots import make_subplots

In [None]:
# Load the raw candidate table; the path is relative to the notebook's working directory.
df = pd.read_csv('../input/indian-candidates-for-general-election-2019/LS_2.0.csv')

In [None]:
# Preview the first rows of the raw frame.
df.head()

In [None]:

def clean(text):
    """Return ``text`` lower-cased with every newline character removed."""
    return re.sub(r'\n', '', text.lower())

In [None]:
# Working column labels: every raw label passed through clean().
cols = list(map(clean, df.columns.tolist()))

In [None]:
# Show the cleaned column labels.
cols

In [None]:
# Shorten three unwieldy labels in place. list.index raises ValueError when a
# label is absent, so a changed CSV header (or a second run of this cell)
# fails loudly here instead of silently skipping the rename.
for old_label, new_label in (
    ('criminalcases', 'cases'),
    ('over total electors in constituency', 'constituency electors'),
    ('over total votes polled in constituency', 'constituency votes'),
):
    cols[cols.index(old_label)] = new_label

In [None]:
# Work on a copy so the raw frame `df` is left untouched by the cleaning below.
df1 = df.copy()

In [None]:
# Apply the cleaned/renamed labels to the working copy.
df1.columns = cols

In [None]:
# Preview the working copy with its new column labels.
df1.head()

In [None]:
# Per-column dtypes and non-null counts, to spot missing values before cleaning.
df1.info()

In [None]:
# Frequency of each liabilities value as loaded (before the cleaning cells below).
df1.liabilities.value_counts()

In [None]:
# Replace every missing value in the working frame with 0.
df1 = df1.fillna(0)

In [None]:
def get_money(x):
    """Extract the amount from a raw assets/liabilities value.

    The amount is the second whitespace-separated token of the text that
    precedes the first ``~`` (for example ``'Rs 30,99,414 ~ 30 Lacs+'`` yields
    ``'3099414'``), with commas removed.

    Args:
        x: Raw cell value; normally a string, but may be any object.

    Returns:
        The amount as a ``str`` of the token with commas stripped, or the
        integer ``0`` when ``x`` is not a string or has no second token
        (e.g. ``'0'`` or a filled-in ``0``).
    """
    try:
        return x.split('~')[0].split()[1].replace(',', '')
    except (AttributeError, IndexError):
        # AttributeError: x is not a string (e.g. the 0 written by fillna).
        # IndexError: the text before '~' has fewer than two tokens.
        return 0

In [None]:
# Map the textual "no data" placeholders to the string '0' in the three
# columns that are converted to numbers further down.
placeholder_to_zero = {'Not Available':'0','Nil':'0'}
for column_name in ('assets', 'liabilities', 'cases'):
    df1[column_name] = df1[column_name].replace(placeholder_to_zero)

In [None]:
# Parse the amount out of each raw money string, then cast the column to a numeric dtype.
for money_column in ('assets', 'liabilities'):
    df1[money_column] = pd.to_numeric(df1[money_column].apply(get_money))

In [None]:
# Cast the case counts to a numeric dtype.
df1['cases'] = pd.to_numeric(df1['cases'])

In [None]:
# Collapse the stray 'Post Graduate\n' label into 'Post Graduate'.
# The result is assigned back to the column: calling replace(..., inplace=True)
# on `df1.education` acts on a temporary Series, which pandas warns about and
# which leaves df1 unchanged when Copy-on-Write is enabled.
df1['education'] = df1['education'].replace({'Post Graduate\n':'Post Graduate'})

# Starting Analysis 

In [None]:
# Preview the cleaned working frame.
df1.head()

In [None]:
# Number of column labels in `cols`.
len(cols)

In [None]:
# Rows of the working frame whose winner flag equals 1.
winners = df1.loc[df1['winner'] == 1]

In [None]:
# Grid of bar charts: one panel per column of `winners`, each showing that
# column's five most frequent values.
GRID_COLS = 5
panel_columns = winners.columns.tolist()
# Keep the 4-row layout, growing only if there are more columns than 4 x 5 panels.
grid_rows = max(4, -(-len(panel_columns) // GRID_COLS))

fig = make_subplots(rows=grid_rows, cols=GRID_COLS, subplot_titles=panel_columns)
for panel_idx, column_name in enumerate(panel_columns):
    # Keep value_counts() as a Series and read its values directly. Wrapping it
    # in a DataFrame and indexing by the column label only works on pandas < 2.0;
    # from 2.0 on the counts are named 'count' and that lookup raises KeyError.
    counts = winners[column_name].value_counts().head(5)
    # The panel position is derived from the column's index so that every trace
    # lands under the subplot title of the same column.
    grid_row, grid_col = divmod(panel_idx, GRID_COLS)
    fig.add_trace(
        go.Bar(x=counts.index.tolist(), y=counts.tolist()),
        row=grid_row + 1,
        col=grid_col + 1,
    )
fig.update_layout(height=800)
fig.show()

Conclusions from above graphs for winners:

1) BJP had the majority of the winning candidates.

2) A significant number of winners were educated to college level. This suggests that people trust educated candidates.

3) More than 50% of the winners didn't have criminal cases, which is a good sign.

4) The average age of the winning candidates seems to be about 55.

5) Not many women candidates won the election compared to men.

6) General Category candidates had the majority winning share.

#Let's see what % of constituencies were won by BJP.

In [None]:
# Rows of the working frame whose party is 'BJP', followed by a preview.
bjp = df1.loc[df1['party'] == 'BJP']
bjp.head()

In [None]:
# States in which `bjp` has more than 10 rows (presumably one row per
# constituency contested -- confirm against the data).
# The counts Series is filtered directly: selecting the counts through a
# column named 'state' only works on pandas < 2.0, because from 2.0 on
# value_counts() names its result 'count'.
state_counts = bjp['state'].value_counts()
cons = state_counts[state_counts > 10].index.tolist()

In [None]:
# BJP rows restricted to the states listed in `cons`. The explicit copy makes
# `votesrec` an independent frame, so adding a column below does not raise
# SettingWithCopyWarning or depend on whether the filter returned a view.
votesrec = bjp[bjp.state.isin(cons)].copy()
# Votes received as a percentage of the constituency's electors, to 2 decimals.
votesrec['percentvotes'] = (votesrec['totalvotes'] / votesrec['total electors'] * 100).round(2)
votesrec

In [None]:
# d1: rows per state in `votesrec` (candidates fielded).
d1 = votesrec.groupby('state').agg({'constituency':'count'})
# d2 / d3: the same count restricted to winning rows.
d2 = votesrec[votesrec.winner == 1]
# States with no winning row are absent from the grouped result; reindexing
# against d1 gives each of them an explicit 0 instead of patching a single
# hard-coded state, so no state ends up as NaN in the percentage below.
d3 = d2.groupby('state').agg({'constituency':'count'}).reindex(d1.index, fill_value=0)
# d4: percentage of fielded candidates that won, per state.
d4 = pd.Series(d3.constituency / d1.constituency * 100)
px.bar(data_frame=d4)

In [None]:
# Won / lost tallies for the scatter title.
p = len(votesrec[votesrec.winner == 1])
n = len(votesrec[votesrec.winner == 0])

# Per-state win percentage, rounded to two decimals.
print("Percentage of Constituencies Won in each states with no. of constituencies > 10 :")
for state_name, win_percent in d4.items():
    print("{} : {}%".format(state_name, round(win_percent, 2)))

# Votes received against electorate size, coloured by the winner flag.
px.scatter(
    data_frame=votesrec,
    x='totalvotes',
    y='total electors',
    color='winner',
    hover_data=['state','constituency','party','percentvotes'],
    title="States with constituencies >10; Won :{} and Lost : {}".format(p, n),
)


#We can conclude from the above graph that the BJP was popular where the total number of electors was greater than 1.5M.
#These areas seem to be mostly urban. Thus their success rate is higher in cities. They mainly work for the benefit and
#development of cities and thus win more votes for themselves.

#Now lets look at BJP data and why they were so successful in the campaign.

In [None]:
# (rows, columns) of the BJP subset.
bjp.shape 

In [None]:
# Histogram of the winner flag across BJP candidates.
px.histogram(data_frame=bjp,x='winner',opacity=0.3)

#did assets and liabilities matter in victory?

In [None]:
# Assets against liabilities for BJP candidates, coloured by the winner flag.
px.scatter(data_frame = bjp,x='liabilities',y='assets',color='winner')

#Thus, from the above graph of liabilities vs assets, we can conclude that candidates with higher liabilities had a high winning
#rate. Meanwhile, with just high assets, the winning chances aren't very high. This might be because these leaders took
#money from people and invested it in assets rather than focusing on development.

#What percent of bjp leaders were educated and won ?

In [None]:
# Win percentage of BJP candidates per education level.
# NOTE: the name `dict` shadows the builtin; it is kept only because the next
# cell reads the mapping under this name.
dict = {}
for level in bjp['education'].unique().tolist():
    outcomes = bjp.loc[bjp['education'] == level, 'winner']
    # Count wins and losses explicitly. Looking them up with value_counts()[1]
    # and value_counts()[0] raises KeyError when one side is absent, which
    # previously recorded 0% for a level where every candidate had won.
    n_won = int((outcomes == 1).sum())
    n_lost = int((outcomes == 0).sum())
    n_decided = n_won + n_lost
    dict[level] = round(n_won / n_decided * 100, 2) if n_decided else 0


In [None]:
# Turn the education -> win-percentage mapping into a frame indexed by education level.
# Note that this rebinds `d1`, which an earlier cell used for per-state counts.
d1 = pd.DataFrame.from_dict(dict,orient='index')

In [None]:
# Attach each education level's win percentage to every BJP row, then store it
# as display text with a trailing '%'.
bjp_with_rate = pd.merge(bjp, d1, right_index=True, left_on='education')
bjp = bjp_with_rate.rename({0:'educationwinpercent'}, axis=1)
bjp['educationwinpercent'] = bjp['educationwinpercent'].astype('str') + '%'

In [None]:
# Per education level: sum of the winner flag and number of candidate names,
# shown as grouped bars.
education = (
    bjp.groupby(['education'])
    .agg({'winner':'sum','name':'count'})
    .rename({'winner':'Candidates Won','name':'Total Candidates'}, axis=1)
)
px.bar(education, barmode='group')

#Above chart shows the distribution of bjp candidates with a certain level of education.Red indicates total no of candidates
#with an education while blue shows the number who got elected.

In [None]:
# Share of BJP candidates per education level; the hover text carries the level's win-percentage string.
px.pie(data_frame=bjp,names='education',hover_data=['educationwinpercent'])

#From the above pie chart, we can conclude that most of the candidates who stood for election for BJP were highly educated.
#Alongside this, all the highly educated candidates had a winning rate of more than 70%. But the winning percentage doesn't seem to be
#affected by level of education, as even candidates with below 10th grade education had 70% of candidates winning the election.
#Still, it is good to see educated people taking a stand in the world of politics.

In [None]:
# Preview the full working frame again before the criminal-case analysis.
df1.head()

In [None]:
#Let's explore the criminal background of the candidates

In [None]:
# Split all candidates at the 4-case mark. The two groups are exact
# complements (< 4 and >= 4), so the second label says "4 or more": the old
# "more than 4" label described a count that also included candidates with
# exactly 4 cases.
total_candidates = len(df1)
c = df1.cases[df1.cases < 4].tolist()
print(f"Total Canditates : {total_candidates}")
print(f"Candidates with less than 4 cases : {len(c)}")
print(f"Candidates with 4 or more cases : {total_candidates - len(c)}")


In [None]:
# Distribution of case counts for candidates with strictly more than 4 cases.
# (The original note put this group at 218 candidates; the figure is not recomputed here -- verify.)
px.histogram(df1[ (df1.cases > 4)],'cases',nbins=40)

#From above data, we can still see that many candidates have some criminal cases registered to their name. Considering our
#country, cases against politicians hardly surface. Thus we can also say that even though a candidate has less registered
#cases, having some cases registered to their name means they do take unethical paths to get their jobs done and might be  
#involved in more criminal activity that hasn't surfaced yet.

#What about the winners?

In [None]:
# Independent copy of the working frame with a boolean flag: True when the
# candidate has at least one case.
cases = df1.copy()
cases['yes'] = cases['cases'].gt(0)

In [None]:
# Total number of candidates in the flagged frame.
len(cases)

In [None]:
# Cross-tabulate "has at least one case" against the election outcome.
has_case = cases['yes']
lost = cases['winner'] == 0
won = cases['winner'] == 1

c1 = int((~has_case & lost).sum())   # losing candidates with no cases
c2 = int((~has_case & won).sum())    # winning candidates with no cases
# The "at least one case" counts are taken within each outcome group.
# Subtracting c1 / c2 from the total number of candidates (as done before)
# mixed winners into the losing figure and losers into the winning figure.
losing_with_case = int((has_case & lost).sum())
winning_with_case = int((has_case & won).sum())

nl = '\n'
# One row per category after the transpose; the counts end up in column 0.
new_df =pd.DataFrame.from_dict( {'Losing Candidates with no cases':[c1],
          'Losing candidates atleast one case' : [losing_with_case],
          'Winning Candidates with no cases' : [c2],
          'Winning candidates atleast one case' : [winning_with_case]
         }).T
print(f"Losing Candidates with no cases : {c1} Losing candidates atleast one case : {losing_with_case}{nl}Winning Candidates with no cases : {c2} Winning candidates atleast one case : {winning_with_case}")
px.bar(cases,'winner','cases',color='cases',
       title="",
       color_continuous_scale='viridis',barmode='relative')

In [None]:
# Show the four-row case/outcome table.
new_df

In [None]:
# Pie of the four categories; slice sizes come from column 0, slice names from the index.
px.pie(data_frame = new_df,values=0,names=new_df.index)

#From the above graphs, we can see that most of the elected candidates did have some amount of criminal activity ongoing. It is
#quite shocking to see that only 306 candidates have a clean background. This points towards corruption in our current
#system, where 1200 candidates who do their work ethically couldn't get nominated. We need to overcome this, as it is really
#not beneficial for our system to be governed by criminals.

In [None]:
# Preview the working frame before the per-party comparison.
df1.head()

#Let us see how all the parties performed in the election

In [None]:
# Ten parties with the most rows in df1 (candidates fielded). The table is
# computed once and reused for the chart; it was previously built twice, with
# the first result discarded because it was not the cell's last expression.
top_parties_by_candidates = (
    df1.groupby(['party'])
    .agg({'state':'count'})
    .sort_values(by='state',ascending=False)[:10]
)
px.bar(top_parties_by_candidates,
title = "Parties with Most number of Candidates",labels = {"party":"Party Name","value":"No. of Candidates"})

In [None]:
# Candidates fielded per party (rows in df1) and seats won per party (rows in winners).
# Descriptive names replace `all` / `win`, so the builtin all() is no longer shadowed.
candidates_by_party = df1.groupby(['party']).agg({'state':'count'})
# Parties without a single winner are missing from the winners' group-by;
# reindexing against every party gives them an explicit 0 in one step instead
# of growing the frame row by row.
wins_by_party = (
    winners.groupby('party')
    .agg({'state':'count'})
    .reindex(candidates_by_party.index, fill_value=0)
)
# state_x = candidates fielded, state_y = seats won; keep the 15 parties with most wins.
# The copy makes the result independent so adding 'ratio' does not write to a slice.
new_df1 = (
    pd.merge(candidates_by_party, wins_by_party, left_index=True, right_index=True)
    .sort_values(by='state_y',ascending=False)
    .head(15)
    .copy()
)
# Conversion ratio: percentage of fielded candidates that won, to 2 decimals.
new_df1['ratio'] = (new_df1['state_y'] / new_df1['state_x'] * 100).round(2)

In [None]:
# Show the per-party table: candidates fielded (state_x), seats won (state_y) and ratio.
new_df1

#Finally, let us see how many candidates stood for election vs how many were elected,
#for the parties with the most candidates standing
#for election.

In [None]:
# Overlaid bars of candidates fielded (state_x) and seats won (state_y) per
# party, coloured by the conversion ratio.
axis_labels = {
    'party':'Winning Parties with total no. of candidates and conversion Ratio',
    'value':'Total no. of winning vs Total no. of Candidates',
    'ratio':'Conversion Ratio',
}
px.bar(
    new_df1,
    y=['state_x','state_y'],
    color='ratio',
    barmode='overlay',
    opacity=0.7,
    labels=axis_labels,
)

#From above, we can see DMK and LJP had huge success with all their candidates getting elected. INC, BSP, CPI and IND, despite
#heavy campaigning, failed to gain much support. The rest did fairly decently, i.e. more than a 50% conversion ratio.

#Conclusions drawn:

1) Many of the elected members were very educated.

2) A lot of elected candidates had criminal cases, which doesn't reflect a good image.

3) Liabilities tend to go with a higher success rate than assets, which was a surprising insight!

4) BJP had the highest conversion ratio. It is also notable that many local parties were largely supported by
    local supporters.

5) For BJP, who had the maximum elected candidates, it can be seen that their major focus of development was in cities, and thus rural areas supported local parties more.