In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Recursively list every file under the Kaggle input directory, one full path per line.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# **Understanding Data**

candidate: Names of the candidates participating in the election.

constituency: Areas where elections are being held.

party: Names of the parties participating in the election.

criminal_cases: Criminal cases against candidates if there is any.

education: Education level of candidates participating in election.

total_assets: Total assets of candidates.

liabilities: Total liabilities of candidates.


After analysing the data we can see that **BJP** is in the lead in most lists.

Only **6** candidates have **Doctorate** which is the highest educational achievement.

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go
from plotly import tools
from plotly.subplots import make_subplots
from itertools import cycle


In [None]:
# Read the West Bengal election CSV into the working DataFrame and preview the first rows.
csv_path = '/kaggle/input/west-bengal-election-data/west_bengal.csv'
df = pd.read_csv(csv_path)
df.head()

In [None]:
# (number of rows, number of columns) of the loaded frame.
df.shape

In [None]:
# Print column names, dtypes, non-null counts and memory usage.
df.info()

In [None]:
# Summary statistics (pandas describes only the numeric columns by default when any exist).
df.describe()

In [None]:
# Count of missing values in each column.
df.isna().sum()

In [None]:
# Number of distinct candidate names (missing values are not counted).
df['candidate'].nunique()

In total, 561 candidates are running for the election. 

In [None]:
# Number of distinct constituencies (missing values are not counted).
df['constituency'].nunique()

Elections are being held in 91 constituencies. 




In [None]:
# Bar chart of the number of rows per constituency, drawn on an explicit Axes.
constituency_counts = df['constituency'].value_counts()
ax = constituency_counts.plot.bar(figsize=(40, 8))
ax.set_title('Constituency Name', fontsize=25)

In [None]:
# The ten constituencies that appear most often in the dataset.
constituency = df['constituency'].value_counts().head(10)

# Grey bars with a dark 2px outline.
bar_marker = dict(
    color="rgb(195, 195, 195)",
    line=dict(color="rgb(20, 20, 20)", width=2),
)

fig = px.bar(
    x=constituency.index,
    y=constituency,
    title='Top 10 Constituency Name',
    height=500,
    width=700,
)
fig.update_traces(marker=bar_marker)
fig.show()

In [None]:
# Number of distinct parties (missing values are not counted).
df['party'].nunique()

In [None]:
# Number of rows (candidates) per party, largest first.
party = df['party'].value_counts()
# Each party's share of all rows in percent; reuses the counts instead of recomputing them.
party_share = party / len(df) * 100

fig = px.bar(x=party.index, y=party, height=600, width=1100,
             title='Party Distribution', text=party_share)

# '.2f' is fixed-point with two decimals. The former '.4s' is d3's SI-prefix
# format, which renders values below 1 with a milli suffix (0.18 -> '180.0m'),
# so small parties were labelled with nonsense like '180.0m%'.
fig.update_traces(
    textposition='outside',
    texttemplate='%{text:.2f}%',
    marker=dict(color="rgba(255, 0, 0, 0.7)",
                line=dict(color="rgb(20, 20, 20)", width=2)),
)
fig.show()

Parties with the most candidates are shown here with **IND** in the lead, followed by **BJP** and **AITC** in second position. 

In [None]:
# Density histogram of criminal-case counts with a KDE overlay.
# `sns.distplot` is deprecated and scheduled for removal; this is the
# replacement recommended by seaborn (`stat='density'` and `cut=3` mirror
# distplot's defaults).
sns.histplot(x=df['criminal_cases'], kde=True, stat='density', kde_kws=dict(cut=3))

# Highest Criminal Cases

In [None]:
# Interactive histogram of criminal cases; the log-scaled y axis keeps the rare high counts visible.
fig = px.histogram(
    x=df['criminal_cases'],
    nbins=70,
    log_y=True,
    title='Criminal Cases Distribution',
    height=500,
    width=900,
)
fig.show()

The highest number of criminal cases against a candidate is 27; however, the majority of candidates don't have any criminal cases against them.

# Which Party has the most Criminal Cases

In [None]:
# Total criminal cases per party; show the 15 largest totals with a blue gradient.
(
    df.groupby('party')['criminal_cases']
    .sum()
    .reset_index()
    .sort_values(by='criminal_cases', ascending=False)
    .head(15)
    .style.background_gradient(cmap='Blues')
)

**BJP** is facing highest number of criminal cases of just under **200**. 

**CPI(M)** and **AITC** have **99** and **90** cases respectively, 

The number of cases for the rest of the parties are between **31** to **0**. 

# Which Candidate has the most Criminal Cases

In [None]:
# Keep rows whose criminal_cases value is not 0, then total the cases per
# (candidate, party) pair and rank them from most to fewest.
nonzero_cases = df['criminal_cases'] != 0
crime = (
    df[nonzero_cases]
    .groupby(['candidate', 'party'])['criminal_cases']
    .sum()
    .reset_index()
    .sort_values(by='criminal_cases', ascending=False)
)
crime.style.background_gradient(cmap='pink_r')

**Candidate Barun Pramanik** (Chitta) of the **BJP** has **27** criminal cases against him, the **highest** in the list. 

**Himangshu Das** of **CPI(M)** follows with **24** and 

**Bharati Ghosh** of **BJP** has **19**. 

# Education Level

In [None]:
# Slice colours as hex RGBA strings. The sixth entry previously lacked its
# leading '#', which made it an invalid colour.
line_colors = ['#440154FF','#481567FF','#482677FF','#453781FF','#404788FF','#39568CFF','#33638DFF','#2D708EFF','#287D8EFF','#238A8DFF','#1F968BFF']

# Count each education level once and reuse the result for both values and labels.
education_counts = df['education'].value_counts()

fig = px.pie(values=education_counts, names=education_counts.index, height=500)

# Donut chart with black slice outlines; `pull` offsets the third slice for emphasis.
fig.update_traces(pull=[0.0,0.0,0.2,0.0], hole=.3, hoverinfo="label+percent", marker=dict(colors=line_colors, line=dict(color='black', width=2)))

# `layout.titlefont` is a deprecated alias; set the title font through `title.font` instead.
fig.update_layout(title_text='Education Distribution', title_font_size=20)

fig.show()

Looking at the education level of competing candidates, just over **one fourth** of total candidates are **graduates**. 

The percentage of **12th** pass and Post graduates are nearly the same at **18.7%** and **18.5%** respectively.

**16.6** percent have passed **10th** grade while almost **one tenth** studied till grade **8**. 

The **highest** level of educational qualification, a **Doctorate**, is achieved by less than **2%** of the candidates, and almost **6** percent are **graduate** **professionals**. 

Each of the **remaining educational** achievements falls **below 1.5** percent. 

# Which Party has the most Educated Candidates

In [None]:
# Count rows for every (party, education) combination and list the largest groups first.
education_by_party = df.groupby(['party', 'education'])['education']
edu_party = education_by_party.count().reset_index(name='count')
edu_party = edu_party.sort_values(by='count', ascending=False)
edu_party.style.background_gradient(cmap='Pastel2')

**AITC** party has **3** candidates with a **Doctorate**, which is the highest amongst all parties.

Highest number of **graduates** and **post graduates** belong to **BJP** along with **2 Doctorates**.

# Total Assets Distribution

In [None]:
# Density histogram of total assets with a KDE overlay.
# `sns.distplot` is deprecated and scheduled for removal; this is the
# replacement recommended by seaborn (`stat='density'` and `cut=3` mirror
# distplot's defaults).
sns.histplot(x=df['total_assets'], kde=True, stat='density', kde_kws=dict(cut=3))

In [None]:
# Interactive histogram of total assets; the y axis is log-scaled.
fig = px.histogram(
    df['total_assets'],
    nbins=90,
    log_y=True,
    title='Total Assets Distribution',
    height=500,
    width=900,
)
fig.show()

Majority of candidates have total assets ranging between 0 to 89 million, but only one candidate has the highest assets worth more than 400 million.

# Which party has the Highest Assets

In [None]:
# Sum of total assets per party, richest party first.
(
    df.groupby('party')['total_assets']
    .sum()
    .reset_index()
    .sort_values(by='total_assets', ascending=False)
)

**AITC** tops the ranking of parties with the highest total assets at **1.3 billion dollars**, followed by **BJP** with **1.2 billion**.

**IND** is in third place with total assets of **679 million**. 

The lowest ranking party **JMM** has a total of **7000 dollars** worth of assets. 

# Which Candidate Possesses most Assets

In [None]:
# Total assets per (candidate, party) pair; keep the 50 wealthiest entries.
assets_per_candidate = df.groupby(['candidate', 'party'])['total_assets'].sum().reset_index()
can_assest = assets_per_candidate.sort_values(by='total_assets', ascending=False).head(50)
can_assest.style.background_gradient(cmap='BuPu')

With assets worth **430 million**, **Samsul Huda Laskar** of **IND** tops the ranks of candidates owning highest amount of assets.

**BJP** candidates reserve the **second**, **third** and **fourth** spots. 

**Pradip Sarkar** is last on the list with only **17 million** worth of assets. 

# Liabilities Distribution

In [None]:
# Density histogram of liabilities with a KDE overlay.
# `sns.distplot` is deprecated and scheduled for removal; this is the
# replacement recommended by seaborn (`stat='density'` and `cut=3` mirror
# distplot's defaults).
sns.histplot(x=df['liabilities'], kde=True, stat='density', kde_kws=dict(cut=3))

In [None]:
# Interactive histogram of liabilities with a log-scaled y axis.
# A title is added so the figure stands alone, matching the other histograms in this notebook.
fig = px.histogram(df['liabilities'], height=500, width=900, log_y=True,
                   title='Liabilities Distribution')
fig.show()

# Which Party Has the Most Liabilities

In [None]:
# Total liabilities per party, most indebted party first.
party_liability = (
    df.groupby(['party'])['liabilities']
    .sum()
    .reset_index()
    .sort_values(by='liabilities', ascending=False)
)
party_liability.style.background_gradient(cmap='Reds_r')

**AITC** has over **214 million** liabilities followed by **174 million** of **BJP**. 

Next on the list, **IND** has **45 million**, whereas **CPI(M)** has **17 million**.

There are **11** parties with **zero** liabilities. 

# Which Candidate has the Most Liabilities

In [None]:
# Total liabilities per (candidate, party) pair; keep the 50 largest.
liabilities_per_candidate = df.groupby(['candidate', 'party'])['liabilities'].sum().reset_index()
can_liability = liabilities_per_candidate.sort_values(by='liabilities', ascending=False).head(50)
can_liability.style.background_gradient(cmap='RdPu_r')

Although **AITC** has the highest total amount of liabilities, the **candidate** with the most liabilities, **Dipankar Jana**, belongs to **BJP**. He has to pay over 42 million in liabilities.

He is followed by **Arup Chakraborty** at **39 million**. Overall, **AITC** has **24 candidates** on the list, **BJP** has **16**, **IND** has **4**, and **CPI(M)** has **4**. 

There is only **1** **INC** candidate, **Chiranjib Bhowmik**, who is at the bottom of the list with only 2 million in payable liabilities. 

# Liabilities & Total Assets Of Candidates

In [None]:
# Liabilities and total assets summed per (candidate, party) pair, ordered by
# liabilities and then total assets (both descending); keep the top 50.
money_columns = ['liabilities', 'total_assets']
can_assets = (
    df.groupby(['candidate', 'party'])[money_columns]
    .sum()
    .reset_index()
    .sort_values(by=['liabilities', 'total_assets'], ascending=[False, False])
    .head(50)
)
can_assets.style.background_gradient(cmap='RdPu_r')

Comparing the candidates with the highest liabilities against the total assets they own, we can see that **Dipankar Jana**, who has the most liabilities to pay, has assets worth **144 million**. 

Second on the list, **Arup Chakraborty** has **88 million** assets with **39 million liabilities**. 

**Samsul Huda Laskar**, who has the most assets out of all the candidates, has liabilities of **5 million**. 

The second-highest asset owner, **Bharati Ghosh**, has over **7 million** in liabilities to pay. 

# Correlation Heatmap

In [None]:
# Pairwise correlation of the numeric columns, annotated with the coefficients.
# Numeric columns are selected explicitly because `DataFrame.corr()` raises on
# string columns in pandas >= 2.0, where non-numeric columns are no longer
# dropped silently; `select_dtypes` works on older and newer pandas alike.
numeric_df = df.select_dtypes(include='number')
sns.heatmap(numeric_df.corr(), annot=True, cmap='viridis')

There is a linear correlation between liabilities and total assets.

In [None]:
# Scatter of total assets against liabilities with an OLS trendline.
# Passing only the `total_assets` Series made plotly express plot it against
# the row index, so the trendline was fitted to row order rather than to
# liabilities. Both columns are now mapped explicitly; the colour encoding of
# liabilities is dropped because that variable is already on the x axis.
fig = px.scatter(df, x='liabilities', y='total_assets', height=500, width=700, trendline='ols')
fig.show()

In [None]:
# Static scatter of total assets against liabilities with a fitted regression line, drawn in green.
sns.regplot(data=df, x='liabilities', y='total_assets', color='g')