### Testing only on 2014 dataset

In [1]:
import numpy as np
import pandas as pd
import os
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm

from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.feature_selection import SelectKBest, chi2

In [2]:
dataset = pd.read_csv('Datasets/combined2014.csv')
dataset

Unnamed: 0.1,Unnamed: 0,Candidate,Party,CriminalCases,Education,Age,Gender,TotalAssets,Liabilities,Winner,State,Constituency,Year,Category,Votes,ValidVotes,VoteSharePercentage,PartyID
0,0,bishnupadaray,BJP,1,Graduate,64,M,5658980.0,175115,1,andaman_and_nicobar_islands,andaman_and_nicobar_islands,2014,GEN,90969,190328,47.80,1605
1,1,anitamondal,AITC,1,Graduate,48,F,5471073.0,6000,0,andaman_and_nicobar_islands,andaman_and_nicobar_islands,2014,GEN,2283,190328,1.20,18228
2,2,apandian,AIFB,0,12th Pass,41,M,859410.0,350000,0,andaman_and_nicobar_islands,andaman_and_nicobar_islands,2014,GEN,225,190328,0.12,3040
3,3,cgsajikumar,IND,0,12th Pass,41,M,50000.0,0,0,andaman_and_nicobar_islands,andaman_and_nicobar_islands,2014,GEN,334,190328,0.18,10809
4,4,kgdas,CPI(M),0,12th Pass,70,M,4221392.0,0,0,andaman_and_nicobar_islands,andaman_and_nicobar_islands,2014,GEN,1777,190328,0.93,14635
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5530,5530,minatisarkar,SUCI(C),0,Post Graduate,54,F,275942.0,0,0,west_bengal,uluberia,2014,GEN,2902,1186027,0.24,8082
5531,5531,rameshdhara,IND,1,10th Pass,58,M,540917.0,150000,0,west_bengal,uluberia,2014,SC,2529,1186027,0.21,10809
5532,5532,ranjitkishoremohanty,BJP,0,Post Graduate,68,M,42413560.0,1362846,0,west_bengal,uluberia,2014,GEN,137137,1186027,11.56,1605
5533,5533,rekhadas,BSP,0,8th Pass,38,F,27477.0,0,0,west_bengal,uluberia,2014,SC,3918,1186027,0.33,16651


In [3]:
dataset.drop("Unnamed: 0", axis=1, inplace=True)

In [4]:
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5535 entries, 0 to 5534
Data columns (total 17 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   Candidate            5535 non-null   object 
 1   Party                5535 non-null   object 
 2   CriminalCases        5535 non-null   int64  
 3   Education            5535 non-null   object 
 4   Age                  5535 non-null   int64  
 5   Gender               5532 non-null   object 
 6   TotalAssets          5487 non-null   float64
 7   Liabilities          5535 non-null   int64  
 8   Winner               5535 non-null   int64  
 9   State                5535 non-null   object 
 10  Constituency         5535 non-null   object 
 11  Year                 5535 non-null   int64  
 12  Category             5530 non-null   object 
 13  Votes                5535 non-null   int64  
 14  ValidVotes           5535 non-null   int64  
 15  VoteSharePercentage  5535 non-null   f

In [5]:
dataset.isna().sum()

Candidate               0
Party                   0
CriminalCases           0
Education               0
Age                     0
Gender                  3
TotalAssets            48
Liabilities             0
Winner                  0
State                   0
Constituency            0
Year                    0
Category                5
Votes                   0
ValidVotes              0
VoteSharePercentage     0
PartyID                 0
dtype: int64

In [6]:
dataset["Party"].value_counts('')

IND                               2129
BSP                                333
BJP                                329
INC                                317
AAP                                275
                                  ... 
Atulya Bharat Party                  1
Bhartiya Jan Manch                   1
Rashtriya Jankranti Party            1
Poorvanchal Rashtriya Congress       1
Nirjatita Samaj Biplabi Party        1
Name: Party, Length: 388, dtype: int64

In [7]:
dataset['TotalAssets'].fillna(0, inplace=True) # Fill NaN values of Candidates with 0

In [8]:
#dataset.dropna(inplace=True) # Code to drop na values

In [9]:
dataset[dataset["Category"].isna()]

Unnamed: 0,Candidate,Party,CriminalCases,Education,Age,Gender,TotalAssets,Liabilities,Winner,State,Constituency,Year,Category,Votes,ValidVotes,VoteSharePercentage,PartyID
2860,kulamaniurma,Paschimanchal Vikas Party,0,8th Pass,52,,153000.0,0,0,orissa,bargarh,2014,,5114,764874,0.67,345
4167,sarveysathyanarayana,INC,2,Graduate Professional,60,M,32275736.0,4195312,0,telangana,malkajgiri,2014,,156315,1026732,15.22,3482
5238,debeshdas,CPI(M),0,Graduate Professional,54,,4546171.0,0,0,west_bengal,bangaon,2014,,328214,1236222,26.55,14635
5241,pranitamandal,IND,0,Graduate Professional,46,,290970.0,0,0,west_bengal,bangaon,2014,,8598,1236222,0.7,10809
5534,sabiruddinmolla,CPI(M),0,Graduate,39,M,178532.0,0,0,west_bengal,uluberia,2014,,138892,1199494,11.58,14635


In [10]:
dataset.reset_index(inplace=True, drop=True)

In [11]:
dataset.Category  = dataset.Category.astype("str")

In [12]:
# label encode categorical columns

lblEncoder_state = LabelEncoder()
lblEncoder_state.fit(dataset['State'])
dataset['State'] = lblEncoder_state.transform(dataset['State'])

lblEncoder_cons = LabelEncoder()
lblEncoder_cons.fit(dataset['Constituency'])
dataset['Constituency'] = lblEncoder_cons.transform(dataset['Constituency'])

# lblEncoder_name = LabelEncoder()
# lblEncoder_name.fit(dataset['Candidate'])
# dataset['Candidate'] = lblEncoder_name.transform(dataset['Candidate'])

lblEncoder_party = LabelEncoder()
lblEncoder_party.fit(dataset['Party'])
dataset['Party'] = lblEncoder_party.transform(dataset['Party'])

lblEncoder_category = LabelEncoder()
lblEncoder_category.fit(dataset['Category'])
dataset['Category'] = lblEncoder_category.transform(dataset['Category'])

lblEncoder_edu = LabelEncoder()
lblEncoder_edu.fit(dataset['Education'])
dataset['Education'] = lblEncoder_edu.transform(dataset['Education'])

lblEncoder_gen = LabelEncoder()
lblEncoder_gen.fit(dataset['Gender'])
dataset['Gender'] = lblEncoder_gen.transform(dataset['Gender'])

In [13]:
dataset.drop(["Candidate", "Year", "Votes", "ValidVotes", "PartyID", "VoteSharePercentage"], inplace=True, axis=1)

In [14]:
# separate train features and label
y = dataset["Winner"]
X = dataset.drop(labels=["Winner"], axis=1)
# split dataset into train and test data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1, stratify=y)
# train and test knn model
knn = KNeighborsClassifier()
knn.fit(X_train, y_train)
knn.predict(X_test)
print("Testing Accuracy is: ", knn.score(X_test, y_test)*100, "%")

Testing Accuracy is:  90.33423667570008 %


In [15]:
# scaling values into 0-1 range
scaler = MinMaxScaler(feature_range=(0, 1))
features = ['State', 'Constituency', 'Party', 'CriminalCases', 'Age', 'Category', 'Education', 'TotalAssets', 
            'Liabilities']
dataset[features] = scaler.fit_transform(dataset[features])

In [16]:
# separate train features and label
y = dataset["Winner"]
X = dataset.drop(labels=["Winner"], axis=1)
# split dataset into train and test data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1, stratify=y)
# train and test knn model
knn = KNeighborsClassifier()
knn.fit(X_train, y_train)
knn.predict(X_test)
print("Testing Accuracy is: ", knn.score(X_test, y_test)*100, "%")

Testing Accuracy is:  91.32791327913279 %


### Using on combined dataset - 1st Pass

In [17]:
import numpy as np
import pandas as pd
import os
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.feature_selection import SelectKBest, chi2

In [18]:
dataset = pd.read_csv('Datasets/combined_all.csv')
dataset

Unnamed: 0.1,Unnamed: 0,Candidate,Party,CriminalCases,Education,Age,Gender,TotalAssets,Liabilities,Winner,State,Constituency,Year,Category,Votes,ValidVotes,VoteSharePercentage,PartyID
0,0,manoranjanbhakta,INC,0,Post Graduate,65,M,5926740.0,272061,1,andaman_and_nicobar_islands,andaman_and_nicobar_islands,2004,GEN,85794,153825,55.77,3482
1,1,akbiswas,BSP,0,Graduate,61,M,7876500.0,65000,0,andaman_and_nicobar_islands,andaman_and_nicobar_islands,2004,GEN,1122,153825,0.73,16651
2,2,asitbarandutta,IND,0,Graduate,50,M,3070000.0,0,0,andaman_and_nicobar_islands,andaman_and_nicobar_islands,2004,GEN,523,153825,0.34,10809
3,3,bishnupadaray,BJP,0,Graduate,54,M,1250619.0,128710,0,andaman_and_nicobar_islands,andaman_and_nicobar_islands,2004,GEN,55294,153825,35.95,1605
4,4,deepakbiswas,IND,0,Not Given,28,M,3000.0,0,0,andaman_and_nicobar_islands,andaman_and_nicobar_islands,2004,GEN,1186,153825,0.77,10809
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20243,5497,durgadashajra,IND,0,12th Pass,62,M,1325108.0,0,0,west_bengal,uluberia,2019,,6770,1311099,0.52,10809
20244,5498,joybanerjee,BJP,2,Graduate,56,M,246933.0,0,0,west_bengal,uluberia,2019,GEN,479586,1311099,36.58,1605
20245,5499,maksudakhatun,CPI(M),0,Doctorate,58,F,12578736.0,2876353,0,west_bengal,uluberia,2019,,81314,1311099,6.20,14635
20246,5500,minatisarkar,SUCI(C),0,Graduate,61,F,1091556.0,0,0,west_bengal,uluberia,2019,GEN,1697,1311099,0.13,8082


In [19]:
dataset.drop("Unnamed: 0", axis=1, inplace=True)

In [20]:
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20248 entries, 0 to 20247
Data columns (total 17 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   Candidate            20248 non-null  object 
 1   Party                20248 non-null  object 
 2   CriminalCases        20248 non-null  int64  
 3   Education            20248 non-null  object 
 4   Age                  20248 non-null  int64  
 5   Gender               20234 non-null  object 
 6   TotalAssets          19628 non-null  float64
 7   Liabilities          20248 non-null  int64  
 8   Winner               20248 non-null  int64  
 9   State                20248 non-null  object 
 10  Constituency         20248 non-null  object 
 11  Year                 20248 non-null  int64  
 12  Category             16918 non-null  object 
 13  Votes                20248 non-null  int64  
 14  ValidVotes           20248 non-null  int64  
 15  VoteSharePercentage  20248 non-null 

In [21]:
dataset.isna().sum()

Candidate                 0
Party                     0
CriminalCases             0
Education                 0
Age                       0
Gender                   14
TotalAssets             620
Liabilities               0
Winner                    0
State                     0
Constituency              0
Year                      0
Category               3330
Votes                     0
ValidVotes                0
VoteSharePercentage       0
PartyID                   0
dtype: int64

In [22]:
dataset.groupby("Category").count()

Unnamed: 0_level_0,Candidate,Party,CriminalCases,Education,Age,Gender,TotalAssets,Liabilities,Winner,State,Constituency,Year,Votes,ValidVotes,VoteSharePercentage,PartyID
Category,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
GEN,11076,11076,11076,11076,11076,11076,10745,11076,11076,11076,11076,11076,11076,11076,11076,11076
Gen,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1
SC,4421,4421,4421,4421,4421,4421,4205,4421,4421,4421,4421,4421,4421,4421,4421,4421
ST,1420,1420,1420,1420,1420,1420,1376,1420,1420,1420,1420,1420,1420,1420,1420,1420


In [23]:
dataset[dataset["Category"].isna()]

Unnamed: 0,Candidate,Party,CriminalCases,Education,Age,Gender,TotalAssets,Liabilities,Winner,State,Constituency,Year,Category,Votes,ValidVotes,VoteSharePercentage,PartyID
325,ataurrahman,SAP,0,10th Pass,55,M,70000.0,0,0,assam,kaliabor,2004,,12328,862871,1.43,10809
1105,abhimanyu,BJP,0,Graduate,36,M,44375405.0,0,0,haryana,rohtak,2004,,123116,531022,23.18,1605
1121,ramphal,IND,0,10th Pass,56,M,276518.0,0,0,haryana,rohtak,2004,,763,531022,0.14,10809
1552,vssivakumar,INC,0,Post Graduate,43,M,1982109.0,290004,0,kerala,trivandrum,2004,,4467,759237,0.59,10809
1581,narendrasingh,IND,1,8th Pass,0,M,250000.0,0,0,madhya_pradesh,bhind,2004,,254,531022,0.05,10809
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20240,sajdaahmed,AITC,0,Graduate,57,F,28351569.0,5322615,1,west_bengal,uluberia,2019,,694945,1311099,53.00,18228
20242,amalbarman,IND,0,10th Pass,42,M,2437609.0,0,0,west_bengal,uluberia,2019,,1885,1311099,0.14,10809
20243,durgadashajra,IND,0,12th Pass,62,M,1325108.0,0,0,west_bengal,uluberia,2019,,6770,1311099,0.52,10809
20245,maksudakhatun,CPI(M),0,Doctorate,58,F,12578736.0,2876353,0,west_bengal,uluberia,2019,,81314,1311099,6.20,14635


In [24]:
dataset['TotalAssets'].fillna(0, inplace=True) # Fill NaN values of Candidates with 0

In [25]:
#dataset.dropna(inplace=True) # Code to drop na values

In [26]:
dataset["Category"].fillna('GEN', inplace=True)

In [27]:
dataset["Category"].replace(to_replace='Gen', value='GEN', inplace=True)

In [28]:
dataset.reset_index(inplace=True, drop=True)

In [29]:
dataset['Education'].value_counts()

Graduate                 3917
Post Graduate            3404
10th Pass                2864
12th Pass                2685
Graduate Professional    1942
8th Pass                 1537
Not Given                1014
Literate                  840
5th Pass                  788
Others                    545
Doctorate                 473
Illiterate                239
Name: Education, dtype: int64

In [30]:
dataset.Category  =  dataset.Category.astype("str")

In [31]:
# label encode categorical columns

lblEncoder_state = LabelEncoder()
lblEncoder_state.fit(dataset['State'])
dataset['State'] = lblEncoder_state.transform(dataset['State'])

lblEncoder_cons = LabelEncoder()
lblEncoder_cons.fit(dataset['Constituency'])
dataset['Constituency'] = lblEncoder_cons.transform(dataset['Constituency'])

lblEncoder_name = LabelEncoder()
lblEncoder_name.fit(dataset['Candidate'])
dataset['Candidate'] = lblEncoder_name.transform(dataset['Candidate'])

lblEncoder_party = LabelEncoder()
lblEncoder_party.fit(dataset['Party'])
dataset['Party'] = lblEncoder_party.transform(dataset['Party'])

lblEncoder_category = LabelEncoder()
lblEncoder_category.fit(dataset['Category'])
dataset['Category'] = lblEncoder_category.transform(dataset['Category'])

lblEncoder_edu = LabelEncoder()
lblEncoder_edu.fit(dataset['Education'])
dataset['Education'] = lblEncoder_edu.transform(dataset['Education'])

lblEncoder_gen = LabelEncoder()
lblEncoder_gen.fit(dataset['Gender'])
dataset['Gender'] = lblEncoder_gen.transform(dataset['Gender'])

In [32]:
dataset.drop(["Candidate", "Year", "Votes", "ValidVotes", "PartyID", "VoteSharePercentage"], inplace=True, axis=1)

In [33]:
# separate train features and label
y = dataset["Winner"]
X = dataset.drop(labels=["Winner"], axis=1)
# split dataset into train and test data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1, stratify=y)
# train and test knn model
knn = KNeighborsClassifier()
knn.fit(X_train, y_train)
knn.predict(X_test)
print("Testing Accuracy is: ", knn.score(X_test, y_test)*100, "%")

Testing Accuracy is:  88.88888888888889 %


In [34]:
# scaling values into 0-1 range
scaler = MinMaxScaler(feature_range=(0, 1))
features = ['State', 'Constituency', 'Party', 'CriminalCases', 'Age', 'Category', 'Education', 'TotalAssets', 
            'Liabilities']
dataset[features] = scaler.fit_transform(dataset[features])

In [35]:
# separate train features and label
y = dataset["Winner"]
X = dataset.drop(labels=["Winner"], axis=1)
# split dataset into train and test data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1, stratify=y)
# train and test knn model
knn = KNeighborsClassifier()
knn.fit(X_train, y_train)
knn.predict(X_test)
print("Testing Accuracy is: ", knn.score(X_test, y_test)*100, "%")

Testing Accuracy is:  89.60493827160494 %


### Improving Accuracy - 2nd Pass

In [36]:
import numpy as np
import pandas as pd
import os
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.feature_selection import SelectKBest, chi2

In [37]:
dataset = pd.read_csv('Datasets/combined_all.csv')
dataset

Unnamed: 0.1,Unnamed: 0,Candidate,Party,CriminalCases,Education,Age,Gender,TotalAssets,Liabilities,Winner,State,Constituency,Year,Category,Votes,ValidVotes,VoteSharePercentage,PartyID
0,0,manoranjanbhakta,INC,0,Post Graduate,65,M,5926740.0,272061,1,andaman_and_nicobar_islands,andaman_and_nicobar_islands,2004,GEN,85794,153825,55.77,3482
1,1,akbiswas,BSP,0,Graduate,61,M,7876500.0,65000,0,andaman_and_nicobar_islands,andaman_and_nicobar_islands,2004,GEN,1122,153825,0.73,16651
2,2,asitbarandutta,IND,0,Graduate,50,M,3070000.0,0,0,andaman_and_nicobar_islands,andaman_and_nicobar_islands,2004,GEN,523,153825,0.34,10809
3,3,bishnupadaray,BJP,0,Graduate,54,M,1250619.0,128710,0,andaman_and_nicobar_islands,andaman_and_nicobar_islands,2004,GEN,55294,153825,35.95,1605
4,4,deepakbiswas,IND,0,Not Given,28,M,3000.0,0,0,andaman_and_nicobar_islands,andaman_and_nicobar_islands,2004,GEN,1186,153825,0.77,10809
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20243,5497,durgadashajra,IND,0,12th Pass,62,M,1325108.0,0,0,west_bengal,uluberia,2019,,6770,1311099,0.52,10809
20244,5498,joybanerjee,BJP,2,Graduate,56,M,246933.0,0,0,west_bengal,uluberia,2019,GEN,479586,1311099,36.58,1605
20245,5499,maksudakhatun,CPI(M),0,Doctorate,58,F,12578736.0,2876353,0,west_bengal,uluberia,2019,,81314,1311099,6.20,14635
20246,5500,minatisarkar,SUCI(C),0,Graduate,61,F,1091556.0,0,0,west_bengal,uluberia,2019,GEN,1697,1311099,0.13,8082


In [38]:
dataset.drop("Unnamed: 0", axis=1, inplace=True)

In [39]:
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20248 entries, 0 to 20247
Data columns (total 17 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   Candidate            20248 non-null  object 
 1   Party                20248 non-null  object 
 2   CriminalCases        20248 non-null  int64  
 3   Education            20248 non-null  object 
 4   Age                  20248 non-null  int64  
 5   Gender               20234 non-null  object 
 6   TotalAssets          19628 non-null  float64
 7   Liabilities          20248 non-null  int64  
 8   Winner               20248 non-null  int64  
 9   State                20248 non-null  object 
 10  Constituency         20248 non-null  object 
 11  Year                 20248 non-null  int64  
 12  Category             16918 non-null  object 
 13  Votes                20248 non-null  int64  
 14  ValidVotes           20248 non-null  int64  
 15  VoteSharePercentage  20248 non-null 

In [40]:
dataset.isna().sum()

Candidate                 0
Party                     0
CriminalCases             0
Education                 0
Age                       0
Gender                   14
TotalAssets             620
Liabilities               0
Winner                    0
State                     0
Constituency              0
Year                      0
Category               3330
Votes                     0
ValidVotes                0
VoteSharePercentage       0
PartyID                   0
dtype: int64

In [41]:
dataset[dataset["Category"].isna()]

Unnamed: 0,Candidate,Party,CriminalCases,Education,Age,Gender,TotalAssets,Liabilities,Winner,State,Constituency,Year,Category,Votes,ValidVotes,VoteSharePercentage,PartyID
325,ataurrahman,SAP,0,10th Pass,55,M,70000.0,0,0,assam,kaliabor,2004,,12328,862871,1.43,10809
1105,abhimanyu,BJP,0,Graduate,36,M,44375405.0,0,0,haryana,rohtak,2004,,123116,531022,23.18,1605
1121,ramphal,IND,0,10th Pass,56,M,276518.0,0,0,haryana,rohtak,2004,,763,531022,0.14,10809
1552,vssivakumar,INC,0,Post Graduate,43,M,1982109.0,290004,0,kerala,trivandrum,2004,,4467,759237,0.59,10809
1581,narendrasingh,IND,1,8th Pass,0,M,250000.0,0,0,madhya_pradesh,bhind,2004,,254,531022,0.05,10809
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20240,sajdaahmed,AITC,0,Graduate,57,F,28351569.0,5322615,1,west_bengal,uluberia,2019,,694945,1311099,53.00,18228
20242,amalbarman,IND,0,10th Pass,42,M,2437609.0,0,0,west_bengal,uluberia,2019,,1885,1311099,0.14,10809
20243,durgadashajra,IND,0,12th Pass,62,M,1325108.0,0,0,west_bengal,uluberia,2019,,6770,1311099,0.52,10809
20245,maksudakhatun,CPI(M),0,Doctorate,58,F,12578736.0,2876353,0,west_bengal,uluberia,2019,,81314,1311099,6.20,14635


In [42]:
dataset['TotalAssets'].fillna(dataset.TotalAssets.mean(), inplace=True) # Fill NaN values of Candidates with mean of TotalAssets

In [43]:
#dataset.dropna(inplace=True) # Code to drop na values
dataset.isna().sum()

Candidate                 0
Party                     0
CriminalCases             0
Education                 0
Age                       0
Gender                   14
TotalAssets               0
Liabilities               0
Winner                    0
State                     0
Constituency              0
Year                      0
Category               3330
Votes                     0
ValidVotes                0
VoteSharePercentage       0
PartyID                   0
dtype: int64

In [44]:
dataset.dropna(inplace=True) # Dropping na values

In [45]:
dataset.reset_index(inplace=True, drop=True)

In [46]:
dataset.drop(["Candidate", "Year", "Votes", "ValidVotes", "PartyID", "VoteSharePercentage"], inplace=True, axis=1)

In [47]:
dataset['Education'].value_counts()

Graduate                 3359
Post Graduate            2889
10th Pass                2292
12th Pass                2159
Graduate Professional    1603
8th Pass                 1216
Not Given                1003
Literate                  719
5th Pass                  644
Others                    439
Doctorate                 425
Illiterate                170
Name: Education, dtype: int64

In [48]:
# encode education column
encoded_edu = []
# iterate through each row in the dataset
for row in dataset.itertuples():
    education = row.Education
    
    if education == "Illiterate":
        encoded_edu.append(0)
    elif education == "Literate":
        encoded_edu.append(1)
    elif education == "5th Pass":
        encoded_edu.append(2)
    elif education == "8th Pass":
        encoded_edu.append(3)
    elif education == "10th Pass":
        encoded_edu.append(4)
    elif education == "12th Pass":
        encoded_edu.append(7)
    elif education == "Graduate":
        encoded_edu.append(8)
    elif education == "Post Graduate":
        encoded_edu.append(9)
    elif education == "Graduate Professional":
        encoded_edu.append(10)
    elif education == "Doctorate":
        encoded_edu.append(11)
    else:
        encoded_edu.append(5)

dataset['Education'] = encoded_edu


In [49]:
dataset['Party'].value_counts()

IND                                       6631
INC                                       1242
BJP                                       1240
BSP                                       1129
SP                                         467
                                          ... 
All J & K Kisan Majdoor Party                1
JKPC                                         1
All Jammu and Kashmir Republican Party       1
Himachal Swabhiman Party                     1
Hindustan Swaraj Congress Party              1
Name: Party, Length: 925, dtype: int64

In [50]:
# change party of the less frequent parties as Other
# 'BJP','INC','IND','BSP', 'CPI(M)', 'AITC', 'MNM': high frequent
# 'TDP', 'VSRCP', 'SP', 'DMK', 'BJD': medium frequent
dataset.loc[~dataset["Party"].isin(['BJP','INC','IND','BSP', 'CPI(M)', 'AITC', 'MNM', 'TDP', 'VSRCP', 'SP', 'DMK', 'BJD']), 
            "Party"] = "Other"
dataset['Party'].value_counts()

IND       6631
Other     5733
INC       1242
BJP       1240
BSP       1129
SP         467
AITC       179
CPI(M)      88
TDP         79
DMK         73
BJD         57
Name: Party, dtype: int64

In [51]:
dataset.Category = dataset.Category.astype("str")

In [52]:
# label encode categorical columns

lblEncoder_state = LabelEncoder()
lblEncoder_state.fit(dataset['State'])
dataset['State'] = lblEncoder_state.transform(dataset['State'])

lblEncoder_cons = LabelEncoder()
lblEncoder_cons.fit(dataset['Constituency'])
dataset['Constituency'] = lblEncoder_cons.transform(dataset['Constituency'])

# lblEncoder_name = LabelEncoder()
# lblEncoder_name.fit(dataset['Candidate'])
# dataset['Candidate'] = lblEncoder_name.transform(dataset['Candidate'])

lblEncoder_party = LabelEncoder()
lblEncoder_party.fit(dataset['Party'])
dataset['Party'] = lblEncoder_party.transform(dataset['Party'])

lblEncoder_category = LabelEncoder()
lblEncoder_category.fit(dataset['Category'])
dataset['Category'] = lblEncoder_category.transform(dataset['Category'])

lblEncoder_category = LabelEncoder()
lblEncoder_category.fit(dataset['Gender'])
dataset['Gender'] = lblEncoder_category.transform(dataset['Gender'])

lblEncoder_edu = LabelEncoder()
lblEncoder_edu.fit(dataset['Education'])
dataset['Education'] = lblEncoder_edu.transform(dataset['Education'])

In [53]:
# separate train features and label
y = dataset["Winner"]
X = dataset.drop(labels=["Winner"], axis=1)
# split dataset into train and test data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1, stratify=y)
# train and test knn model
knn = KNeighborsClassifier()
knn.fit(X_train, y_train)
knn.predict(X_test)
print("Testing Accuracy is: ", knn.score(X_test, y_test)*100, "%")

Testing Accuracy is:  88.12056737588652 %


In [54]:
# scaling values into 0-1 range
scaler = MinMaxScaler(feature_range=(0, 1))
features = ['State', 'Constituency', 'Party', 'CriminalCases', 'Age', 'Category', 'Education', 'TotalAssets', 
            'Liabilities']
dataset[features] = scaler.fit_transform(dataset[features])

In [55]:
# separate train features and label
y = dataset["Winner"]
X = dataset.drop(labels=["Winner"], axis=1)
# split dataset into train and test data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1, stratify=y)
# train and test knn model
knn = KNeighborsClassifier()
knn.fit(X_train, y_train)
knn.predict(X_test)
print("Testing Accuracy is: ", knn.score(X_test, y_test)*100, "%")

Testing Accuracy is:  89.42080378250591 %


In [56]:
# apply SelectKBest class to extract top most features

bestfeatures = SelectKBest(score_func=chi2, k='all')
fit = bestfeatures.fit(X, y)
dfscores = pd.DataFrame(fit.scores_)
dfcolumns = pd.DataFrame(X.columns)

# concat two dataframes for better visualization 
featureScores = pd.concat([dfcolumns, dfscores], axis=1)
featureScores.columns = ['Specs', 'Score']
print(featureScores.nlargest(30, 'Score'))

           Specs       Score
0          Party  104.155477
2      Education   52.422527
6    Liabilities   21.649513
9       Category   20.612833
3            Age   19.441892
1  CriminalCases   10.324129
5    TotalAssets    2.678589
4         Gender    2.112150
7          State    1.424647
8   Constituency    0.032332


# Training Logistic Regression 

In [132]:
import numpy as np
import pandas as pd
import os
import matplotlib.pyplot as plt
import seaborn as sns


from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.feature_selection import SelectKBest, chi2, f_classif

In [84]:
dataset = pd.read_csv('Datasets/combined_all.csv')
dataset

Unnamed: 0.1,Unnamed: 0,Candidate,Party,CriminalCases,Education,Age,Gender,TotalAssets,Liabilities,Winner,State,Constituency,Year,Category,Votes,ValidVotes,VoteSharePercentage,PartyID
0,0,manoranjanbhakta,INC,0,Post Graduate,65,M,5926740.0,272061,1,andaman_and_nicobar_islands,andaman_and_nicobar_islands,2004,GEN,85794,153825,55.77,3482
1,1,akbiswas,BSP,0,Graduate,61,M,7876500.0,65000,0,andaman_and_nicobar_islands,andaman_and_nicobar_islands,2004,GEN,1122,153825,0.73,16651
2,2,asitbarandutta,IND,0,Graduate,50,M,3070000.0,0,0,andaman_and_nicobar_islands,andaman_and_nicobar_islands,2004,GEN,523,153825,0.34,10809
3,3,bishnupadaray,BJP,0,Graduate,54,M,1250619.0,128710,0,andaman_and_nicobar_islands,andaman_and_nicobar_islands,2004,GEN,55294,153825,35.95,1605
4,4,deepakbiswas,IND,0,Not Given,28,M,3000.0,0,0,andaman_and_nicobar_islands,andaman_and_nicobar_islands,2004,GEN,1186,153825,0.77,10809
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20243,5497,durgadashajra,IND,0,12th Pass,62,M,1325108.0,0,0,west_bengal,uluberia,2019,,6770,1311099,0.52,10809
20244,5498,joybanerjee,BJP,2,Graduate,56,M,246933.0,0,0,west_bengal,uluberia,2019,GEN,479586,1311099,36.58,1605
20245,5499,maksudakhatun,CPI(M),0,Doctorate,58,F,12578736.0,2876353,0,west_bengal,uluberia,2019,,81314,1311099,6.20,14635
20246,5500,minatisarkar,SUCI(C),0,Graduate,61,F,1091556.0,0,0,west_bengal,uluberia,2019,GEN,1697,1311099,0.13,8082


In [85]:
dataset.drop(["Unnamed: 0"],axis = 1,inplace = True)

In [86]:
dataset.head()

Unnamed: 0,Candidate,Party,CriminalCases,Education,Age,Gender,TotalAssets,Liabilities,Winner,State,Constituency,Year,Category,Votes,ValidVotes,VoteSharePercentage,PartyID
0,manoranjanbhakta,INC,0,Post Graduate,65,M,5926740.0,272061,1,andaman_and_nicobar_islands,andaman_and_nicobar_islands,2004,GEN,85794,153825,55.77,3482
1,akbiswas,BSP,0,Graduate,61,M,7876500.0,65000,0,andaman_and_nicobar_islands,andaman_and_nicobar_islands,2004,GEN,1122,153825,0.73,16651
2,asitbarandutta,IND,0,Graduate,50,M,3070000.0,0,0,andaman_and_nicobar_islands,andaman_and_nicobar_islands,2004,GEN,523,153825,0.34,10809
3,bishnupadaray,BJP,0,Graduate,54,M,1250619.0,128710,0,andaman_and_nicobar_islands,andaman_and_nicobar_islands,2004,GEN,55294,153825,35.95,1605
4,deepakbiswas,IND,0,Not Given,28,M,3000.0,0,0,andaman_and_nicobar_islands,andaman_and_nicobar_islands,2004,GEN,1186,153825,0.77,10809


In [87]:
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20248 entries, 0 to 20247
Data columns (total 17 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   Candidate            20248 non-null  object 
 1   Party                20248 non-null  object 
 2   CriminalCases        20248 non-null  int64  
 3   Education            20248 non-null  object 
 4   Age                  20248 non-null  int64  
 5   Gender               20234 non-null  object 
 6   TotalAssets          19628 non-null  float64
 7   Liabilities          20248 non-null  int64  
 8   Winner               20248 non-null  int64  
 9   State                20248 non-null  object 
 10  Constituency         20248 non-null  object 
 11  Year                 20248 non-null  int64  
 12  Category             16918 non-null  object 
 13  Votes                20248 non-null  int64  
 14  ValidVotes           20248 non-null  int64  
 15  VoteSharePercentage  20248 non-null 

In [88]:
dataset['TotalAssets'].fillna(dataset.TotalAssets.mean(), inplace=True) # Fill NaN values of Candidates with 0

In [89]:
dataset.dropna(inplace =  True)

In [90]:
dataset.reset_index(inplace=True, drop=True)

In [91]:
# encode education column
encoded_edu = []
# iterate through each row in the dataset
for row in dataset.itertuples():
    education = row.Education
    
    if education == "Illiterate":
        encoded_edu.append(0)
    elif education == "Literate":
        encoded_edu.append(1)
    elif education == "5th Pass":
        encoded_edu.append(2)
    elif education == "8th Pass":
        encoded_edu.append(3)
    elif education == "10th Pass":
        encoded_edu.append(4)
    elif education == "12th Pass":
        encoded_edu.append(7)
    elif education == "Graduate":
        encoded_edu.append(8)
    elif education == "Post Graduate":
        encoded_edu.append(9)
    elif education == "Graduate Professional":
        encoded_edu.append(10)
    elif education == "Doctorate":
        encoded_edu.append(11)
    else:
        encoded_edu.append(5)

dataset['Education'] = encoded_edu


In [92]:
# change party of the less frequent parties as Other
# 'BJP','INC','IND','BSP', 'CPI(M)', 'AITC', 'MNM': high frequent
# 'TDP', 'VSRCP', 'SP', 'DMK', 'BJD': medium frequent
dataset.loc[~dataset["Party"].isin(['BJP','INC','IND','BSP', 'CPI(M)', 'AITC', 'MNM', 'TDP', 'VSRCP', 'SP', 'DMK', 'BJD']), 
            "Party"] = "Other"
dataset['Party'].value_counts()

IND       6631
Other     5733
INC       1242
BJP       1240
BSP       1129
SP         467
AITC       179
CPI(M)      88
TDP         79
DMK         73
BJD         57
Name: Party, dtype: int64

In [93]:
dataset.Category = dataset.Category.astype("str")

In [94]:
# label encode categorical columns

lblEncoder_state = LabelEncoder()
lblEncoder_state.fit(dataset['State'])
dataset['State'] = lblEncoder_state.transform(dataset['State'])

lblEncoder_cons = LabelEncoder()
lblEncoder_cons.fit(dataset['Constituency'])
dataset['Constituency'] = lblEncoder_cons.transform(dataset['Constituency'])

# lblEncoder_name = LabelEncoder()
# lblEncoder_name.fit(dataset['Candidate'])
# dataset['Candidate'] = lblEncoder_name.transform(dataset['Candidate'])

lblEncoder_party = LabelEncoder()
lblEncoder_party.fit(dataset['Party'])
dataset['Party'] = lblEncoder_party.transform(dataset['Party'])

lblEncoder_category = LabelEncoder()
lblEncoder_category.fit(dataset['Category'])
dataset['Category'] = lblEncoder_category.transform(dataset['Category'])

lblEncoder_edu = LabelEncoder()
lblEncoder_edu.fit(dataset['Education'])
dataset['Education'] = lblEncoder_edu.transform(dataset['Education'])

lblEncoder_gen = LabelEncoder()
lblEncoder_gen.fit(dataset['Gender'])
dataset['Gender'] = lblEncoder_gen.transform(dataset['Gender'])

In [95]:
dataset.drop(["Candidate", "Year", "Votes", "ValidVotes", "PartyID", "VoteSharePercentage"], inplace=True, axis=1)


In [96]:
# separate train features and label
y = dataset["Winner"]
X = dataset.drop(labels=["Winner"], axis=1)
# split dataset into train and test data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1, stratify=y)
solver_list = ['liblinear', 'newton-cg', 'lbfgs', 'sag', 'saga']
params = dict(solver=solver_list)
log_reg = LogisticRegression(C=1, n_jobs=-1, random_state=34)
clf = GridSearchCV(log_reg, params, cv=5)
clf.fit(X_train, y_train)
scores = clf.cv_results_['mean_test_score']

for score, solver in zip(scores, solver_list):
    print(f"Test Accuracy with {solver}: -  {score*100:.3f}" )



Test Accuracy with liblinear: -  89.138
Test Accuracy with newton-cg: -  89.138
Test Accuracy with lbfgs: -  87.823
Test Accuracy with sag: -  88.067
Test Accuracy with saga: -  88.348


In [97]:
# scaling values into 0-1 range
scaler = MinMaxScaler(feature_range=(0, 1))
features = ['State', 'Constituency', 'Party', 'CriminalCases', 'Age', 'Category', 'Education', 'TotalAssets', 
            'Liabilities']
dataset[features] = scaler.fit_transform(dataset[features])

In [123]:
# separate train features and label
y = dataset["Winner"]
y=y.astype('int')
X = dataset.drop(labels=["Winner"], axis=1)
# split dataset into train and test data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=34, stratify=y)
solver_list = ['liblinear', 'newton-cg', 'lbfgs', 'sag', 'saga']
params = dict(solver=solver_list)
log_reg = LogisticRegression(C=1, n_jobs=-1, random_state=34)
clf = GridSearchCV(log_reg, params, cv=5)
clf.fit(X_train, y_train)
scores = clf.cv_results_['mean_test_score']

for score, solver in zip(scores, solver_list):
    print(f"Test Accuracy with {solver}: -  {score*100:.3f}" )



Test Accuracy with liblinear: -  89.567
Test Accuracy with newton-cg: -  89.597
Test Accuracy with lbfgs: -  89.597
Test Accuracy with sag: -  89.597
Test Accuracy with saga: -  89.597


In [133]:
# apply SelectKBest class to extract top most features

bestfeatures = SelectKBest(score_func=f_classif, k='all')
fit = bestfeatures.fit(X, y)
dfscores = pd.DataFrame(fit.scores_)
dfcolumns = pd.DataFrame(X.columns)

# concat two dataframes for better visualization 
featureScores = pd.concat([dfcolumns, dfscores], axis=1)
featureScores.columns = ['Specs', 'Score']
print(featureScores.nlargest(30, 'Score'))

           Specs        Score
0          Party  1909.992066
2      Education   591.316731
3            Age   539.706222
1  CriminalCases   128.268884
6    Liabilities   117.350010
9       Category    39.971436
4         Gender    27.262569
5    TotalAssets    16.627785
7          State     8.052972
8   Constituency     0.191420


In [126]:
clf.score(X, y)

0.8954959215037238

In [120]:
# get importance
importance = clf.coef_
# summarize feature importance
for i,v in enumerate(importance):
    print('Feature: %0d, Score: %.5f' % (i,v))
# plot feature importance
pyplot.bar([x for x in range(len(importance))], importance)
pyplot.show()

AttributeError: 'GridSearchCV' object has no attribute 'coef_'

### Random Forest Classifier

In [99]:
import numpy as np
import pandas as pd
import os
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.feature_selection import SelectKBest, chi2

In [100]:
dataset = pd.read_csv('Datasets/combined_all.csv')
dataset

Unnamed: 0.1,Unnamed: 0,Candidate,Party,CriminalCases,Education,Age,Gender,TotalAssets,Liabilities,Winner,State,Constituency,Year,Category,Votes,ValidVotes,VoteSharePercentage,PartyID
0,0,manoranjanbhakta,INC,0,Post Graduate,65,M,5926740.0,272061,1,andaman_and_nicobar_islands,andaman_and_nicobar_islands,2004,GEN,85794,153825,55.77,3482
1,1,akbiswas,BSP,0,Graduate,61,M,7876500.0,65000,0,andaman_and_nicobar_islands,andaman_and_nicobar_islands,2004,GEN,1122,153825,0.73,16651
2,2,asitbarandutta,IND,0,Graduate,50,M,3070000.0,0,0,andaman_and_nicobar_islands,andaman_and_nicobar_islands,2004,GEN,523,153825,0.34,10809
3,3,bishnupadaray,BJP,0,Graduate,54,M,1250619.0,128710,0,andaman_and_nicobar_islands,andaman_and_nicobar_islands,2004,GEN,55294,153825,35.95,1605
4,4,deepakbiswas,IND,0,Not Given,28,M,3000.0,0,0,andaman_and_nicobar_islands,andaman_and_nicobar_islands,2004,GEN,1186,153825,0.77,10809
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20243,5497,durgadashajra,IND,0,12th Pass,62,M,1325108.0,0,0,west_bengal,uluberia,2019,,6770,1311099,0.52,10809
20244,5498,joybanerjee,BJP,2,Graduate,56,M,246933.0,0,0,west_bengal,uluberia,2019,GEN,479586,1311099,36.58,1605
20245,5499,maksudakhatun,CPI(M),0,Doctorate,58,F,12578736.0,2876353,0,west_bengal,uluberia,2019,,81314,1311099,6.20,14635
20246,5500,minatisarkar,SUCI(C),0,Graduate,61,F,1091556.0,0,0,west_bengal,uluberia,2019,GEN,1697,1311099,0.13,8082


In [101]:
dataset.drop(["Unnamed: 0"],axis = 1,inplace = True)

In [102]:
dataset.head()

Unnamed: 0,Candidate,Party,CriminalCases,Education,Age,Gender,TotalAssets,Liabilities,Winner,State,Constituency,Year,Category,Votes,ValidVotes,VoteSharePercentage,PartyID
0,manoranjanbhakta,INC,0,Post Graduate,65,M,5926740.0,272061,1,andaman_and_nicobar_islands,andaman_and_nicobar_islands,2004,GEN,85794,153825,55.77,3482
1,akbiswas,BSP,0,Graduate,61,M,7876500.0,65000,0,andaman_and_nicobar_islands,andaman_and_nicobar_islands,2004,GEN,1122,153825,0.73,16651
2,asitbarandutta,IND,0,Graduate,50,M,3070000.0,0,0,andaman_and_nicobar_islands,andaman_and_nicobar_islands,2004,GEN,523,153825,0.34,10809
3,bishnupadaray,BJP,0,Graduate,54,M,1250619.0,128710,0,andaman_and_nicobar_islands,andaman_and_nicobar_islands,2004,GEN,55294,153825,35.95,1605
4,deepakbiswas,IND,0,Not Given,28,M,3000.0,0,0,andaman_and_nicobar_islands,andaman_and_nicobar_islands,2004,GEN,1186,153825,0.77,10809


In [103]:
dataset['TotalAssets'].fillna(dataset.TotalAssets.mean(), inplace=True) # Fill NaN values of Candidates with 0

In [104]:
dataset.dropna(inplace =  True)

In [105]:
dataset.reset_index(inplace=True, drop=True)

In [106]:
# encode education column
encoded_edu = []
# iterate through each row in the dataset
for row in dataset.itertuples():
    education = row.Education
    
    if education == "Illiterate":
        encoded_edu.append(0)
    elif education == "Literate":
        encoded_edu.append(1)
    elif education == "5th Pass":
        encoded_edu.append(2)
    elif education == "8th Pass":
        encoded_edu.append(3)
    elif education == "10th Pass":
        encoded_edu.append(4)
    elif education == "12th Pass":
        encoded_edu.append(7)
    elif education == "Graduate":
        encoded_edu.append(8)
    elif education == "Post Graduate":
        encoded_edu.append(9)
    elif education == "Graduate Professional":
        encoded_edu.append(10)
    elif education == "Doctorate":
        encoded_edu.append(11)
    else:
        encoded_edu.append(5)

dataset['Education'] = encoded_edu


In [107]:
# change party of the less frequent parties as Other
# 'BJP','INC','IND','BSP', 'CPI(M)', 'AITC', 'MNM': high frequent
# 'TDP', 'VSRCP', 'SP', 'DMK', 'BJD': medium frequent
dataset.loc[~dataset["Party"].isin(['BJP','INC','IND','BSP', 'CPI(M)', 'AITC', 'MNM', 'TDP', 'VSRCP', 'SP', 'DMK', 'BJD']), 
            "Party"] = "Other"
dataset['Party'].value_counts()

IND       6631
Other     5733
INC       1242
BJP       1240
BSP       1129
SP         467
AITC       179
CPI(M)      88
TDP         79
DMK         73
BJD         57
Name: Party, dtype: int64

In [108]:
dataset.Category = dataset.Category.astype("str")

In [109]:
# label encode categorical columns

lblEncoder_state = LabelEncoder()
lblEncoder_state.fit(dataset['State'])
dataset['State'] = lblEncoder_state.transform(dataset['State'])

lblEncoder_cons = LabelEncoder()
lblEncoder_cons.fit(dataset['Constituency'])
dataset['Constituency'] = lblEncoder_cons.transform(dataset['Constituency'])

# lblEncoder_name = LabelEncoder()
# lblEncoder_name.fit(dataset['Candidate'])
# dataset['Candidate'] = lblEncoder_name.transform(dataset['Candidate'])

lblEncoder_party = LabelEncoder()
lblEncoder_party.fit(dataset['Party'])
dataset['Party'] = lblEncoder_party.transform(dataset['Party'])

lblEncoder_category = LabelEncoder()
lblEncoder_category.fit(dataset['Category'])
dataset['Category'] = lblEncoder_category.transform(dataset['Category'])

lblEncoder_edu = LabelEncoder()
lblEncoder_edu.fit(dataset['Education'])
dataset['Education'] = lblEncoder_edu.transform(dataset['Education'])

lblEncoder_gen = LabelEncoder()
lblEncoder_gen.fit(dataset['Gender'])
dataset['Gender'] = lblEncoder_gen.transform(dataset['Gender'])

In [110]:
dataset.drop(["Candidate", "Year", "Votes", "ValidVotes", "PartyID", "VoteSharePercentage"], inplace=True, axis=1)


In [111]:
# separate train features and label
y = dataset["Winner"]
X = dataset.drop(labels=["Winner"], axis=1)
# split dataset into train and test data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1, stratify=y)
# train and test knn model
rfc =  RandomForestClassifier(n_estimators = 500, max_leaf_nodes = 16, n_jobs = -1)
rfc.fit(X_train, y_train)
rfc.predict(X_test)
print("Testing Accuracy is: ", rfc.score(X_test, y_test)*100, "%")

Testing Accuracy is:  90.86879432624113 %


In [112]:
# scaling values into 0-1 range
scaler = MinMaxScaler(feature_range=(0, 1))
features = ['State', 'Constituency', 'Party', 'CriminalCases', 'Age', 'Category', 'Education', 'TotalAssets', 
            'Liabilities']
dataset[features] = scaler.fit_transform(dataset[features])

In [113]:
# separate train features and label
y = dataset["Winner"]
X = dataset.drop(labels=["Winner"], axis=1)
# split dataset into train and test data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1, stratify=y)
# train and test knn model
rfc =  RandomForestClassifier(n_estimators = 500,max_leaf_nodes = 16,n_jobs = -1)
rfc.fit(X_train, y_train)
rfc.predict(X_test)
print("Testing Accuracy is: ", rfc.score(X_test, y_test)*100, "%")

Testing Accuracy is:  90.75059101654847 %


In [114]:
dataset.head(100)

Unnamed: 0,Party,CriminalCases,Education,Age,Gender,TotalAssets,Liabilities,Winner,State,Constituency,Category
0,0.6,0.000000,0.8,0.691489,1,7.686784e-05,0.000123,1,0.000000,0.045198,0.0
1,0.3,0.000000,0.7,0.648936,1,1.021556e-04,0.000029,0,0.000000,0.045198,0.0
2,0.7,0.000000,0.7,0.531915,1,3.981686e-05,0.000000,0,0.000000,0.045198,0.0
3,0.2,0.000000,0.7,0.574468,1,1.622009e-05,0.000058,0,0.000000,0.045198,0.0
4,0.7,0.000000,0.5,0.297872,1,3.888307e-08,0.000000,0,0.000000,0.045198,0.0
...,...,...,...,...,...,...,...,...,...,...,...
95,0.3,0.000000,0.7,0.734043,1,2.353993e-05,0.000180,0,0.026316,0.497175,0.0
96,0.7,0.000000,0.6,0.531915,1,9.337904e-07,0.000000,0,0.026316,0.497175,0.0
97,0.6,0.004167,0.8,0.521277,0,1.775706e-03,0.000135,1,0.026316,0.516949,0.0
98,0.7,0.000000,0.5,0.468085,0,2.593908e-06,0.000000,0,0.026316,0.516949,0.0
