In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/indian-politics-2004-2019/IndiaVotes_PC__All_States_2009.csv
/kaggle/input/indian-politics-2004-2019/IndiaVotes_PC__All_States_2014.csv
/kaggle/input/indian-politics-2004-2019/2024.csv
/kaggle/input/indian-politics-2004-2019/2019.csv
/kaggle/input/indian-politics-2004-2019/cleaned.csv
/kaggle/input/indian-politics-2004-2019/2004.csv


In [2]:
import catboost
from catboost import CatBoostClassifier
from sklearn.metrics import accuracy_score, classification_report
import pandas as pd
from sklearn.model_selection import train_test_split
import seaborn as sns
import matplotlib.pyplot as plt

In [3]:
df = pd.read_csv('/kaggle/input/indian-politics-2004-2019/cleaned.csv')

In [4]:
df.rename(columns={'Unnamed: 0': 'PC'}, inplace=True)

# 2019

In [5]:
original_classes = sorted(df['Party_2019'].unique())

label_mapping = {original_classes[i]: i for i in range(len(original_classes))}

df['Party_2019_mapped'] = df['Party_2019'].map(label_mapping)

print("Original classes:", original_classes)
print("Label mapping:", label_mapping)
print("New class labels:", df['Party_2019_mapped'].unique())

Original classes: [0, 1, 2, 3, 4, 6, 7, 8, 12, 15, 17, 18, 19, 21, 22, 23, 26, 32, 34, 39, 41, 42, 46, 47, 49, 51, 52, 53, 54, 55, 56, 57]
Label mapping: {0: 0, 1: 1, 2: 2, 3: 3, 4: 4, 6: 5, 7: 6, 8: 7, 12: 8, 15: 9, 17: 10, 18: 11, 19: 12, 21: 13, 22: 14, 23: 15, 26: 16, 32: 17, 34: 18, 39: 19, 41: 20, 42: 21, 46: 22, 47: 23, 49: 24, 51: 25, 52: 26, 53: 27, 54: 28, 55: 29, 56: 30, 57: 31}
New class labels: [ 2  6  4  3 12 17  8  1  7  9  5 18 25 20 26 15 11 27  0 19 28 16 29 10
 30 22 23 13 21 31 24 14]


In [6]:
features = [
    'State_2004', 'Electors_2004', 'Votes_2004', 'Turnout_2004', 'Margin_2004', 'Margin %_2004',
    'State_2009', 'Electors_2009', 'Votes_2009', 'Turnout_2009', 'Margin_2009', 'Margin %_2009',
    'State_2014', 'Electors_2014', 'Votes_2014', 'Turnout_2014', 'Margin_2014', 'Margin %_2014'
]
target = 'Party_2019_mapped'

In [7]:
X = df[features]
y = df[target]

In [8]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [9]:
catboost_model = CatBoostClassifier(iterations=1000, depth=6, learning_rate=0.05, loss_function='MultiClass', cat_features=[])

In [10]:
catboost_model.fit(X_train, y_train)
y_pred = catboost_model.predict(X_test)

0:	learn: 3.3493875	total: 140ms	remaining: 2m 20s
1:	learn: 3.1834697	total: 211ms	remaining: 1m 45s
2:	learn: 3.0796068	total: 262ms	remaining: 1m 27s
3:	learn: 2.9812097	total: 309ms	remaining: 1m 16s
4:	learn: 2.8863911	total: 350ms	remaining: 1m 9s
5:	learn: 2.8154024	total: 397ms	remaining: 1m 5s
6:	learn: 2.7052855	total: 453ms	remaining: 1m 4s
7:	learn: 2.6284590	total: 498ms	remaining: 1m 1s
8:	learn: 2.5452895	total: 539ms	remaining: 59.3s
9:	learn: 2.4941530	total: 580ms	remaining: 57.5s
10:	learn: 2.4221276	total: 623ms	remaining: 56s
11:	learn: 2.3543243	total: 663ms	remaining: 54.6s
12:	learn: 2.3035596	total: 704ms	remaining: 53.5s
13:	learn: 2.2410707	total: 745ms	remaining: 52.5s
14:	learn: 2.1991090	total: 786ms	remaining: 51.6s
15:	learn: 2.1624903	total: 826ms	remaining: 50.8s
16:	learn: 2.1220381	total: 866ms	remaining: 50.1s
17:	learn: 2.0830280	total: 906ms	remaining: 49.5s
18:	learn: 2.0388087	total: 947ms	remaining: 48.9s
19:	learn: 2.0051351	total: 987ms	remai

In [11]:
print("Model Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))

Model Accuracy: 0.6753246753246753

Classification Report:
               precision    recall  f1-score   support

           0       0.00      0.00      0.00         1
           2       0.74      0.94      0.83        52
           3       0.00      0.00      0.00         2
           4       0.00      0.00      0.00         4
           5       0.00      0.00      0.00         2
           6       0.00      0.00      0.00         1
           7       0.00      0.00      0.00         1
           8       0.50      0.50      0.50         2
           9       0.00      0.00      0.00         1
          12       0.00      0.00      0.00         3
          13       0.00      0.00      0.00         1
          15       0.00      0.00      0.00         1
          17       1.00      0.50      0.67         4
          23       0.00      0.00      0.00         1
          25       0.00      0.00      0.00         1

    accuracy                           0.68        77
   macro avg       0

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [12]:
y_pred_flat = y_pred.flatten()

party_seat_counts = pd.Series(y_pred_flat).value_counts().sort_index()

print("\nTotal predicted number of seats by each party:")
for party, seats in party_seat_counts.items():
    print(f"Party {party}: {seats} seats")



Total predicted number of seats by each party:
Party 2: 66 seats
Party 4: 7 seats
Party 8: 2 seats
Party 17: 2 seats


# 2024

In [13]:
original_classes_2024 = sorted(df['Party_2024'].unique())

label_mapping_2024 = {original_classes_2024[i]: i for i in range(len(original_classes_2024))}

df['Party_2024_mapped'] = df['Party_2024'].map(label_mapping_2024)

print("Original classes for 2024:", original_classes_2024)
print("Label mapping for 2024:", label_mapping_2024)
print("New class labels for 2024:", df['Party_2024_mapped'].unique())

Original classes for 2024: [1, 2, 4, 6, 7, 8, 11, 13, 17, 18, 21, 22, 25, 29, 32, 34, 42, 47, 51, 53, 54, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69]
Label mapping for 2024: {1: 0, 2: 1, 4: 2, 6: 3, 7: 4, 8: 5, 11: 6, 13: 7, 17: 8, 18: 9, 21: 10, 22: 11, 25: 12, 29: 13, 32: 14, 34: 15, 42: 16, 47: 17, 51: 18, 53: 19, 54: 20, 56: 21, 57: 22, 58: 23, 59: 24, 60: 25, 61: 26, 62: 27, 63: 28, 64: 29, 65: 30, 66: 31, 67: 32, 68: 33, 69: 34}
New class labels for 2024: [ 1  2  0 23 14 24  7  5 25 26 27  3  6 15 18 10  4 28 29 30 31 16 19 32
  9 12 20 33  8 21 17 34 22 11 13]


In [14]:
features_2024 = [
    'State_2004', 'Electors_2004', 'Votes_2004', 'Turnout_2004', 'Margin_2004', 'Margin %_2004',
    'State_2009', 'Electors_2009', 'Votes_2009', 'Turnout_2009', 'Margin_2009', 'Margin %_2009',
    'State_2014', 'Electors_2014', 'Votes_2014', 'Turnout_2014', 'Margin_2014', 'Margin %_2014',
    'State_2019', 'Electors_2019', 'Votes_2019', 'Turnout_2019', 'Margin_2019', 'Margin %_2019'
]

target_2024 = 'Party_2024_mapped'

X = df[features_2024]
y = df[target_2024]

In [15]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [16]:
catboost_model = CatBoostClassifier(iterations=1000, depth=6, learning_rate=0.05, loss_function='MultiClass', cat_features=[])

In [17]:
catboost_model.fit(X_train, y_train)
y_pred = catboost_model.predict(X_test)

0:	learn: 3.4372393	total: 58ms	remaining: 58s
1:	learn: 3.3232919	total: 114ms	remaining: 57s
2:	learn: 3.2079460	total: 172ms	remaining: 57s
3:	learn: 3.1158735	total: 229ms	remaining: 57.1s
4:	learn: 3.0336401	total: 287ms	remaining: 57.1s
5:	learn: 2.9603087	total: 344ms	remaining: 57s
6:	learn: 2.8980297	total: 401ms	remaining: 56.8s
7:	learn: 2.8278043	total: 458ms	remaining: 56.8s
8:	learn: 2.7599116	total: 517ms	remaining: 56.9s
9:	learn: 2.7025084	total: 574ms	remaining: 56.8s
10:	learn: 2.6511964	total: 630ms	remaining: 56.7s
11:	learn: 2.6006636	total: 687ms	remaining: 56.5s
12:	learn: 2.5534016	total: 745ms	remaining: 56.6s
13:	learn: 2.5062591	total: 803ms	remaining: 56.5s
14:	learn: 2.4416462	total: 860ms	remaining: 56.5s
15:	learn: 2.3948889	total: 917ms	remaining: 56.4s
16:	learn: 2.3595027	total: 976ms	remaining: 56.5s
17:	learn: 2.3231842	total: 1.03s	remaining: 56.4s
18:	learn: 2.2694818	total: 1.09s	remaining: 56.3s
19:	learn: 2.2337191	total: 1.15s	remaining: 56.2s

In [18]:
print("Model Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))

Model Accuracy: 0.5714285714285714

Classification Report:
               precision    recall  f1-score   support

           0       0.27      1.00      0.43         3
           1       0.70      0.74      0.72        42
           2       0.25      0.27      0.26        15
           4       0.00      0.00      0.00         1
           5       1.00      1.00      1.00         1
           6       0.00      0.00      0.00         1
          10       0.00      0.00      0.00         1
          14       1.00      1.00      1.00         5
          17       0.00      0.00      0.00         1
          18       0.00      0.00      0.00         1
          23       0.00      0.00      0.00         2
          26       0.00      0.00      0.00         1
          31       0.00      0.00      0.00         2
          32       0.00      0.00      0.00         1

    accuracy                           0.57        77
   macro avg       0.23      0.29      0.24        77
weighted avg       0

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [19]:
y_pred_flat = y_pred.flatten()

party_seat_counts = pd.Series(y_pred_flat).value_counts().sort_index()

print("\nTotal predicted number of seats by each party:")
for party, seats in party_seat_counts.items():
    print(f"Party {party}: {seats} seats")


Total predicted number of seats by each party:
Party 0: 11 seats
Party 1: 44 seats
Party 2: 16 seats
Party 5: 1 seats
Party 14: 5 seats
