# **CS253 Python Assignment**


### Name: Kantule Ritesh Ramdas
### Roll No: 210488

*   **Importing Libraries**

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
import matplotlib.pyplot as plt
import seaborn as sns

*   **Data pre-processing**



In [None]:
# Read the competition's training and test CSVs from the Kaggle input directory
train_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        "/kaggle/input/who-is-the-real-winner/train.csv",
        "/kaggle/input/who-is-the-real-winner/test.csv",
    )
)

In [None]:
# 1. Convert 'Total Assets' and 'Liabilities' from strings such as "2 Crore+" into numbers in lakh.
#    Every unit is scaled explicitly; previously any non-Crore value was read as lakh, which
#    overstated thousand/hundred amounts by orders of magnitude.
#    NOTE(review): assumes the unit suffixes in the CSV are Crore / Lac / Thou / Hund — confirm against the raw data.
UNIT_TO_LAKH = {'Crore': 100, 'Lac': 1, 'Thou': 0.01, 'Hund': 0.001}
for money_col in ['Total Assets', 'Liabilities']:
    # Split each value into its numeric part and an optional unit suffix.
    # astype(str) lets missing values pass through as NaN instead of crashing on .split().
    parsed = train_df[money_col].astype(str).str.extract(r'(\d+(?:\.\d+)?)\s*(Crore|Lac|Thou|Hund)?')
    # Values with no recognised unit (e.g. "0") keep a multiplier of 1, as before
    unit_multiplier = parsed[1].map(UNIT_TO_LAKH).fillna(1).astype(float)
    train_df[money_col] = parsed[0].astype(float) * unit_multiplier

# 2. Handle missing values if any
train_df = train_df.fillna(0)

In [None]:
# Min-max normalize the three numeric feature columns; `scaler` stays fitted on the
# training data so the same transformation can be applied to the test set later
numeric_cols = ['Total Assets', 'Liabilities', 'Criminal Case']
scaler = MinMaxScaler()
scaler.fit(train_df[numeric_cols])
train_df[numeric_cols] = scaler.transform(train_df[numeric_cols])

In [None]:
# Preview the first rows after unit conversion and min-max scaling
train_df.head()

In [None]:
# Apply one-hot encoding to the 'Party' and 'state' columns
# ('Education' is the prediction target and is left as-is)
train_df = pd.get_dummies(train_df, columns=['Party', 'state'])

In [None]:
# Preview the first rows after one-hot encoding 'Party' and 'state'
train_df.head()

In [None]:
# Check for non-numeric columns: list every column whose dtype is not a numeric
# ("number") dtype, to see what still needs dropping or encoding before modelling
non_numeric_columns = train_df.select_dtypes(exclude=['number']).columns
print(non_numeric_columns)

In [None]:
# Split the dataset into features (X) and target variable (y).
# 'Candidate' and 'Constituency ∇' are not used as features, and 'Education' is the
# label being predicted, so all three are removed from X.
X = train_df.drop(columns=['Candidate', 'Education', 'Constituency ∇'])
# Target variable as a 1-D Series: a single-column DataFrame makes scikit-learn emit a
# DataConversionWarning ("A column-vector y was passed when a 1d array was expected") on fit.
y = train_df['Education']

### **Model selection (RandomForest)**

In [None]:
# Initialize and train Random Forest Classifier with best parameters.
# X_train / y_train were never created anywhere in the notebook, so this cell raised a
# NameError on a fresh kernel; build the train/validation split here from X and y.
# np.ravel accepts y whether it is a Series or a single-column DataFrame.
# NOTE(review): test_size=0.2 and the seed are assumptions — adjust to the split originally intended.
X_train, X_val, y_train, y_val = train_test_split(X, np.ravel(y), test_size=0.2, random_state=42)

# random_state makes the forest (and therefore the submission) reproducible across runs
model = RandomForestClassifier(n_estimators=300, max_depth=20, min_samples_split=5, min_samples_leaf=1, random_state=42)
model.fit(X_train, y_train)

# Report performance on the held-out validation rows
print("Validation F1 (weighted):", f1_score(y_val, model.predict(X_val), average='weighted'))

In [None]:
# Preprocess test data with the same steps that were applied to the training data.
# Money strings such as "2 Crore+" are converted to lakh with every unit scaled explicitly;
# previously any non-Crore value was read as lakh, overstating thousand/hundred amounts.
# NOTE(review): assumes the unit suffixes in the CSV are Crore / Lac / Thou / Hund — confirm against the raw data.
UNIT_TO_LAKH = {'Crore': 100, 'Lac': 1, 'Thou': 0.01, 'Hund': 0.001}
for money_col in ['Total Assets', 'Liabilities']:
    # astype(str) lets missing values pass through as NaN instead of crashing on .split()
    parsed = test_df[money_col].astype(str).str.extract(r'(\d+(?:\.\d+)?)\s*(Crore|Lac|Thou|Hund)?')
    # Values with no recognised unit (e.g. "0") keep a multiplier of 1, as before
    unit_multiplier = parsed[1].map(UNIT_TO_LAKH).fillna(1).astype(float)
    test_df[money_col] = parsed[0].astype(float) * unit_multiplier
test_df = test_df.fillna(0)
# Reuse the scaler fitted on the training data (transform only, no re-fit)
test_df[['Total Assets', 'Liabilities', 'Criminal Case']] = scaler.transform(test_df[['Total Assets', 'Liabilities', 'Criminal Case']])
test_df = pd.get_dummies(test_df, columns=['Party', 'state'])

In [None]:
# Split test data into features (X_test)
X_test = test_df.drop(columns=['Candidate', 'Constituency ∇'])  # Exclude columns not used in training

# The test set was one-hot encoded separately, so its dummy columns can differ from the
# training features in both membership and order (e.g. a party or state that appears in
# only one of the two files). Align to the training columns: dummies missing from the test
# set are added as 0 and columns the model never saw are dropped.
X_test = X_test.reindex(columns=X.columns, fill_value=0)

# Predict Education using the trained model
y_pred_test = model.predict(X_test)

* **Output as submission.csv**

In [None]:
# Pair every test ID with its predicted Education label
output_df = test_df[['ID']].assign(Education=y_pred_test)

# Write the submission file (index column omitted)
output_df.to_csv('/kaggle/working/submission.csv', index=False)

## **Data Visualization**

In [None]:
# Reload the raw training data: the modelling cells above min-max scaled and one-hot
# encoded train_df, so the plots below start again from the original columns.
train_df = pd.read_csv("/kaggle/input/who-is-the-real-winner/train.csv")
# The test set is not used by any of the visualizations below, so it is not reloaded.

In [None]:
# 1. Convert 'Total Assets' and 'Liabilities' from strings such as "2 Crore+" into numbers in lakh.
#    Every unit is scaled explicitly; previously any non-Crore value was read as lakh, which
#    overstated thousand/hundred amounts and distorted the wealth plots below.
#    NOTE(review): assumes the unit suffixes in the CSV are Crore / Lac / Thou / Hund — confirm against the raw data.
UNIT_TO_LAKH = {'Crore': 100, 'Lac': 1, 'Thou': 0.01, 'Hund': 0.001}
for money_col in ['Total Assets', 'Liabilities']:
    # astype(str) lets missing values pass through as NaN instead of crashing on .split()
    parsed = train_df[money_col].astype(str).str.extract(r'(\d+(?:\.\d+)?)\s*(Crore|Lac|Thou|Hund)?')
    # Values with no recognised unit (e.g. "0") keep a multiplier of 1, as before
    unit_multiplier = parsed[1].map(UNIT_TO_LAKH).fillna(1).astype(float)
    train_df[money_col] = parsed[0].astype(float) * unit_multiplier

# 2. Handle missing values if any
train_df = train_df.fillna(0)



*   **Data preprocessing for criminal records**



In [None]:
# Total number of criminal cases per party, ordered from fewest to most
criminal_records = train_df.groupby('Party')['Criminal Case'].sum()
sorted_criminal_records = criminal_records.sort_values()

# Parties holding under 1% of all cases are collapsed into a single "Others" entry
minor_party_mask = sorted_criminal_records < sorted_criminal_records.sum() * 0.01
other_criminal_records = sorted_criminal_records[minor_party_mask]
sorted_criminal_records = sorted_criminal_records[~minor_party_mask].copy()
sorted_criminal_records['Others'] = other_criminal_records.sum()



*   **Data preprocessing for wealth**



In [None]:
# Mean declared total assets per party, ordered from lowest to highest
party_wealth = train_df.groupby('Party')['Total Assets'].mean()
sorted_wealth = party_wealth.sort_values()

# Parties whose mean is under 1% of the sum of all party means are collapsed into "Others"
minor_wealth_mask = sorted_wealth < sorted_wealth.sum() * 0.01
other_wealth = sorted_wealth[minor_wealth_mask]
sorted_wealth = sorted_wealth[~minor_wealth_mask].copy()
sorted_wealth['Others'] = other_wealth.sum()

In [None]:
# Define the custom order for education levels.
# Passed as `order=` to the Education strip plots below so the categories are drawn
# in this sequence instead of the order in which they occur in the data.
education_order = ['Others', 'Literate', '5th Pass', '8th Pass', '10th Pass', '12th Pass', 'Graduate', 'Graduate Professional', 'Post Graduate', 'Doctorate']

In [None]:
# Preview the reloaded frame (numeric asset columns, original 'Party' / 'state' / 'Education' labels)
train_df.head()

### **Percentage distribution of parties with candidates having the most criminal records**

In [None]:
# Pie chart: each party's share of the total criminal cases (small parties grouped as "Others")
fig, ax = plt.subplots(figsize=(10, 6))
ax.pie(
    sorted_criminal_records.values,
    labels=sorted_criminal_records.index,
    autopct='%1.1f%%',
    startangle=140,
)
ax.set_title('Percentage distribution of parties with candidates having the most criminal records')
ax.axis('equal')  # equal aspect ratio keeps the pie circular
plt.show()

### **Percentage distribution of parties with the most wealthy candidates**

In [None]:
# Pie chart: each party's share of the summed per-party mean assets (small parties grouped as "Others")
fig, ax = plt.subplots(figsize=(10, 6))
ax.pie(
    sorted_wealth.values,
    labels=sorted_wealth.index,
    autopct='%1.1f%%',
    startangle=140,
)
ax.set_title('Percentage distribution of parties with the most wealthy candidates')
ax.axis('equal')  # equal aspect ratio keeps the pie circular
plt.show()

### **Criminal vs Total Assets**


*   **Correlation between Criminal Cases and Total Assets.**
*   **Average of total assets to get more information from data.**



In [None]:
# Calculate the correlation between 'Criminal Case' and log1p('Total Assets').
# log1p compresses the wide range of asset values and is defined for candidates with 0 assets.
log_total_assets = np.log1p(train_df['Total Assets'])
correlation = train_df['Criminal Case'].corr(log_total_assets)

# Print the correlation coefficient
print("Correlation between Criminal Cases and Total Assets:", correlation)

# Calculate the average total assets
average_total_assets = train_df['Total Assets'].mean()

# Create a scatter plot with party as the legend
fig, ax = plt.subplots(figsize=(10, 6))
sns.scatterplot(x='Criminal Case', y='Total Assets', hue='Party', data=train_df, palette='bright', alpha=0.7, ax=ax)

# Plot the average total assets
ax.axhline(average_total_assets, color='red', linestyle='--', label=f'Average Total Assets: {average_total_assets:.2f}')

# Add a regression line. The fit is done in log1p space (the same space as the correlation
# above) and mapped back with expm1 before plotting, so the line is expressed in the same
# units as the scatter points. Previously the log1p-valued line was drawn directly on an
# axis showing raw asset values, which placed it in the wrong position.
slope, intercept = np.polyfit(train_df['Criminal Case'], log_total_assets, 1)
case_grid = np.linspace(train_df['Criminal Case'].min(), train_df['Criminal Case'].max(), 100)
ax.plot(case_grid, np.expm1(slope * case_grid + intercept), color='black', linewidth=2,
        label='Regression line (fit on log1p of Total Assets)')

# Apply logarithmic scale to y-axis (Total Assets); points with 0 assets cannot be shown on it
ax.set_yscale('log')

# Titles and labels are set after all plotting calls so nothing overwrites them
ax.set_title('Criminal Cases vs. Total Assets')
ax.set_xlabel('Criminal Cases')
ax.set_ylabel('Total Assets (Log Scale)')

ax.legend(bbox_to_anchor=(1.05, 1), loc='upper left')  # Move the legend outside the plot
plt.show()

### Criminal vs. State (for each party)

In [None]:
# Strip plot of criminal-case counts per state, one marker per candidate, coloured by party
plt.figure(figsize=(18, 8))  # Adjust the figure size as needed
# dodge=True separates the parties side by side within each state; jitter spreads overlapping markers
sns.stripplot(x='state', y='Criminal Case', hue='Party', data=train_df, jitter=True, dodge=True, size=5)
plt.title('Criminal Cases vs. State for Each Party')
plt.xlabel('State')
plt.ylabel('Criminal Cases')
plt.xticks(rotation=45, ha='right')  # Rotate x-axis labels and align them to the right
plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left')  # Move the legend outside the plot
plt.tight_layout()
plt.show()


### **Distribution of Criminal Cases vs. State for Each Party**

In [None]:
plt.figure(figsize=(18, 8))  # Adjust the figure size as needed

# Compute the party list once; it drives both the palette size and the loop below
parties = train_df['Party'].unique()

# Define a custom color palette with distinct colors for each party
custom_palette = sns.color_palette("tab20", n_colors=len(parties))

# Create a histogram with different colors for each party.
# Each call counts that party's rows per state and overlays them on the same axes.
for i, party in enumerate(parties):
    sns.histplot(data=train_df[train_df['Party'] == party], y='state', color=custom_palette[i], element='step', fill=True, binwidth=0.5, label=party, edgecolor='black')

# NOTE(review): the histogram counts candidates per state; 'Criminal Case' is not used by
# the plot, so the title may need rewording — confirm the intended quantity.
plt.title('Distribution of Criminal Cases vs. State for Each Party')
# The log scale is applied to the x-axis (frequency), so that is the axis labelled as such;
# the y-axis holds the categorical state names.
plt.xlabel('Frequency (log scale)')
plt.ylabel('State')
plt.xscale('log')  # Set x-axis to log scale
plt.xticks(rotation=45, ha='right')  # Rotate x-axis labels and align them to the right
plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left')  # Move the legend outside the plot
plt.tight_layout()
plt.show()


### **Criminal Cases vs. State (for Each Party)**

In [None]:
# Dot plot of party (x) against state (y), with marker colour driven by the number of criminal cases
plt.figure(figsize=(18, 8))  # Adjust the figure size as needed

# Define a custom colormap with cool to warm tones
custom_palette = sns.diverging_palette(240, 10, as_cmap=True)

# Create a dot plot
# NOTE(review): hue is the numeric 'Criminal Case' column combined with a continuous colormap
# and dodge=True — confirm this renders as intended on the installed seaborn version.
sns.stripplot(x='Party', y='state', hue='Criminal Case', data=train_df, palette=custom_palette, jitter=True, dodge=True, alpha=0.7, size=10)
plt.title('Criminal Cases vs. State for Each Party')
plt.xlabel('Party')
plt.ylabel('State')
plt.xticks(rotation=45, ha='right')  # Rotate x-axis labels and align them to the right
plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left')  # Move the legend outside the plot
plt.tight_layout()
plt.show()


In [None]:
# Strip plot of party against education level, with education drawn in the custom order
fig, ax = plt.subplots(figsize=(12, 8))
sns.stripplot(data=train_df, x='Education', y='Party', order=education_order, jitter=True, ax=ax)
ax.set_title('Education vs Party')
ax.set_xlabel('Education')
ax.set_ylabel('Party')
plt.xticks(rotation=45, ha='right')  # slanted labels so long category names stay readable
plt.tight_layout()
plt.show()

In [None]:
# Strip plot of state against education level, with education drawn in the custom order
fig, ax = plt.subplots(figsize=(12, 8))
sns.stripplot(data=train_df, x='Education', y='state', order=education_order, jitter=True, ax=ax)
ax.set_title('Education vs State')
ax.set_xlabel('Education')
ax.set_ylabel('State')
plt.xticks(rotation=45, ha='right')  # slanted labels so long category names stay readable
plt.tight_layout()
plt.show()

In [None]:
# Strip plot of criminal-case counts per education level, with education drawn in the custom order
fig, ax = plt.subplots(figsize=(12, 6))
sns.stripplot(data=train_df, x='Education', y='Criminal Case', order=education_order, jitter=True, ax=ax)
ax.set_title('Education vs Criminal Case')
ax.set_xlabel('Education')
ax.set_ylabel('Criminal Case')
plt.xticks(rotation=45, ha='right')  # slanted labels so long category names stay readable
plt.tight_layout()
plt.show()

In [None]:
# Create a pivot table to count the number of candidates in each category of 'Education'
education_counts = train_df.pivot_table(index='Education', columns='Party', aggfunc='size', fill_value=0)

# Define the custom order for education levels
education_order = ['Others', 'Literate', '5th Pass', '8th Pass', '10th Pass', '12th Pass', 'Graduate', 'Graduate Professional', 'Post Graduate', 'Doctorate']

# Apply the custom order to the heatmap rows; previously the order was defined but never
# used, so the rows appeared alphabetically. Only labels that exist in the table are used
# (so no empty rows are introduced and counts stay integers for fmt='d'), and any level
# missing from education_order is appended at the end rather than dropped.
row_order = [level for level in education_order if level in education_counts.index]
row_order += [level for level in education_counts.index if level not in education_order]
education_counts = education_counts.reindex(row_order)

# Plot the heatmap
plt.figure(figsize=(12, 10))  # Adjust the figure size as needed
sns.heatmap(education_counts, cmap='viridis', annot=True, fmt='d', linewidths=0.5, linecolor='gray', cbar_kws={'label': 'Number of Candidates'})
plt.title('Number of Candidates by Party and Education')
plt.xlabel('Party')
plt.ylabel('Education')
plt.yticks(rotation=45, ha='right')
plt.tight_layout()
plt.show()

In [None]:
# Count candidates for every (education level, state) pair; absent combinations become 0
state_education_counts = train_df.pivot_table(
    index='Education',
    columns='state',
    aggfunc='size',
    fill_value=0,
)

# Annotated heatmap of those counts
fig, ax = plt.subplots(figsize=(12, 10))
sns.heatmap(
    state_education_counts,
    cmap='viridis',
    annot=True,
    fmt='d',
    linewidths=0.5,
    linecolor='gray',
    cbar_kws={'label': 'Number of Candidates'},
    ax=ax,
)
ax.set_title('Number of Candidates by State and Education')
ax.set_xlabel('State')
ax.set_ylabel('Education')
plt.yticks(rotation=45, ha='right')
plt.tight_layout()
plt.show()

In [None]:
# Create a pivot table to count the number of candidates in each category of 'Education' for each criminal case count
# (rows: education level, columns: distinct 'Criminal Case' values, missing combinations filled with 0)
criminal_case_education_counts = train_df.pivot_table(index='Education', columns='Criminal Case', aggfunc='size', fill_value=0)

# Plot the heatmap for Criminal Case vs Education
plt.figure(figsize=(12, 10))  # Adjust the figure size as needed
# annot=True with fmt='d' prints each cell's integer count on the heatmap
sns.heatmap(criminal_case_education_counts, cmap='viridis', annot=True, fmt='d', linewidths=0.5, linecolor='gray', cbar_kws={'label': 'Number of Candidates'})
plt.title('Number of Candidates by Criminal Case and Education')
plt.xlabel('Criminal Case')
plt.ylabel('Education')
plt.yticks(rotation=45, ha='right')
plt.tight_layout()
plt.show()