In [154]:
import numpy as np 
import pandas as pd
# Read the two input tables from the working directory: 'train.csv' is the
# frame the model is fitted on, 'test.csv' is the frame predictions are made for.
train, test = (pd.read_csv(csv_name) for csv_name in ('train.csv', 'test.csv'))

In [155]:
def _add_text_flags(frame):
    """Return a copy of `frame` with 0/1 flag columns replacing the raw text columns.

    Adds, in this order:
      * 'SC' / 'ST'  -- 1 when 'Constituency ∇' contains that substring.
      * 'Doctor'     -- 1 when 'Candidate' contains the literal text 'Dr.'.
      * 'Advocate'   -- 1 when 'Candidate' contains the literal text 'Adv.'.
    and then drops 'Constituency ∇' and 'Candidate'.

    `regex=False` is essential: with the default regex matching, the '.' in
    'Dr.' / 'Adv.' matches ANY character, so names such as 'Advani' or
    'Indra' would be flagged. `na=False` makes a missing text value count as
    "no match" instead of producing NaN, which `astype(int)` cannot convert.
    """
    constituency = frame['Constituency ∇']
    candidate = frame['Candidate']
    flagged = frame.assign(
        SC=constituency.str.contains('SC', regex=False, na=False).astype(int),
        ST=constituency.str.contains('ST', regex=False, na=False).astype(int),
        Doctor=candidate.str.contains('Dr.', regex=False, na=False).astype(int),
        Advocate=candidate.str.contains('Adv.', regex=False, na=False).astype(int),
    )
    return flagged.drop(columns=['Constituency ∇', 'Candidate'])


# Apply the identical transformation to both frames so their columns stay in step.
train = _add_text_flags(train)
test = _add_text_flags(test)

In [156]:
import re

def convert(val):
    """Convert a textual amount such as '2 Crore+' into a plain number.

    '+' signs and spaces are stripped, then the first number in the string is
    multiplied by the factor of the first recognised unit:
    'Crore' -> 10,000,000, 'Lac' -> 100,000, 'Thou' -> 1,000.

    Parameters
    ----------
    val : str
        Raw cell value, e.g. '25 Lac+', '1.5 Crore+' or '0'.

    Returns
    -------
    float or int
        The amount as a float when a known unit is present; otherwise 0.
        0 is also returned for a non-string value (e.g. a missing cell read
        as NaN) and for a unit that is not accompanied by any digits.
    """
    # Missing cells arrive as float NaN, which has no .replace(); treat them
    # the same way as any other unparseable value.
    if not isinstance(val, str):
        return 0

    val = val.replace('+', '').replace(' ', '')

    conversion_factors = {'Crore': 10000000, 'Lac': 100000, 'Thou': 1000}

    for unit, factor in conversion_factors.items():
        if unit in val:
            # Accept an optional fractional part so '1.5Crore' is read as 1.5
            # rather than being truncated to 1.
            number = re.search(r'\d+(?:\.\d+)?', val)
            if number is None:
                return 0
            return float(number.group()) * factor
    # No recognised unit (including a bare '0'): fall through to zero.
    return 0

# Replace the textual amount columns with their numeric value (via `convert`)
# in both the training and the test frame.
for frame in (train, test):
    for money_col in ('Total Assets', 'Liabilities'):
        frame[money_col] = frame[money_col].apply(convert)

In [157]:
# One-hot encode 'Party' and drop the original column.
party_train = pd.get_dummies(train['Party'], prefix='Party')

# Encode the test frame onto the *training* dummy columns. Encoding the two
# frames independently yields different column sets whenever a party occurs
# in only one of them, and the fitted model then cannot be applied to `test`.
# Parties unseen in training are dropped; parties absent from test become
# all-zero columns.
party_test = (
    pd.get_dummies(test['Party'], prefix='Party')
    .reindex(columns=party_train.columns, fill_value=0)
)

train = pd.concat([train, party_train], axis=1).drop(columns='Party')
test = pd.concat([test, party_test], axis=1).drop(columns='Party')

In [158]:
# One-hot encode 'state' and drop the original column.
state_train = pd.get_dummies(train['state'], prefix='state')

# Encode the test frame onto the *training* dummy columns. Encoding the two
# frames independently yields different column sets whenever a state occurs
# in only one of them, and the fitted model then cannot be applied to `test`.
# States unseen in training are dropped; states absent from test become
# all-zero columns.
state_test = (
    pd.get_dummies(test['state'], prefix='state')
    .reindex(columns=state_train.columns, fill_value=0)
)

train = pd.concat([train, state_train], axis=1).drop(columns='state')
test = pd.concat([test, state_test], axis=1).drop(columns='state')

In [159]:
# Separate the target column ('Education') from the remaining columns, which
# are all used as model inputs.
Y_train = train['Education']
X_train = train.drop(columns='Education')

In [160]:
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint
from sklearn.ensemble import RandomForestClassifier

# Search space for the random forest: the number of trees is drawn from
# randint(10, 1000) and the forest seed from a fixed list.
# NOTE(review): treating `random_state` as a tunable parameter selects a lucky
# seed rather than a better model; it is kept because the cell that builds the
# final model reads bestps['random_state'].
params = {
    'n_estimators': randint(10, 1000),
    'random_state': [42, 123, 456, 789, 1000]
}

# Seed the search itself so the 10 sampled candidates -- and therefore
# `bestps` -- are the same on every Restart & Run All. Without it each run
# samples a different set of candidates.
rand = RandomizedSearchCV(
    RandomForestClassifier(),
    param_distributions=params,
    n_iter=10,
    cv=5,
    random_state=42,
)
rand.fit(X_train, Y_train)
bestps = rand.best_params_

In [161]:
# Build a forest from the two parameters chosen by the search, fit it on the
# whole training set, and predict a label for every row of `test`.
best_forest_kwargs = {
    'n_estimators': bestps['n_estimators'],
    'random_state': bestps['random_state'],
}
rf = RandomForestClassifier(**best_forest_kwargs)
rf.fit(X_train, Y_train)

pred = rf.predict(test)

In [162]:
# Pair each prediction with a zero-based sequential ID (0 .. len(pred) - 1)
# and write the result to 'submission.csv' without the DataFrame index.
submission = pd.DataFrame({'ID': range(len(pred)), 'Education': pred})
submission.to_csv('submission.csv', index=False)