In [None]:
import pandas as pd
import numpy as np
import sklearn
from sklearn.preprocessing import LabelEncoder

In [None]:
# Read the training and test CSV files from the working directory.
dataset, test_data = pd.read_csv('./train.csv'), pd.read_csv('./test.csv')

In [None]:
# Column labels of the training frame; the bare name below displays them.
columns = dataset.columns

columns

Index(['ID', 'Candidate', 'Constituency ∇', 'Party', 'Criminal Case',
       'Total Assets', 'Liabilities', 'state', 'Education'],
      dtype='object')

In [None]:
# List the distinct raw values found in the 'Liabilities' column.
dataset['Liabilities'].unique()

array(['2 Crore+', '0', '22 Lac+', '24 Lac+', '61 Lac+', '29 Lac+',
       '35 Lac+', '10 Lac+', '1 Lac+', '15 Lac+', '73 Lac+', '5 Lac+',
       '6 Lac+', '1 Crore+', '4 Lac+', '42 Lac+', '4 Crore+', '3 Lac+',
       '18 Lac+', '17 Crore+', '39 Lac+', '26 Lac+', '8 Lac+', '3 Crore+',
       '7 Crore+', '9 Lac+', '28 Lac+', '14 Lac+', '37 Lac+', '47 Lac+',
       '6 Crore+', '23 Lac+', '90 Lac+', '7 Lac+', '53 Lac+', '75 Lac+',
       '44 Lac+', '60 Thou+', '21 Lac+', '55 Lac+', '17 Lac+', '2 Lac+',
       '49 Lac+', '91 Lac+', '11 Crore+', '16 Lac+', '89 Lac+', '12 Lac+',
       '62 Lac+', '34 Lac+', '37 Crore+', '15 Crore+', '20 Lac+',
       '70 Lac+', '61 Thou+', '32 Lac+', '50 Lac+', '48 Lac+',
       '43 Crore+', '5 Crore+', '31 Lac+', '30 Lac+', '11 Lac+',
       '80 Lac+', '46 Lac+', '60 Lac+', '9 Crore+', '9 Hund+', '30 Thou+',
       '99 Lac+', '46 Thou+', '59 Lac+', '22 Crore+', '95 Lac+',
       '13 Lac+', '96 Lac+', '27 Lac+', '36 Crore+', '19 Lac+', '57 Lac+',
       '82 

In [None]:
# Every column except the last one is a feature; the last column is the target.
X_train, y_train = dataset.iloc[:, :-1], dataset.iloc[:, -1]

In [None]:
# (rows, columns) of the feature frame.
X_train.shape

(2059, 8)

In [None]:
# Length of the target Series, as a 1-tuple.
y_train.shape

(2059,)

In [None]:
# Multiplier for each unit suffix that appears in the asset / liability strings.
_UNIT_MULTIPLIERS = {
    'Crore+': 10**7,
    'Lac+': 10**5,
    'Thou+': 10**3,
    'Hund+': 10**2,
}


def convert_unit(value):
    """Convert an amount string such as ``'22 Lac+'`` into a plain number.

    Parameters
    ----------
    value : str or number
        Text of the form ``'<amount> <unit>'`` where the unit is one of
        ``'Crore+'``, ``'Lac+'``, ``'Thou+'`` or ``'Hund+'``, or the literal
        ``'0'``. Non-string values are converted with ``str()`` first, so
        plain numbers are accepted as well.

    Returns
    -------
    int or float
        ``0`` for the literal ``'0'`` (surrounding whitespace ignored);
        otherwise the amount multiplied by the unit's multiplier, as a float.
        An amount with a missing or unrecognised unit is returned unscaled.

    Raises
    ------
    ValueError
        If the numeric part of ``value`` cannot be parsed as a float.
    """
    # Strip before the zero check so that ' 0 ' is recognised too.
    text = str(value).strip()
    if text == '0':
        return 0

    # The unit is the last space-separated token; a bare number has no unit.
    amount_text, separator, unit = text.rpartition(' ')
    if not separator:
        amount_text, unit = text, ''

    amount = float(amount_text.strip().rstrip('+'))
    return amount * _UNIT_MULTIPLIERS.get(unit, 1)


def clean_data(X_train, categories=None):
    """Return a cleaned, fully numeric copy of a raw candidate frame.

    The input frame is not modified. The function:

    * drops the 'Candidate', 'Constituency ∇' and 'ID' columns,
    * replaces 'Party' and 'state' with integer category codes,
    * converts the 'Total Assets' and 'Liabilities' strings to numbers
      with ``convert_unit``,
    * prints the resulting shape and the first rows.

    Parameters
    ----------
    X_train : pandas.DataFrame
        Raw frame containing the columns listed above.
    categories : dict, optional
        Maps a categorical column name ('Party' / 'state') to the ordered
        list of values that defines its codes. Pass the same lists for every
        frame (e.g. train and test) so a code always means the same value;
        values missing from the list are encoded as -1. For a column without
        an entry the codes are derived from the values present in this frame,
        which is the behaviour when ``categories`` is omitted.

    Returns
    -------
    pandas.DataFrame
        The cleaned copy.

    Raises
    ------
    KeyError
        If one of the columns to drop is missing from ``X_train``.
    """
    # drop() without inplace already returns a new frame, so the caller's
    # frame is left untouched.
    data = X_train.drop(columns=['Candidate', 'Constituency ∇', 'ID'])

    categories = categories or {}
    for col in ('Party', 'state'):
        if col in categories:
            dtype = pd.CategoricalDtype(categories=categories[col])
        else:
            dtype = 'category'
        data[col] = data[col].astype(dtype).cat.codes

    # Apply unit conversion to Total Assets and Liabilities
    for col in ('Total Assets', 'Liabilities'):
        data[col] = data[col].apply(convert_unit)

    print(data.shape)
    # head must be called; printing the bare attribute shows the bound method.
    print(data.head())

    return data


In [None]:
# Clean the training features: clean_data drops the identifier columns,
# encodes 'Party' / 'state' as integer codes and converts the money strings to numbers.
X_data = clean_data(X_train)

(2059, 5)
<bound method NDFrame.head of       Party  Criminal Case  Total Assets  Liabilities  state
0         7              4  2.110000e+09   20000000.0     23
1         4              0  1.000000e+07          0.0     13
2         8              0  7.000000e+07    2200000.0     11
3         4              0  9.000000e+07    2400000.0      3
4         4              2  2.000000e+07    6100000.0     27
...     ...            ...           ...          ...    ...
2054      5              1  6.100000e+06    1000000.0     12
2055      8              0  2.000000e+07     800000.0     21
2056      4              0  1.300000e+08    8500000.0     25
2057     13              1  2.500000e+08    9400000.0     14
2058      4              0  1.100000e+06          0.0      1

[2059 rows x 5 columns]>


In [None]:
# Write the cleaned training features to X_data.csv; index=True also writes the row index.
X_data.to_csv('X_data.csv', index=True)

In [None]:
# Distinct labels present in the training target.
y_train.unique()

array(['8th Pass', '12th Pass', 'Post Graduate', 'Graduate Professional',
       'Graduate', '10th Pass', 'Others', 'Doctorate', 'Literate',
       '5th Pass'], dtype=object)

In [None]:
# Integer code of each education label: its position in the list below.
label_mapping = {
    label: code
    for code, label in enumerate([
        '8th Pass',
        '12th Pass',
        'Post Graduate',
        'Graduate Professional',
        'Graduate',
        '10th Pass',
        'Others',
        'Doctorate',
        'Literate',
        '5th Pass',
    ])
}

# Translate the string targets into their integer codes.
encoded_y_train = y_train.map(label_mapping)

In [None]:
# Display the integer-encoded training labels.
encoded_y_train

0       0
1       1
2       2
3       2
4       0
       ..
2054    3
2055    5
2056    4
2057    1
2058    4
Name: Education, Length: 2059, dtype: int64

In [None]:
# Count the missing values in each column of the cleaned training features.
X_data.isna().sum()

Party            0
Criminal Case    0
Total Assets     0
Liabilities      0
state            0
dtype: int64

In [None]:
# Invert label_mapping (code -> label) and map the encoded training labels back
# to strings. Despite the variable name, the input is encoded_y_train, not model output.
decoded_y_pred = encoded_y_train.map(
    dict(zip(label_mapping.values(), label_mapping.keys()))
)

In [None]:
# Display the decoded labels (decoded from encoded_y_train, i.e. training labels, not predictions).
decoded_y_pred

0                    8th Pass
1                   12th Pass
2               Post Graduate
3               Post Graduate
4                    8th Pass
                ...          
2054    Graduate Professional
2055                10th Pass
2056                 Graduate
2057                12th Pass
2058                 Graduate
Name: Education, Length: 2059, dtype: object

In [None]:
# Apply the same cleaning to the test set.
# NOTE(review): clean_data derives the 'Party' / 'state' integer codes from the values
# present in the frame it is given, so a code here may denote a different party/state
# than the same code in X_data unless both frames contain the same set of values — confirm.
X_test = clean_data(test_data)

(1374, 5)
<bound method NDFrame.head of       Party  Criminal Case  Total Assets  Liabilities  state
0         9              2   700000000.0  110000000.0     14
1         2              1    20000000.0    1300000.0     27
2        13              3   490000000.0   10000000.0     14
3         2              1    20000000.0          0.0     27
4        18              0   160000000.0   20000000.0     25
...     ...            ...           ...          ...    ...
1369      4              0    10000000.0          0.0     27
1370      3              1           0.0    1300000.0     18
1371      8              1    50000000.0          0.0      3
1372      4              1    20000000.0     500000.0      9
1373      4              0    90000000.0          0.0      4

[1374 rows x 5 columns]>


In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# Build a 100-tree random forest with a fixed seed and train it on the cleaned
# training data; fit() returns the estimator itself, so the two steps chain.
model = RandomForestClassifier(random_state=42, n_estimators=100).fit(X_data, encoded_y_train)

# Integer-coded class predictions for the cleaned test features.
y_pred = model.predict(X_test)

In [None]:
# Integer code -> education label (the inverse of the encoding applied to y_train).
decode_label_mapping = {
    0: '8th Pass',
    1: '12th Pass',
    2: 'Post Graduate',
    3: 'Graduate Professional',
    4: 'Graduate',
    5: '10th Pass',
    6: 'Others',
    7: 'Doctorate',
    8: 'Literate',
    9: '5th Pass'
}

# Decode the predictions returned by model.predict(X_test). Use y_pred, the only
# prediction array this notebook defines, so the cell runs on a fresh kernel.
y_pred_final = [decode_label_mapping[pred] for pred in y_pred]

In [None]:
# Display the decoded predictions.
# NOTE(review): this renders the entire list; a slice such as y_pred_final[:10] would keep the output short.
y_pred_final

['Graduate',
 '12th Pass',
 'Graduate',
 'Post Graduate',
 'Graduate',
 'Graduate',
 'Graduate',
 'Post Graduate',
 'Graduate',
 'Post Graduate',
 'Graduate',
 '10th Pass',
 'Graduate',
 'Graduate Professional',
 'Graduate',
 'Graduate',
 'Graduate',
 'Graduate',
 'Graduate',
 'Graduate',
 'Graduate',
 'Graduate',
 'Post Graduate',
 'Graduate',
 'Graduate',
 '12th Pass',
 'Post Graduate',
 '12th Pass',
 'Graduate',
 'Post Graduate',
 'Graduate',
 'Graduate',
 'Post Graduate',
 'Graduate',
 'Post Graduate',
 'Graduate',
 'Post Graduate',
 'Graduate',
 'Graduate',
 'Graduate',
 'Post Graduate',
 'Graduate',
 'Post Graduate',
 'Post Graduate',
 'Graduate Professional',
 'Graduate',
 'Graduate',
 'Post Graduate',
 'Post Graduate',
 'Graduate',
 'Graduate',
 'Graduate',
 'Graduate',
 'Post Graduate',
 'Post Graduate',
 'Graduate',
 'Graduate',
 'Graduate',
 'Graduate',
 'Graduate',
 'Graduate',
 'Graduate',
 'Graduate Professional',
 'Post Graduate',
 'Graduate',
 'Graduate',
 'Graduate',
 

In [None]:
# Pair each prediction with the identifier of the test row it was made for.
# y_pred_final is in test_data row order, so the test set's own 'ID' column is
# used instead of a synthetic 0..n-1 range that is only right if the IDs happen to match it.
submission_df = pd.DataFrame({'ID': test_data['ID'].to_numpy(), 'Education': y_pred_final})
submission_df.to_csv('submission.csv', index=False)