# In [None]:
import os
import csv
import pandas as pd
from sklearn.feature_selection import mutual_info_classif
import numpy as np
from matplotlib import pyplot as plt 
from sklearn.ensemble import RandomForestClassifier

# Turn off pandas' chained-assignment warning (the option's default is 'warn').
pd.set_option('mode.chained_assignment', None)

# Snapshot of the entries in the current working directory.
file = os.listdir('.')
# Training data; expected to sit next to this script.
df = pd.read_csv('train.csv')

def _bucketize(values, width, n_buckets):
    """Replace each value lying in ``[i*width, (i+1)*width]`` with index ``i``.

    Buckets are applied in increasing order of ``i``, so a value sitting
    exactly on an edge shared by two buckets lands in the lower one.  Values
    that fall outside every bucket are returned unchanged.
    """
    for i in range(n_buckets):
        # An already assigned index ``i`` is always below the lower edge of any
        # later bucket (for width >= 1), so re-testing the partially bucketed
        # series cannot move a value a second time.
        in_bucket = (values >= i * width) & (values <= (i + 1) * width)
        values = values.mask(in_bucket, i)
    return values


def preprocess(df):
    """Convert a raw data frame into numeric features plus an optional target.

    Steps performed on a copy of ``df`` (the input frame is not modified):

    * drop ``id``, ``Policy_Sales_Channel`` and ``Region_Code`` (and
      ``Response`` when present);
    * replace the strings ``'Male'``/``'Female'`` with ``1``/``0`` wherever
      they occur in the frame;
    * encode ``Vehicle_Age`` as 0/1/2 and ``Vehicle_Damage`` as 0/1;
    * bucket ``Vintage`` (width 150, 2 buckets), ``Age`` (width 5,
      17 buckets) and ``Annual_Premium`` (width 300000, 2 buckets).

    Returns:
        A ``(X, target)`` tuple.  ``target`` is ``df['Response']`` when that
        column exists, otherwise an empty NumPy array.
    """
    dropped = ['id', 'Policy_Sales_Channel', 'Region_Code']
    if 'Response' in df:
        target = df['Response']
        dropped = ['Response'] + dropped
    else:
        target = np.array([])

    X = df.drop(columns=dropped).replace(['Male', 'Female'], [1, 0])
    X['Vehicle_Age'] = X['Vehicle_Age'].replace(
        ['> 2 Years', '1-2 Year', '< 1 Year'], [2, 1, 0]
    )
    X['Vehicle_Damage'] = X['Vehicle_Damage'].replace(['Yes', 'No'], [1, 0])

    # Each column is bucketed against its OWN values.  The previous version
    # compared the upper bound of Age and Annual_Premium against the (already
    # bucketed) Vintage column, which collapsed both features to bucket 0.
    X['Vintage'] = _bucketize(X['Vintage'], 150, 300 // 150)
    X['Age'] = _bucketize(X['Age'], 5, 85 // 5)
    X['Annual_Premium'] = _bucketize(X['Annual_Premium'], 300000, 600000 // 300000)

    return X, target

# Split the preprocessed training frame into features and labels.
train = preprocess(df)
X, target = train

# Score every feature by mutual information with the target (all features are
# declared discrete), then order the scores from highest to lowest.
mi_scores = pd.Series(
    mutual_info_classif(X, target, discrete_features=True),
    index=X.columns,
    name="MI Scores",
).sort_values(ascending=False)
# Feature names in descending score order.
mi_name = mi_scores.index.tolist()
# print(mi_name)
def plot_mi_scores(scores):
    """Draw a horizontal bar chart of the given scores on the current figure.

    ``scores`` is sorted ascending before plotting, bars are placed at
    positions 0..n-1 in that order, and each bar is labelled with its index
    entry.  The function neither creates nor shows the figure.
    """
    ordered = scores.sort_values(ascending=True)
    positions = np.arange(len(ordered))
    labels = ordered.index.tolist()
    plt.barh(positions, ordered)
    plt.yticks(positions, labels)
    plt.title("Mutual Information Scores")

# plt.figure(dpi=100, figsize=(8, 5))
# plot_mi_scores(mi_scores)
# plt.show()

# Keep only the three features with the highest mutual-information scores;
# the same selection is applied to the training and the test frame.
top_features = mi_name[:3]
X = X[top_features]
clf = RandomForestClassifier(n_jobs=-1, random_state=0)
# np.asarray(...).ravel() gives a flat label array whether ``target`` is a
# Series or an ndarray, without calling the deprecated Series.ravel().
clf.fit(X, np.asarray(target).ravel())

# Run the test set through the same preprocessing and feature selection.
test = pd.read_csv('test.csv')
test_input = preprocess(test)[0]
test_input = test_input[top_features]
result = pd.DataFrame(clf.predict(test_input))

# The predictions are written back into the same file the template was read
# from, overwriting its 'Response' column.
temp = pd.read_csv('sample_submission.csv')
temp['Response'] = result
temp.to_csv(r'sample_submission.csv', index=False)