# Part 0
First, let's read the data and do some preparation before we start.

In [None]:
import numpy as np
import pandas as pd

In [None]:
# Load the training set and discard every row that has a missing value.
data = pd.read_csv("../input/health-insurance-cross-sell-prediction/train.csv")
data = data.dropna()

# Load the test set as-is (no rows are dropped here).
testData = pd.read_csv("../input/health-insurance-cross-sell-prediction/test.csv")

In [None]:
# The column to predict is the one that exists in the training frame but not
# in the test frame; take the first element of that set difference.
extraColumns = set(data.columns).difference(testData.columns)
predictionParameter = [*extraColumns][0]
print(predictionParameter)

# Part 1
Before building a prediction model, let's preprocess all the data.

In [None]:
# Show the first rows of the training frame.
data.head()

In [None]:
# Show the first rows of the test frame.
testData.head()

In [None]:
# Print the test frame's column summary (dtypes and non-null counts).
testData.info()

In [None]:
# Collect the distinct values of each categorical column in the training
# frame, in order of first appearance. A value's position in its list is
# later used as its numeric code.
genders = list(pd.unique(data["Gender"]))
VAs = list(pd.unique(data["Vehicle_Age"]))
VDs = list(pd.unique(data["Vehicle_Damage"]))

In [None]:
# Encode every categorical column as the position of its value in the
# category list built from the training frame, so that the training and test
# frames share exactly the same coding.
categoryLevels = {
    "Gender": genders,
    "Vehicle_Age": VAs,
    "Vehicle_Damage": VDs,
}

for column, levels in categoryLevels.items():
    codes = {level: code for code, level in enumerate(levels)}
    # Assign whole columns instead of using chained indexing
    # (frame[column][mask] = value): chained assignment may write to a
    # temporary copy, emits SettingWithCopyWarning, and has no effect at all
    # when pandas Copy-on-Write is enabled.
    # NOTE: a value that is absent from the training categories becomes NaN.
    testData[column] = testData[column].map(codes)
    data[column] = data[column].map(codes)

In [None]:
# Show the first rows of the training frame again (this cell follows the
# categorical-encoding cell).
data.head()

In [None]:
# Show the first rows of the test frame again (this cell follows the
# categorical-encoding cell).
testData.head()

In [None]:
# DataFrame.astype returns a new frame and leaves the original untouched, so
# the result has to be assigned back for the conversion to take effect.
# The "Response" label column keeps its original dtype so that it remains a
# class label rather than a float feature.
data = data.astype(
    {column: "float" for column in data.columns if column != "Response"}
)
testData = testData.astype("float")

In [None]:
# Separate the label column from the features: "Response" is the target,
# every other column of the training frame is a feature.
y = data["Response"]
X = data.drop(columns="Response")

In [None]:
# Per-column mean and sample standard deviation of the training features,
# used below to standardise X.
XMeans, XSTDs = X.mean(), X.std()

In [None]:
# Standardise the test frame with the statistics of the TRAINING features.
# Using the test set's own mean/std would place it in a different feature
# space from the one the model is fitted on. The subtraction and division
# that use these Series align on column labels.
testDataMeans = XMeans
testDataSTDs = XSTDs

In [None]:
# Standardise every training feature column: subtract its mean, then divide
# by its standard deviation.
X = X.sub(XMeans).div(XSTDs)
X.head()

In [None]:
# Standardise every test column with the mean / standard deviation Series
# prepared for the test frame.
testData = testData.sub(testDataMeans).div(testDataSTDs)
testData.head()

# Part 2
Model selection

The data is now preprocessed. Let's build some prediction models to solve the task. We will build and compare a logistic regression model and a decision tree model.

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn import tree
from sklearn import metrics
from sklearn.model_selection import train_test_split

To begin, let's split the training data into train and test subsets.

In [None]:
# Hold out 33% of the labelled data for evaluation.
# random_state fixes the shuffle so the reported metrics are reproducible
# between runs; stratify=y keeps the class proportions of the label the same
# in both subsets.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=0, stratify=y
)

First, let's build and test a logistic regression model.

In [None]:
# Create a logistic regression classifier with default settings and fit it
# on the training subset (fit returns the fitted estimator itself).
logRegModel = LogisticRegression()
logRegModel = logRegModel.fit(X_train, y_train)

In [None]:
# Report the share of held-out samples the logistic model labels correctly.
print("Accuracy")
logRegPredictions = logRegModel.predict(X_test)
print(metrics.accuracy_score(y_test, logRegPredictions))

In [None]:
# ROC AUC measures how well the model RANKS samples, so it needs a continuous
# score. Hard 0/1 labels from predict() collapse the ROC curve to a single
# operating point and misreport the metric. Column 1 of predict_proba is the
# probability of the second entry of classes_ (the positive class for 0/1
# labels).
print("ROC AUC")
logRegScores = logRegModel.predict_proba(X_test)[:, 1]
print(metrics.roc_auc_score(y_test, logRegScores))

Now let's build a decision tree.

In [None]:
# Create a decision tree classifier with default settings and fit it on the
# training subset (fit returns the fitted estimator itself).
treeModel = tree.DecisionTreeClassifier()
treeModel = treeModel.fit(X_train, y_train)

In [None]:
# Report the share of held-out samples the decision tree labels correctly.
print("Accuracy")
treePredictions = treeModel.predict(X_test)
print(metrics.accuracy_score(y_test, treePredictions))

In [None]:
# ROC AUC measures how well the model RANKS samples, so it needs a continuous
# score. Hard 0/1 labels from predict() collapse the ROC curve to a single
# operating point and misreport the metric. Column 1 of predict_proba is the
# probability of the second entry of classes_ (the positive class for 0/1
# labels).
print("ROC AUC")
treeScores = treeModel.predict_proba(X_test)[:, 1]
print(metrics.roc_auc_score(y_test, treeScores))

# Part 3
Prediction

In [None]:
# Assemble the output frame. The ids are re-read from the raw test CSV
# because testData no longer holds the original values after preprocessing;
# the labels come from the fitted decision tree.
prediction = pd.DataFrame(
    {
        "id": pd.read_csv("../input/health-insurance-cross-sell-prediction/test.csv")["id"],
        "Response": treeModel.predict(testData),
    }
)
prediction.head()

In [None]:
# Write only the "id" and "Response" columns. Without index=False pandas also
# writes the row index as an extra, unnamed first column.
prediction.to_csv("prediction.csv", index=False)