# Dependencies

In [None]:
import pandas as pd
import altair as alt
alt.data_transformers.disable_max_rows()
import matplotlib.pyplot as plt
from xgboost import XGBClassifier

In [None]:
# Load the competition's training split; the path is relative to the
# notebook's working directory.
train_csv_path = "../input/tabular-playground-series-sep-2021/train.csv"
df = pd.read_csv(train_csv_path)

In [None]:
# Show the loaded frame as the cell output for a first look at its columns.
df

Columns f1 to f118 are features that affect the probability that a person will claim the insurance policy. Our task is to find the relationship between the features and the `claim` column.

In [None]:
# Per-column summary statistics, shown as the cell output.
df.describe()

In [None]:
# Missing-value count for one sample feature, next to the 5%-of-rows cut-off
# used to decide whether a column is dropped.
# The cut-off is derived from the frame itself instead of a typed-in row
# count, and both values form the cell output (previously the first
# expression's result was discarded because only the last one is displayed).
f1_missing = df["f1"].isnull().sum()
nan_threshold = 5 * len(df) / 100
f1_missing, nan_threshold

The first thing we'll do is handle the NaN values. Since all the features are numerical, I will fill the NaN entries with the mean of their column. If a column contains more NaN values than 5% of the number of rows, it will be removed instead (5% of 957919 = 47895.95).

In [None]:
# Tabulate how many NaN entries each column of the training frame holds.
column_names = list(df.columns)
number_of_Nan_values = [df[name].isnull().sum() for name in column_names]

null_info = pd.DataFrame(
    {
        "column_names": column_names,
        "number_of_NAN_values": number_of_Nan_values,
    }
)

# Full table first, then only the columns whose NaN count is under the cut-off.
below_threshold = null_info["number_of_NAN_values"] < 47895.95
display(null_info)
display(null_info[below_threshold])

Since all the columns have fewer than 47895.95 null values, we will not need to drop any of them.

In [None]:
# Impute every column's NaN entries with that column's mean.
# The fill is applied to the whole frame and rebound to `df`: calling
# `fillna(..., inplace=True)` on a `df[x]` selection is chained assignment,
# which pandas warns about and which leaves `df` unchanged under Copy-on-Write.
df = df.fillna(df.mean(numeric_only=True))

# Recount the NaN entries per column to confirm the imputation took effect.
column_names = list(df.columns)
number_of_Nan_values = df.isnull().sum().tolist()

null_info = pd.DataFrame({
    "column_names" : column_names,
    "number_of_NAN_values" : number_of_Nan_values
})
display(null_info)
display(null_info[null_info["number_of_NAN_values"] < 47895.95])

Now that we've removed the NaN values, let's check how the two classes of the `claim` column are balanced.

In [None]:
# Count the rows in each target class and draw them as a bar chart so the
# class balance can be judged at a glance.
data = {
    "claimed": int((df["claim"] == 1).sum()),
    "unclaimed": int((df["claim"] == 0).sum()),
}
status = list(data.keys())
count = list(data.values())

# Explicit figure/axes interface: the figure handle is actually used, and the
# labels are attached to a known Axes rather than pyplot's implicit state.
fig, ax = plt.subplots(figsize=(10, 5))
ax.bar(status, count, color="maroon", width=0.1)
ax.set_xlabel("status")
ax.set_ylabel("No.")
ax.set_title("Class difference")  # title spelling corrected
plt.show()

Since the two classes are more or less the same size, we won't have to worry about class imbalance.

Next, we'll attempt to train a model on the dataset. The models we'll be using are Logistic Regression, Random Forests and XGBoost; we'll measure each model's accuracy and AUC score.

In [None]:
from sklearn.metrics import accuracy_score , roc_auc_score
from sklearn.model_selection import train_test_split

# Feature columns are selected by position: everything except the first and
# the last column (presumably the row id and the `claim` target — verify
# against the CSV header).
cols = list(df.columns)[1:-1]
X = df[cols]
y = df["claim"]

# Stratified hold-out split so both parts keep the same class proportions.
train_x, test_x, train_y, test_y = train_test_split(
    X,
    y,
    test_size=0.33,
    stratify=y,
    random_state=42,
)

In [None]:
def score(prediction , probability , true_value):
    """Print the ROC AUC and the accuracy of a classifier's output.

    Args:
        prediction: Predicted class labels; compared with `true_value` to
            compute the accuracy.
        probability: Scores handed to `roc_auc_score` together with
            `true_value`.
        true_value: Ground-truth labels.

    Returns:
        None. Both metrics are written to stdout.
    """
    auc_value = roc_auc_score(true_value, probability)
    accuracy_value = accuracy_score(true_value, prediction)
    print("roc_auc_score :", auc_value)
    print("accuracy :", accuracy_value)

### Logistic Regression Model

In [None]:
from sklearn.linear_model import LogisticRegression

# Fit a default logistic regression on the training split and evaluate it on
# the hold-out split.
LR = LogisticRegression().fit(train_x, train_y)
prediction = LR.predict(test_x)
probability = LR.predict_proba(test_x)

# The last probability column is the score passed to the AUC computation.
last_class_scores = probability[:, -1]
score(prediction, last_class_scores, test_y)

### Random Forests

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Depth-limited random forest evaluated on the hold-out split.
# `random_state` is fixed (same seed as the train/test split) so the reported
# scores are reproducible across runs.
RF = RandomForestClassifier(max_depth=10, random_state=42)
RF.fit(train_x, train_y)
prediction = RF.predict(test_x)
probability = RF.predict_proba(test_x)

# The columns of `predict_proba` follow `RF.classes_`, so for 0/1 labels
# column 1 is P(claim = 1). Column 0 was scored before, which reports the
# AUC of the negative class (1 - AUC) instead of the model's real AUC.
score(prediction, probability[:, 1], test_y)

### XGBoost Classifier

In [None]:
# Gradient-boosted trees evaluated on the hold-out split.
# - `label_encoder` is not an XGBClassifier parameter (the deprecated flag was
#   spelled `use_label_encoder`), so it is dropped rather than passed through
#   as an unknown booster argument.
# - `eval_metric` uses XGBoost's built-in "auc" name instead of sklearn's
#   `roc_auc_score` function object.
model = XGBClassifier(eval_metric="auc", learning_rate=0.1)
model.fit(train_x, train_y)
prediction = model.predict(test_x)
probability = model.predict_proba(test_x)

# Column 1 of `predict_proba` is P(claim = 1); scoring column 0 reports the
# inverted AUC (1 - AUC).
score(prediction, probability[:, 1], test_y)

Since XGBoost has the best score, we'll train it on the entire training dataset and predict the test dataset.

In [None]:
# Load the competition's test split and impute NaN entries with column means.
test_df = pd.read_csv("../input/tabular-playground-series-sep-2021/test.csv")

# Whole-frame fill rebound to `test_df`: `fillna(..., inplace=True)` on a
# `test_df[x]` selection is chained assignment, which pandas warns about and
# which leaves the frame unchanged under Copy-on-Write.
# NOTE(review): the test set is imputed with its own column means, not the
# training means — confirm this is the intended behaviour.
test_df = test_df.fillna(test_df.mean(numeric_only=True))

In [None]:
# Refit XGBoost on the full training data and write the submission file.
# `label_encoder` is not an XGBClassifier parameter and is dropped; the
# built-in "auc" metric name replaces the sklearn function object.
best = XGBClassifier(eval_metric="auc", learning_rate=0.1)
best.fit(X, y)

# The submitted `claim` value must be P(claim = 1), i.e. column 1 of
# `predict_proba`. Column 0 is P(claim = 0) and would invert the ranking.
predictions = best.predict_proba(test_df[cols])[:, 1]

submission = pd.DataFrame(
    {
        "id": test_df["id"].to_numpy(),
        "claim": predictions,
    }
)

submission.to_csv("submission.csv", index = False)