In [None]:
import shap
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, GridSearchCV, KFold
from sklearn.metrics import accuracy_score, f1_score, classification_report

In [None]:
# Survey answers (semicolon-separated) and the question reference table (pipe-separated).
data = pd.read_csv(
    '../input/divorce-prediction/divorce_data.csv',
    sep=';',
)
reference = pd.read_csv(
    '../input/divorce-prediction/reference.tsv',
    sep='|',
)

In [None]:
# Number of columns containing at least one missing value.
# `isna` is an alias of `isnull`, so both lines report the same figure.
null_column_count = data.isnull().any().sum()
na_column_count = data.isna().any().sum()
print(null_column_count)
print(na_column_count)

In [None]:
# Take both the class labels and their counts from ONE value_counts() Series.
# Previously the labels came from unique() (order of first appearance) and the
# heights from value_counts() (sorted by frequency), so a bar could end up
# under the wrong class label.
divorce_counts = data['Divorce'].value_counts().sort_index()
dist = sns.barplot(x=divorce_counts.index, y=divorce_counts.values)
dist.set(xlabel = 'Divorce', ylabel = 'Count', title = 'Distribution of Divorce categories')

In [None]:
# Pairwise correlation between every column of `data`, drawn on a wide canvas.
fig, ax = plt.subplots(figsize=(15, 10))
sns.heatmap(data.corr(), cmap="YlGnBu", ax=ax)
plt.show()

In [None]:
# Separate the predictors from the 'Divorce' target, then hold out 30% for testing.
# random_state pins the shuffle so every metric below is reproducible on
# Restart & Run All; stratify keeps the class ratio equal in both partitions.
features = data.drop(columns='Divorce')
target = data['Divorce']
train_x, test_x, train_y, test_y = train_test_split(
    features,
    target,
    test_size=0.3,
    random_state=42,
    stratify=target,
)

# RandomForestClassifier

In [None]:
# Baseline random forest with default hyper-parameters.
# The seed fixes bootstrap sampling and feature sub-sampling so that the
# reported scores do not change between runs.
forest = RandomForestClassifier(random_state=42)
forest.fit(train_x, train_y)

# predict_proba columns are ordered like forest.classes_; column 1 is the
# second class (assumed to be the "divorced" label — confirm against the data).
preds = forest.predict_proba(test_x)[:,1]

In [None]:
# Threshold the forest probabilities at 0.5 to get hard predictions, then score them.
preds_bool = np.greater_equal(preds, 0.5)
accuracy = accuracy_score(y_true=test_y, y_pred=preds_bool)
print('Accuracy :', accuracy)

In [None]:
# F1 of the thresholded forest predictions on the held-out rows.
f1_value = f1_score(y_true=test_y, y_pred=preds_bool)
print('F1 Score :', f1_value)

In [None]:
# Per-class precision / recall / F1 for the forest on the held-out rows.
forest_report = classification_report(y_true=test_y, y_pred=preds_bool)
print('Classification Report:\n', forest_report)

In [None]:
explainer = shap.TreeExplainer(forest)
shap_value = explainer.shap_values(data)

In [None]:
shap.summary_plot(shap_value, data, plot_type="bar", plot_size = (25,10))

# Hyperparameter tuning with GridSearchCV

In [None]:
# Candidate hyper-parameter values for the random-forest search:
# depths 1..19, and even values 2..18 for both minimum-sample settings.
param_grid = dict(
    max_depth=np.arange(1, 20),
    min_samples_split=np.arange(2, 20, 2),
    min_samples_leaf=np.arange(2, 20, 2),
)

In [None]:
# Exhaustive 5-fold search over `param_grid`, ranked by accuracy.
# The forest is seeded so that differences between candidates reflect the
# hyper-parameters rather than random bootstrap noise, and so the selected
# parameters are repeatable. n_jobs=-1 spreads the fits over all CPU cores;
# it does not affect the results.
grid = GridSearchCV(
    estimator=RandomForestClassifier(random_state=42),
    param_grid=param_grid,
    scoring="accuracy",
    cv=5,
    n_jobs=-1,
)
grid

In [None]:
# Run the cross-validated search on the training partition only.
grid.fit(X=train_x, y=train_y)

In [None]:
# Parameter combination with the best mean cross-validated score.
best_params = grid.best_params_
print(f'Best parameters {best_params}')

# DecisionTreeClassifier

In [None]:
# Single decision tree as a simpler comparison model.
# The seed fixes the feature order used when candidate splits tie, so the
# fitted tree and its scores are reproducible.
tree = DecisionTreeClassifier(random_state=42)
tree.fit(train_x, train_y)

# Column 1 of predict_proba is the second entry of tree.classes_.
preds_tree = tree.predict_proba(test_x)[:,1]

In [None]:
# Threshold the tree probabilities at 0.5 to get hard predictions, then score them.
tree_bool = np.greater_equal(preds_tree, 0.5)
accuracy = accuracy_score(y_true=test_y, y_pred=tree_bool)
print('Accuracy :', accuracy)

In [None]:
# F1 of the thresholded tree predictions on the held-out rows.
f1_value = f1_score(y_true=test_y, y_pred=tree_bool)
print('F1 Score :', f1_value)

In [None]:
# Per-class precision / recall / F1 for the decision tree on the held-out rows.
tree_report = classification_report(y_true=test_y, y_pred=tree_bool)
print('Classification Report:\n', tree_report)