# Insurance Fraud Analysis

# Importing Libraries

In [None]:
# Imports
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score, confusion_matrix

In [None]:
import warnings
warnings.filterwarnings("ignore")

# Reading Dataset

In [None]:
insurance_data = pd.read_csv("../dataset/Insurance Dataset.csv")
insurance_data.head()

# EDA

In [None]:
insurance_data.info()

In [None]:
# Keeping a copy of the original data set, we might need later
insurance_data_copy = insurance_data.copy()

In [None]:
insurance_data["Weight_baby"].unique()

## Feature Engineering: Encode categorical columns

In [None]:
cat = ["Area_Service", "Hospital County", "Age", "Gender", "Cultural_group", "ethnicity", "Admission_type", "Home or self care,", "apr_drg_description", "Surg_Description", "Abortion", "Emergency dept_yes/No"]

In [None]:
from sklearn.preprocessing import LabelEncoder

label_encoder = LabelEncoder()

In [None]:
# Swap every categorical column for its integer label codes. The single shared
# encoder is re-fitted on each column in turn.
for col_name in cat:
    insurance_data[col_name] = label_encoder.fit_transform(insurance_data[col_name])

In [None]:
# Collapse the open-ended "120 +" bucket to "120" so the column can be parsed as numbers.
# The result is assigned back rather than using inplace=True on a column selection:
# that chained form is not guaranteed to update the parent frame (it is a no-op
# under pandas copy-on-write).
insurance_data["Days_spend_hsptl"] = insurance_data["Days_spend_hsptl"].replace(["120 +"], "120")
insurance_data["Days_spend_hsptl"] = pd.to_numeric(insurance_data["Days_spend_hsptl"])
insurance_data["Days_spend_hsptl"].unique()

In [None]:
insurance_data.info()

In [None]:
insurance_data.dropna(inplace = True)

In [None]:
insurance_data["Mortality risk"] = insurance_data["Mortality risk"].astype(int)
insurance_data["Hospital Id"] = insurance_data["Hospital Id"].astype(int)

In [None]:
insurance_data.info()

In [None]:
X = insurance_data.drop(columns = ["Result"])
y = insurance_data["Result"]

In [None]:
from sklearn.model_selection import train_test_split
Xtrain, Xtest, ytrain, ytest = train_test_split(X, y, test_size = 0.25)

In [None]:
from sklearn.feature_selection import chi2
f_p_values = chi2(Xtrain, ytrain)

In [None]:
f_values = pd.Series(f_p_values[0])
p_values = pd.Series(f_p_values[1])

In [None]:
p_values.index = Xtrain.columns
f_values.index = Xtrain.columns

In [None]:
f_values_d = pd.DataFrame({'Columns': f_values.index, "f_value": f_p_values[0]})


In [None]:
# Rank the features from the largest to the smallest f_value and renumber the rows from 0.
f_values_d = f_values_d.sort_values(by = "f_value", ascending = False).reset_index(drop = True)
f_values_d

In [None]:
p_values_d = pd.DataFrame({'Columns': p_values.index, "p_value": f_p_values[1]})

In [None]:
# Rank the features from the smallest to the largest p_value and renumber the rows from 0.
p_values_d = p_values_d.sort_values(by = "p_value", ascending = True).reset_index(drop = True)
p_values_d

In [None]:
p_values_d[p_values_d["p_value"] < .8]

In [None]:
insurance_data.columns

In [None]:
# Names must match the DataFrame's columns exactly; the frame itself is indexed
# with "Hospital Id" and "Payment_Typology" in other cells of this notebook.
imp_columns = ["Hospital County", "Hospital Id", "Cultural_group", "ethnicity", "Days_spend_hsptl", "Admission_type", "Home or self care,", "ccs_procedure_code", "apr_drg_description", "Code_illness", "Mortality risk", "Surg_Description", "Weight_baby", "Emergency dept_yes/No", "Tot_charg", "Tot_cost", "Result", "Payment_Typology"]

## Feature Engineering 3RD

In [None]:
# A comma was missing after "ccs_diagnosis_code", which made Python concatenate it
# with the next literal into one bogus name. "Hospital Id" is spelled the way the
# DataFrame is indexed in other cells of this notebook.
imp_columns2 = ["Area_Service", "Hospital Id", "Age", "Gender", "Days_spend_hsptl", "Admission_type", "Home or self care,", "ccs_diagnosis_code", "ccs_procedure_code", "apr_drg_description", "Code_illness", "Weight_baby", "Mortality risk", "Surg_Description", "Emergency dept_yes/No", "Tot_charg", "Tot_cost", "Result"]

In [None]:
len(imp_columns2)

In [None]:
# Work on an explicit copy: later cells modify df in place (dropna, drop_duplicates,
# column assignment), which on a plain selection triggers SettingWithCopyWarning and
# may not behave as intended.
df = insurance_data[['Area_Service', 'Age', 'Gender', 'Cultural_group', 'ethnicity', 'Days_spend_hsptl',
        'Admission_type', 'Home or self care,', 'ccs_diagnosis_code',
        'ccs_procedure_code', 'Code_illness', "apr_drg_description", "Hospital County",
       'Mortality risk', 'Surg_Description', 'Emergency dept_yes/No',
       'Tot_charg', 'Tot_cost', 'Payment_Typology', "Result"]].copy()

In [None]:
df.shape

In [None]:
df.dropna(inplace = True)
df.drop_duplicates(inplace = True)

In [None]:
df.shape

In [None]:
# Class balance of the target. The series is passed as x=... because recent seaborn
# versions accept only `data` positionally.
sns.countplot(x = df["Result"])

## Random Over Sampling

In [None]:
X = df.drop(["Result"], axis = 1)
y = df["Result"]

In [None]:
from imblearn.over_sampling import RandomOverSampler
random_over_sampling =  RandomOverSampler(sampling_strategy='minority')
X_ros, y_ros = random_over_sampling.fit_resample(X, y)

In [None]:
from collections import Counter

print('Original dataset shape', Counter(y))
print('Resample dataset shape', Counter(y_ros))

In [None]:
from sklearn.model_selection import train_test_split

Xr_train, Xr_test, yr_train, yr_test = train_test_split(X_ros, y_ros, test_size = 0.2)

In [None]:
Xr_train.shape

In [None]:
# applying decision tree classification
from sklearn.tree import  DecisionTreeClassifier
from sklearn import tree

dt = DecisionTreeClassifier(criterion='entropy',random_state=50)
dt.fit(Xr_train, yr_train)

In [None]:
dt_pred = dt.predict(Xr_test)
# scikit-learn expects (y_true, y_pred): rows are actual classes, columns are predictions.
confusion_matrix(yr_test, dt_pred)

In [None]:
accuracy_score(dt_pred, yr_test)

## Outliers Handling

In [None]:
def handle_outliers(data, method):
    """Replace IQR outliers in a numeric pandas Series.

    A value counts as an outlier when it lies below ``Q1 - 1.5 * IQR`` or above
    ``Q3 + 1.5 * IQR``. NaN values compare false against both fences, so they
    are passed through unchanged.

    Args:
        data: Numeric ``pandas.Series`` to clean.
        method: ``"imputation"`` replaces every outlier with the median of
            ``data``; ``"capping"`` clips every outlier to the fence it crossed.

    Returns:
        A list holding one value per element of ``data``, in the same order.

    Raises:
        ValueError: If ``method`` is neither ``"imputation"`` nor ``"capping"``.
    """
    q1, q3 = data.quantile([0.25, 0.75])
    iqr = q3 - q1

    lower_limit = q1 - (1.5 * iqr)
    upper_limit = q3 + (1.5 * iqr)

    if method == "imputation":
        is_outlier = (data < lower_limit) | (data > upper_limit)
        return data.mask(is_outlier, data.median()).tolist()

    if method == "capping":
        # Cap at the fences rather than at Q1/Q3: pulling an outlier all the way
        # back to a quartile would rank it below values that were never outliers.
        return data.clip(lower = lower_limit, upper = upper_limit).tolist()

    # Fail loudly instead of returning an empty list for a misspelled method name.
    raise ValueError(f"Unknown method {method!r}; expected 'imputation' or 'capping'")

In [None]:
# Build the capped values from df's own columns. df has already had rows removed
# (dropna / drop_duplicates), so a list computed from insurance_data can have a
# different length and fail to assign.
df["Tot_cost"] = handle_outliers(df["Tot_cost"], "capping")
df["Tot_charg"] = handle_outliers(df["Tot_charg"], "capping")

In [None]:
plt.boxplot(df["Tot_cost"])

In [None]:
plt.boxplot(df["Tot_charg"])

In [None]:
genuine = df[df["Result"] == 1]
fraud = df[df["Result"] == 0]

## Random Under Sampling

In [None]:
genuine = genuine.sample(fraud.shape[0])
genuine.shape

In [None]:
new_df = pd.concat([genuine, fraud], axis = 0)

In [None]:
new_df.head()

In [None]:
new_df.shape

In [None]:
# Class balance after under-sampling. Passed as x=... because recent seaborn
# versions accept only `data` positionally.
sns.countplot(x = new_df["Result"])

In [None]:
X = new_df.drop(["Result"], axis = 1)
y = new_df["Result"]

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = .20, random_state = 20)

## Running Algorithms

In [None]:
from sklearn.tree import DecisionTreeClassifier

dt_clf = DecisionTreeClassifier()

dt_clf.fit(X_train, y_train)

In [None]:
from sklearn.metrics import accuracy_score, confusion_matrix

In [None]:
dt_clf_pred = dt_clf.predict(X_test)

# scikit-learn expects (y_true, y_pred): rows are actual classes, columns are predictions.
confusion_matrix(y_test, dt_clf_pred)

In [None]:
accuracy = accuracy_score(dt_clf_pred, y_test)
acc = accuracy*100
print(f"Test Accuracy = {round(acc, 2)}")

In [None]:
accuracy_train = accuracy_score(y_train, dt_clf.predict(X_train))
print(f"Train Accuracy: {round(accuracy_train*100, 2)}")

In [None]:
df.dropna(inplace = True)

In [None]:
df.shape

In [None]:
ins_dt = df.copy()

In [None]:
# Remove repeated rows in place, then show the remaining (rows, columns).
ins_dt.drop_duplicates(inplace = True)
ins_dt.shape

## Random Over Sampling

In [None]:
X = ins_dt.drop(["Result"], axis = 1)
y = ins_dt["Result"]

In [None]:
# import library
from imblearn.over_sampling import RandomOverSampler
from collections import Counter

ros = RandomOverSampler(random_state=42)

# fit predictor and target variable
X_ros, y_ros = ros.fit_resample(X, y)

print('Original dataset shape', Counter(y))
print('Resample dataset shape', Counter(y_ros))

In [None]:
X_ros_train, X_ros_test, y_ros_train, y_ros_test = train_test_split(X_ros, y_ros, test_size = .20)

In [None]:
X_ros_train.shape

In [None]:
from sklearn.ensemble import GradientBoostingClassifier

In [None]:
gradient_clf = GradientBoostingClassifier(
    loss = 'exponential',
    # 'mse' was deprecated in scikit-learn 1.0 and removed in 1.2;
    # 'squared_error' is the same criterion under its current name.
    criterion = 'squared_error',
    n_estimators = 200,
    learning_rate=0.2,

)

In [None]:
gradient_clf.fit(X_ros_train, y_ros_train)

In [None]:
gd_pred = gradient_clf.predict(X_ros_test)
train_pred = gradient_clf.predict(X_ros_train)

In [None]:
print("Train Accuracy: ")
accuracy_score(y_ros_train, train_pred)

In [None]:
print("Test Accuracy: ")
accuracy_score(y_ros_test, gd_pred)

In [None]:
X_ros_train.head()

In [None]:
from sklearn.tree import DecisionTreeClassifier

dt_clf = DecisionTreeClassifier(
    criterion='gini',
)

In [None]:
dt_clf.fit(X_ros_train, y_ros_train)

In [None]:
dt_clf_pred = dt_clf.predict(X_ros_test)

# scikit-learn expects (y_true, y_pred): rows are actual classes, columns are predictions.
confusion_matrix(y_ros_test, dt_clf_pred)

In [None]:
accuracy = accuracy_score(dt_clf_pred, y_ros_test)
acc = accuracy*100
print(f"Test Accuracy = {round(acc, 2)}")

In [None]:
accuracy_train = accuracy_score(y_ros_train, dt_clf.predict(X_ros_train))
print(f"Train Accuracy: {round(accuracy_train*100, 2)}")

In [None]:
from sklearn.ensemble import BaggingClassifier

# Bagging ensemble of decision trees, fitted on the oversampled training split.
bag_clf = BaggingClassifier(
    DecisionTreeClassifier(),
    n_estimators = 300,       ## number of decision trees in the ensemble (300)
    max_samples = 100,         ## each tree is trained on 100 instances drawn from the training set
    #bootstrap = True,         ## left at the library default; presumably True (sampling with replacement) — confirm for the installed scikit-learn
    n_jobs = -1               ## -1 asks for all available CPU cores to be used
)

bag_clf.fit(X_ros_train, y_ros_train)

In [None]:
pred = bag_clf.predict(X_ros_test)
confusion_matrix(y_ros_test, pred)

In [None]:
from sklearn.ensemble import RandomForestClassifier
forest_classifier = RandomForestClassifier(n_estimators = 200, criterion = 'gini', n_jobs = -1)

In [None]:
forest_classifier.fit(X_ros_train, y_ros_train)
y_ros_pred = forest_classifier.predict(X_ros_test)

In [None]:
confusion_matrix(y_ros_pred, y_ros_test)

In [None]:
print(f"Accuracy: {round((accuracy_score(y_ros_pred, y_ros_test)*100), 2)}%")

In [None]:
accuracy_train = accuracy_score(y_ros_train, dt_clf.predict(X_ros_train))
print(f"Train Accuracy: {round(accuracy_train*100, 2)}")

## Neural Network

In [None]:
from sklearn.neural_network import MLPClassifier

In [None]:
# activation : {'identity', 'logistic', 'tanh', 'relu'}
# solver : {'lbfgs', 'sgd', 'adam'}
# learning_rate : {'constant', 'invscaling', 'adaptive'}

clf = MLPClassifier(hidden_layer_sizes = (40,), activation = 'relu', solver = "lbfgs")

In [None]:
from sklearn.model_selection import GridSearchCV

In [None]:
params = [{'activation': ['identity', 'logistic', 'tanh', 'relu'],
          'solver' : ['lbfgs', 'sgd', 'adam'],
          'learning_rate' : ['constant', 'invscaling', 'adaptive']}]

grid_search = GridSearchCV(estimator = clf, param_grid = params, scoring = 'accuracy', cv = 10, n_jobs = -1)

In [None]:
grid_search.fit(X_ros_train, y_ros_train)

In [None]:
help(MLPClassifier)

In [None]:
clf.fit(X_ros_train, y_ros_train)

In [None]:
clf.predict_proba(X_test[:1])

In [None]:
mlp_pred = clf.predict(X_ros_test)

In [None]:
accuracy_score(mlp_pred, y_ros_test)

In [None]:
clf.score(X_test, y_test)

In [None]:
from sklearn.tree import DecisionTreeClassifier

dt_clf = DecisionTreeClassifier(criterion = "entropy", random_state = 50)

dt_clf.fit(X_ros_train, y_ros_train)

In [None]:
from sklearn.metrics import accuracy_score, confusion_matrix


In [None]:
dt_clf_pred = dt_clf.predict(X_ros_test)

# scikit-learn expects (y_true, y_pred): rows are actual classes, columns are predictions.
confusion_matrix(y_ros_test, dt_clf_pred)

In [None]:
accuracy = accuracy_score(dt_clf_pred, y_ros_test)
acc = accuracy*100
print(f"Test Accuracy = {round(acc, 2)}")

In [None]:
accuracy_train = accuracy_score(y_ros_train, dt_clf.predict(X_ros_train))
print(f"Train Accuracy: {round(accuracy_train*100, 2)}")

In [None]:
from sklearn.ensemble import RandomForestClassifier
forest_classifier = RandomForestClassifier(n_estimators = 200, criterion = 'gini', n_jobs = -1)

In [None]:
forest_classifier.fit(X_ros_train, y_ros_train)
y_ros_pred = forest_classifier.predict(X_ros_test)
confusion_matrix(y_ros_pred, y_ros_test)

In [None]:
accuracy = accuracy_score(y_ros_pred, y_ros_test)
acc = accuracy*100
print(f"Test Accuracy = {round(acc, 2)}%")

In [None]:
accuracy_train = accuracy_score(y_ros_train, dt_clf.predict(X_ros_train))
print(f"Train Accuracy: {round(accuracy_train*100, 2)}%")

In [None]:
from sklearn.ensemble import RandomForestClassifier
forest_classifier = RandomForestClassifier(n_estimators = 200, criterion = 'gini', n_jobs = -1)

In [None]:
forest_classifier.fit(X_df_train, y_df_train)
y_df_pred = forest_classifier.predict(X_df_test)
confusion_matrix(y_df_pred, y_df_test)

In [None]:
accuracy = accuracy_score(y_df_pred, y_df_test)
acc = accuracy*100
print(f"Test Accuracy = {round(acc, 2)}%")

In [None]:
accuracy_train = accuracy_score(y_df_train, dt_clf.predict(X_df_train))
print(f"Train Accuracy: {round(accuracy_train*100, 2)}%")

In [None]:
df = insurance_data[["Hospital County", "Hospital Id", "Cultural_group", "ethnicity", "Days_spend_hsptl", "Admission_type", "Home or self care,", "ccs_procedure_code", "apr_drg_description", "Code_illness", "Mortality risk", "Surg_Description", "Weight_baby", "Emergency dept_yes/No", "Tot_charg", "Tot_cost", "Payment_Typology", "Result"]]

In [None]:
df.head()

In [None]:
insurance_data.info()

In [None]:
df.info()

In [None]:
genuine = df[df["Result"] == 1]
fraud = df[df["Result"] == 0]

In [None]:
genuine = genuine.sample(500000)

In [None]:
new_df = pd.concat([genuine, fraud], axis = 0)

In [None]:
new_df.head()

In [None]:
new_df.info()

In [None]:
# Class balance of the sampled frame. Passed as x=... because recent seaborn
# versions accept only `data` positionally.
sns.countplot(x = new_df["Result"])

In [None]:
X = new_df.drop(columns = ["Result"])
y = new_df["Result"]

In [None]:
from collections import Counter

In [None]:
# import library
from imblearn.over_sampling import RandomOverSampler

ros = RandomOverSampler(random_state=42)

# fit predictor and target variable
X_ros, y_ros = ros.fit_resample(X, y)

print('Original dataset shape', Counter(y))
print('Resample dataset shape', Counter(y_ros))

In [None]:
sns.countplot(y_ros)

In [None]:
X_ros.shape

In [None]:
X_ros_train, X_ros_test, y_ros_train, y_ros_test = train_test_split(X_ros, y_ros, test_size = .25)

In [None]:
from sklearn.tree import DecisionTreeClassifier

dt_clf = DecisionTreeClassifier()

dt_clf.fit(X_ros_train, y_ros_train)

In [None]:
from sklearn.metrics import accuracy_score, confusion_matrix

In [None]:
dt_clf_pred = dt_clf.predict(X_ros_test)

# scikit-learn expects (y_true, y_pred): rows are actual classes, columns are predictions.
confusion_matrix(y_ros_test, dt_clf_pred)

In [None]:
accuracy = accuracy_score(dt_clf_pred, y_ros_test)
acc = accuracy*100
print(f"Test Accuracy = {round(acc, 2)}")

In [None]:
accuracy_train = accuracy_score(y_ros_train, dt_clf.predict(X_ros_train))
print(f"Train Accuracy: {round(accuracy_train*100, 2)}")

In [None]:
# Class balance of the target. Passed as x=... because recent seaborn versions
# accept only `data` positionally.
sns.countplot(x = df["Result"])

In [None]:
X_df = df.drop(["Result"], axis = 1)
y_df = df["Result"]
# X_df.head()

In [None]:
# import library
from imblearn.over_sampling import RandomOverSampler

ros2 = RandomOverSampler(random_state=42)

# fit predictor and target variable
X, y = ros2.fit_resample(X_df, y_df)

print('Original dataset shape', Counter(y_df))
print('Resample dataset shape', Counter(y))

In [None]:
X_df_train, X_df_test, y_df_train, y_df_test = train_test_split(X, y, test_size = .30, random_state = 20)

### Random Over Sampling with decision tree

In [None]:
dt_clf.fit(X_df_train, y_df_train)

In [None]:
y_df_pred = dt_clf.predict(X_df_test)
# scikit-learn expects (y_true, y_pred): rows are actual classes, columns are predictions.
confusion_matrix(y_df_test, y_df_pred)

In [None]:
accuracy = accuracy_score(y_df_pred, y_df_test)
acc = accuracy*100
print(f"Test Accuracy = {round(acc, 2)}%")

In [None]:
accuracy_train = accuracy_score(y_df_train, dt_clf.predict(X_df_train))
print(f"Train Accuracy: {round(accuracy_train*100, 2)}%")

In [None]:
dt_clf = DecisionTreeClassifier(criterion = 'gini', max_depth = 150, random_state = 30)

In [None]:
dt_clf.fit(X_df_train, y_df_train)

In [None]:
y_df_pred = dt_clf.predict(X_df_test)
# scikit-learn expects (y_true, y_pred): rows are actual classes, columns are predictions.
confusion_matrix(y_df_test, y_df_pred)

In [None]:
accuracy = accuracy_score(y_df_pred, y_df_test)
acc = accuracy*100
print(f"Test Accuracy = {round(acc, 2)}%")

In [None]:
accuracy_train = accuracy_score(y_df_train, dt_clf.predict(X_df_train))
print(f"Train Accuracy: {round(accuracy_train*100, 2)}%")

### Random Over Sampling with Random Forest Classifier

In [None]:
from sklearn.ensemble import RandomForestClassifier
forest_classifier = RandomForestClassifier(n_estimators = 200, criterion = 'gini', n_jobs = -1)

In [None]:
forest_classifier.fit(X_df_train, y_df_train)
y_df_pred = forest_classifier.predict(X_df_test)
confusion_matrix(y_df_pred, y_df_test)

In [None]:
accuracy = accuracy_score(y_df_pred, y_df_test)
acc = accuracy*100
print(f"Test Accuracy = {round(acc, 2)}%")

In [None]:
accuracy_train = accuracy_score(y_df_train, dt_clf.predict(X_df_train))
print(f"Train Accuracy: {round(accuracy_train*100, 2)}%")

### RandomForestClassifier with RandomOverSampling

In [None]:
from sklearn.ensemble import RandomForestClassifier
forest_classifier = RandomForestClassifier(n_estimators = 300, criterion = 'gini', n_jobs = -1)

In [None]:
forest_classifier.fit(X_df_train, y_df_train)
y_df_pred = forest_classifier.predict(X_df_test)
# scikit-learn expects (y_true, y_pred): rows are actual classes, columns are predictions.
confusion_matrix(y_df_test, y_df_pred)

In [None]:
accuracy = accuracy_score(y_df_pred, y_df_test)
acc = accuracy*100
print(f"Test Accuracy = {round(acc, 2)}%")

In [None]:
accuracy_train = accuracy_score(y_df_train, forest_classifier.predict(X_df_train))
print(f"Train Accuracy: {round(accuracy_train*100, 2)}%")

### Random Forest With 200 estimators and gini

In [None]:
from sklearn.ensemble import RandomForestClassifier
# forest_classifier = RandomForestClassifier(n_estimators = 200, criterion = 'gini', n_jobs = -1)
# fc_model = forest_classifier.fit(X_ros_train, y_ros_train)
# fc_ros_pred = fc_model.predict(X_ros_test)

In [None]:
from sklearn.metrics import confusion_matrix, accuracy_score

In [None]:
# confusion_matrix(fc_ros_pred, y_ros_test)

In [None]:
# accuracy = accuracy_score(fc_ros_pred, y_ros_test)
# acc = accuracy*100
# print(f"Test Accuracy = {round(acc, 2)}")

In [None]:
# accuracy_train = accuracy_score(y_ros_train, fc_model.predict(X_ros_train))
# print(f"Train Accuracy: {round(accuracy_train*100, 2)}")

### Random Forest with 300 estimators and gini

In [None]:
forest_classifier2 = RandomForestClassifier(n_estimators = 300, criterion = 'gini', n_jobs = -1)

In [None]:
fc_model2 = forest_classifier2.fit(X_ros_train, y_ros_train)

In [None]:
fc_ros_pred2 = fc_model2.predict(X_ros_test)

In [None]:
# from sklearn.metrics import confusion_matrix, accuracy_score

In [None]:
# scikit-learn expects (y_true, y_pred): rows are actual classes, columns are predictions.
confusion_matrix(y_ros_test, fc_ros_pred2)

In [None]:
accuracy2 = accuracy_score(fc_ros_pred2, y_ros_test)
acc2 = accuracy2*100
print(f"Test Accuracy = {round(acc2, 2)}")

In [None]:
accuracy_train2 = accuracy_score(y_ros_train, fc_model2.predict(X_ros_train))
print(f"Train Accuracy: {round(accuracy_train2*100, 2)}")

### Random Forest with 200 estimators and entropy

In [None]:
forest_classifier3 = RandomForestClassifier(n_estimators = 200, criterion = 'entropy', n_jobs = -1)

In [None]:
fc_model3 = forest_classifier3.fit(X_ros_train, y_ros_train)

In [None]:
fc_ros_pre3 = fc_model3.predict(X_ros_test)

In [None]:
from sklearn.metrics import confusion_matrix, accuracy_score

In [None]:
confusion_matrix(fc_ros_pred3, y_ros_test)

In [None]:
accuracy3 = accuracy_score(fc_ros_pred3, y_ros_test)
acc3 = accuracy3*100
print(f"Test Accuracy = {round(acc3, 2)}")

In [None]:
accuracy_train3 = accuracy_score(y_ros_train, fc_model3.predict(X_ros_train))
print(f"Train Accuracy: {round(accuracy_train3*100, 2)}")

In [None]:
insurance_data.info()

## All the values which are greater than .1.

### We will take these columns for now.

In [None]:
fe_insurance_data = insurance_data[["Area_Service", "Age", "Gender", "Cultural_group", "ethnicity", "Days_spend_hsptl", "Admission_type", "Tot_charg", "Tot_cost", "Result"]]

fe_insurance_data.head()

In [None]:
fe_insurance_data.head()

In [None]:
# Class balance of the target. Passed as x=... because recent seaborn versions
# accept only `data` positionally.
sns.countplot(x = fe_insurance_data["Result"])

In [None]:
X = fe_insurance_data.drop(columns = ["Result"])
y = fe_insurance_data["Result"]

In [None]:
# import library
from imblearn.over_sampling import SMOTE

smote = SMOTE()

# fit predictor and target variable
x_smote, y_smote = smote.fit_resample(X, y)

In [None]:
plt.figure(figsize = (4, 5))
sns.countplot(y_smote)

In [None]:
from sklearn.model_selection import train_test_split

from sklearn.metrics import accuracy_score, confusion_matrix

xs_train, xs_test, ys_train, ys_test = train_test_split(x_smote, y_smote, test_size = .30, random_state = 20)

In [None]:
from sklearn.ensemble import RandomForestClassifier

from sklearn.model_selection import cross_validate

rf_model = RandomForestClassifier(n_estimators = 100, criterion = 'gini', n_jobs = -1, random_state = 50)

In [None]:
rf_model.fit(xs_train, ys_train)

In [None]:
score = cross_validate(rf_model, xs_train, ys_train)

In [None]:
score

In [None]:
accuracy_score(ys_train, rf_model.predict(xs_train))

In [None]:
rf_pred = rf_model.predict(xs_test)

acc = accuracy_score(ys_test, rf_pred)

print(f"Test Accuracy : {round(acc*100, 2)}%")

In [None]:
from sklearn.tree import DecisionTreeClassifier

In [None]:
def dec_tree(criter, max_depth):
    """Measure hold-out accuracy of decision trees of growing depth.

    For each depth 2, 4, 6, ... that does not exceed ``max_depth``, a
    ``DecisionTreeClassifier`` using split criterion ``criter`` (and
    ``random_state=50``) is fitted on the notebook-level ``xs_train`` /
    ``ys_train`` and scored on ``xs_test`` / ``ys_test``.

    Returns:
        List of accuracy scores, one per depth, in increasing depth order.
    """
    def _score_at(depth):
        # Fit a fresh tree at this depth and score it on the hold-out split.
        tree_model = DecisionTreeClassifier(criterion = criter, max_depth = depth, random_state=50)
        fitted = tree_model.fit(xs_train, ys_train)
        return accuracy_score(fitted.predict(xs_test), ys_test)

    return [_score_at(depth) for depth in range(2, max_depth+1, 2)]

In [None]:
gini_tree = dec_tree("gini", 101)
entr_tree = dec_tree("entropy", 101)

In [None]:
acc_df = pd.DataFrame({"max_depth": range(2, 101, 2), "gini_accuracy": gini_tree, "entr_accuracy": entr_tree})

In [None]:
acc_df

In [None]:
from collections import Counter

In [None]:
# import library
from imblearn.over_sampling import RandomOverSampler

ros = RandomOverSampler(random_state=42)

# fit predictor and target variable
x_ros, y_ros = ros.fit_resample(X, y)

print('Original dataset shape', Counter(y))
print('Resample dataset shape', Counter(y_ros))

In [None]:
xos_train, xos_test, yos_train, yos_test = train_test_split(x_ros, y_ros, test_size = 0.3, random_state = 50)

In [None]:
rf_model.fit(xos_train, yos_train)

In [None]:
score = cross_validate(rf_model, xos_train, yos_train)

In [None]:
score

In [None]:
accuracy_score(yos_train, rf_model.predict(xos_train))

In [None]:
rf_pred2 = rf_model.predict(xos_test)

acc2 = accuracy_score(yos_test, rf_pred2)

print(f"Test Accuracy : {round(acc2*100, 2)}%")

In [None]:
random_forest_model = RandomForestClassifier(n_estimators = 300, criterion = 'gini', n_jobs = -1, random_state = 50)

In [None]:
random_forest_model.fit(xos_train, yos_train)

In [None]:
score = cross_validate(random_forest_model, xos_train, yos_train)

In [None]:
score

In [None]:
accuracy_score(yos_train, random_forest_model.predict(xos_train))

In [None]:
rf_pred2 = random_forest_model.predict(xos_test)

acc2 = accuracy_score(yos_test, rf_pred2)

print(f"Test Accuracy : {round(acc2*100, 2)}%")