# 1.Uploading the data

In [None]:
!ls 


In [None]:
!pip install kaggle

In [None]:
!kaggle datasets download -d buntyshah/auto-insurance-claims-data 

In [None]:
!unzip \*.zip  && rm *.zip

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from seclea_ai import SecleaAI

In [None]:
df=pd.read_csv('insurance_claims.csv')

In [None]:
df.head()

# 2. Understanding the dataset

In [None]:
df.info()

In [None]:
!pip install seclea-ai

In [None]:
# Create the SecleaAI client for the "Insurance Claim" project.
seclea = SecleaAI(
    project_name="Insurance Claim",
    plat_url="https://tristar-platform.seclea.com",
)

# Description of the dataset that accompanies an upload: no index column,
# `fraud_reported` as the outcome, and the columns to treat as continuous.
dataset_metadata = {
    "index": None,
    "outcome_name": "fraud_reported",
    "continuous_features": [
        "total_claim_amount",
        "policy_annual_premium",
        "capital-gains",
        "capital-loss",
        "injury_claim",
        "property_claim",
        "vehicle_claim",
        "incident_hour_of_the_day",
    ],
}

# seclea.upload_dataset(dataset="insurance_claims.csv", dataset_name="Fraud_Insurance_Claim", metadata=dataset_metadata)

In [None]:
df.head()

In [None]:
#Checking for the missing values

df.isnull().sum()

In [None]:
df.head(100)

In [None]:
def encode_nans(df):
    """Replace the '?' placeholder in the incident columns and return ``df``.

    ``collision_type`` and ``property_damage`` get a real missing value in
    place of '?'; ``police_report_available`` defaults to "NO" instead.
    The columns are reassigned on the frame that was passed in, so the
    caller's frame is modified and the same object is returned.

    Args:
        df: DataFrame containing the three columns named above.

    Returns:
        The same DataFrame with the placeholders replaced.
    """
    # `np.nan` is the supported spelling; the `np.NaN` alias was removed in NumPy 2.0.
    for column in ('collision_type', 'property_damage'):
        df[column] = df[column].replace('?', np.nan)
    # default to no police report present if previously ?
    df['police_report_available'] = df['police_report_available'].replace('?', "NO")
    return df

df = encode_nans(df)
df.head()

In [None]:
# checking for the null Values 
df.isnull().sum()

In [None]:
#Checking for the missing values 
df.isnull().any().any()
# seclea.upload_dataset(dataset="insurance_claims.csv", dataset_name="Fraud_Insurance_Claim", metadata=dataset_metadata)

In [None]:
df.head()

In [None]:
##Check the correlation
df.corr()

In [None]:
## Drop columns that are almost perfectly correlated with an earlier column
threshold = 0.97
def drop_correlated(data, thresh):
    """Drop columns whose absolute correlation with an earlier column is >= ``thresh``.

    Only numeric and boolean columns take part in the correlation; other
    columns are never dropped. For each correlated pair the later column (in
    column order) is removed. The drop happens in place on ``data``.

    Args:
        data: DataFrame to prune.
        thresh: Absolute correlation at or above which a column is redundant.

    Returns:
        The same DataFrame object, with the redundant columns removed.
    """
    # Restrict to numeric/bool columns explicitly: DataFrame.corr() raises on
    # object columns in pandas >= 2.0 instead of silently skipping them.
    numeric = data.select_dtypes(include=['number', 'bool'])
    corr_matrix = numeric.corr().abs()

    # Keep only the strict upper triangle so each pair is inspected once and
    # the diagonal (self-correlation of 1.0) is ignored. The builtin `bool`
    # replaces the `np.bool` alias removed in NumPy 1.24.
    upper_mask = np.triu(np.ones(corr_matrix.shape, dtype=bool), k=1)
    upper = corr_matrix.where(upper_mask)

    # columns with correlation above threshold
    redundant = [column for column in upper.columns if (upper[column] >= thresh).any()]
    # Report the threshold actually applied (the parameter, not a module global).
    print(f'Columns to drop with correlation > {thresh}: {redundant}')
    data.drop(columns=redundant, inplace=True)
    return data

df = drop_correlated(df, threshold)

In [None]:
df.info()

# 3.Data preprocessing 

In [None]:
# Drop columns in which more than 90% of the values are missing
def drop_nulls(df, threshold):
    """Return a copy of ``df`` without the columns that are mostly missing.

    A column is removed when its share of null values is strictly greater
    than ``threshold`` (a fraction between 0 and 1). ``df`` is not modified.
    """
    row_count = df.shape[0]
    too_sparse = []
    for name in df.columns:
        missing_share = df[name].isnull().sum() / row_count
        if missing_share > threshold:
            too_sparse.append(name)
    return df.drop(columns=too_sparse)

df = drop_nulls(df, threshold=0.9)

In [None]:
df.info()

In [None]:
# Categorical data
cat_cols=df.select_dtypes(include=['object']).columns.tolist()
cat_cols


In [None]:
def encode_categorical(df, cat_cols):
    """Label-encode the listed columns of ``df`` in place and return ``df``.

    Every name in ``cat_cols`` that is present in ``df`` is cast with
    ``astype(str)`` and replaced by the integer codes of a fresh
    ``LabelEncoder``; names that are not columns of ``df`` are skipped.
    """
    from sklearn.preprocessing import LabelEncoder

    present = (name for name in cat_cols if name in df.columns)
    for name in present:
        as_text = list(df[name].astype(str).values)
        # One encoder per column: the codes are only meaningful within a column.
        df[name] = LabelEncoder().fit_transform(as_text)
    return df

In [None]:
# Label-encode every remaining object (string) column.
cat_cols = df.select_dtypes(include=['object']).columns.tolist()
df = encode_categorical(df, cat_cols)
# fillna returns a new frame, so the result has to be assigned back; the
# original call discarded it and therefore had no effect.
# NOTE(review): encode_categorical casts values with astype(str) before
# encoding, so these two columns may already be free of NaN at this point.
df = df.fillna({"collision_type": -1, "property_damage": -1})
df.info()


In [None]:
df.head()

In [None]:
# store cleared dataset
df.to_csv('Insurance_cleaned.csv', index=False)

In [None]:
import pandas as pd 
df = pd.read_csv('Insurance_cleaned.csv')
df.head()

In [None]:
# define transformation functions
# The preprocessing steps in the order they were applied above. Each entry is
# either a bare function or a (function, arguments) pair, where the arguments
# are given as a positional list or a keyword dict.
# NOTE(review): presumably consumed by the Seclea upload call - confirm the expected format.
transformations = [encode_nans, (drop_correlated, [0.97]), (drop_nulls, [0.9]), (encode_categorical, {"cat_cols": cat_cols})]


In [None]:
from sklearn.model_selection import train_test_split

# Separate the target column from the predictors.
y = df.fraud_reported
X = df.drop('fraud_reported', axis=1)

# Hold out 20% of the rows for testing; the seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Share of rows labelled 1 in each split, as a percentage.
train_positive = np.round(y_train.value_counts(normalize=True)[1] * 100, 2)
test_positive = np.round(y_test.value_counts(normalize=True)[1] * 100, 2)
print(f'% Positive class in Train = {train_positive}\n% Positive class in Test  = {test_positive}')

In [None]:
### testing without SMOTE

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, recall_score

# Train a baseline on the original (not oversampled) training split.
model = RandomForestClassifier(random_state=42)
model.fit(X_train, y_train)
preds = model.predict(X_test)

# Evaluate: the predictions were previously computed but never scored. The
# same metrics as the SMOTE run are printed so the two can be compared.
print(f'Accuracy = {accuracy_score(y_test, preds):.2f}\nRecall = {recall_score(y_test, preds):.2f}\n')


# 4.Balancing the dataset with SMOTE

In [None]:
from imblearn.over_sampling import SMOTE

# Oversample the training split only; the test split is never resampled.
sm = SMOTE(random_state=42)
X_sm, y_sm = sm.fit_resample(X_train, y_train)

print(f'Shape of X before SMOTE: {X_train.shape}\nShape of X after SMOTE: {X_sm.shape}')


In [None]:
# Standardisation + k-nearest-neighbours
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler

# Fit the scaler on the oversampled training data only, then apply the same
# scaling to both the training and the test features.
scaler = StandardScaler()
X_sm_scaled = scaler.fit_transform(X_sm)
X_test_scaled = scaler.transform(X_test)

# Train on the scaled training data and predict the scaled test data.
knn_scaled = KNeighborsClassifier().fit(X_sm_scaled, y_sm)
preds_scaled = knn_scaled.predict(X_test_scaled)

# Accuracy on the held-out split.
print(accuracy_score(y_test, preds_scaled))


In [None]:
### Random forest trained on the SMOTE-resampled training set
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, recall_score

# Fit on the oversampled data, predict the untouched test split.
model = RandomForestClassifier(random_state=42)
model.fit(X_sm, y_sm)
preds_forest = model.predict(X_test)

# Report accuracy and recall on the test split.
forest_accuracy = accuracy_score(y_test, preds_forest)
forest_recall = recall_score(y_test, preds_forest)
print(f'Accuracy = {forest_accuracy:.2f}\nRecall = {forest_recall:.2f}\n')


# 5.Hyper-parameter Tuning with various models

In [None]:
 ### Modeling
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import GradientBoostingClassifier

# Candidate models with default hyper-parameters, keyed by their class name.
classifiers = {
    type(estimator).__name__: estimator
    for estimator in (
        LogisticRegression(),
        GradientBoostingClassifier(),
        DecisionTreeClassifier(),
    )
}

In [None]:
## 5-fold cross-validated accuracy of each candidate model
from sklearn.model_selection import cross_val_score

for classifier in classifiers.values():
    # Leaves each estimator in `classifiers` fitted on the full resampled
    # training set, as before.
    classifier.fit(X_sm, y_sm)
    training_score = cross_val_score(classifier, X_sm, y_sm, cv=5)
    # Scale to a percentage first and round afterwards: rounding and then
    # multiplying by 100 reintroduces float noise such as 56.99999999999999.
    print("Classifiers: ", classifier.__class__.__name__, "Has a training score of", round(training_score.mean() * 100, 2), "% accuracy score")

In [None]:
 ##Use GridSearchCV to find the best parameters.
from sklearn.model_selection import GridSearchCV


# Logistic Regression
log_reg_params = {"penalty": ['l1', 'l2'], 'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000]}

# The grid contains both 'l1' and 'l2', but the default 'lbfgs' solver rejects
# 'l1', so every l1 candidate failed to fit. 'liblinear' supports both.
grid_log_reg = GridSearchCV(LogisticRegression(solver='liblinear'), log_reg_params)
# Search on the scaled features: the tree grid below and the later
# cross-validation of `log_reg` both use X_sm_scaled.
grid_log_reg.fit(X_sm_scaled, y_sm)
# We automatically get the logistic regression with the best parameters.
log_reg = grid_log_reg.best_estimator_

# DecisionTree Classifier
tree_params = {"criterion": ["gini", "entropy"], "max_depth": list(range(2,4,1)),
              "min_samples_leaf": list(range(5,7,1))}
grid_tree = GridSearchCV(DecisionTreeClassifier(), tree_params)
grid_tree.fit(X_sm_scaled, y_sm)

# tree best estimator
tree_clf = grid_tree.best_estimator_

In [None]:
# Cross-validated accuracy of the tuned models on the scaled, resampled training data

log_reg_score = cross_val_score(log_reg, X_sm_scaled, y_sm, cv=5)
log_reg_pct = round(log_reg_score.mean() * 100, 2)
print('Logistic Regression Cross Validation Score: ', f'{log_reg_pct}%')

tree_score = cross_val_score(tree_clf, X_sm_scaled, y_sm, cv=5)
tree_pct = round(tree_score.mean() * 100, 2)
print('DecisionTree Classifier Cross Validation Score', f'{tree_pct}%')