## Import Packages

In [1]:
import numpy as np 
import pandas as pd 
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix, classification_report
import matplotlib.pyplot as plt
import seaborn as sns

import pickle

## Prepare Data

In [2]:
df_org = pd.read_csv("insurance.csv")
df_org.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


In [3]:
df_org.info();

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1338 entries, 0 to 1337
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       1338 non-null   int64  
 1   sex       1338 non-null   object 
 2   bmi       1338 non-null   float64
 3   children  1338 non-null   int64  
 4   smoker    1338 non-null   object 
 5   region    1338 non-null   object 
 6   charges   1338 non-null   float64
dtypes: float64(2), int64(2), object(3)
memory usage: 73.3+ KB


In [4]:
df_org.describe()

Unnamed: 0,age,bmi,children,charges
count,1338.0,1338.0,1338.0,1338.0
mean,39.207025,30.663397,1.094918,13270.422265
std,14.04996,6.098187,1.205493,12110.011237
min,18.0,15.96,0.0,1121.8739
25%,27.0,26.29625,0.0,4740.28715
50%,39.0,30.4,1.0,9382.033
75%,51.0,34.69375,2.0,16639.912515
max,64.0,53.13,5.0,63770.42801


In [5]:
df = df_org.copy()

## Handle Missing Data 

In [6]:
df.isna().sum()/len(df)*100

age         0.0
sex         0.0
bmi         0.0
children    0.0
smoker      0.0
region      0.0
charges     0.0
dtype: float64

In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1338 entries, 0 to 1337
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       1338 non-null   int64  
 1   sex       1338 non-null   object 
 2   bmi       1338 non-null   float64
 3   children  1338 non-null   int64  
 4   smoker    1338 non-null   object 
 5   region    1338 non-null   object 
 6   charges   1338 non-null   float64
dtypes: float64(2), int64(2), object(3)
memory usage: 73.3+ KB


In [8]:
print(df['sex'].unique())
print(df['region'].unique())
print(df['smoker'].unique())

['female' 'male']
['southwest' 'southeast' 'northwest' 'northeast']
['yes' 'no']


In [9]:
df= df.dropna(subset = ['age', 'bmi','children', 'charges'])
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1338 entries, 0 to 1337
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       1338 non-null   int64  
 1   sex       1338 non-null   object 
 2   bmi       1338 non-null   float64
 3   children  1338 non-null   int64  
 4   smoker    1338 non-null   object 
 5   region    1338 non-null   object 
 6   charges   1338 non-null   float64
dtypes: float64(2), int64(2), object(3)
memory usage: 73.3+ KB


## Categotical Encoding

In [10]:
df.columns

Index(['age', 'sex', 'bmi', 'children', 'smoker', 'region', 'charges'], dtype='object')

In [11]:
print(df['sex'].unique())
print(df['region'].unique())
print(df['smoker'].unique())

['female' 'male']
['southwest' 'southeast' 'northwest' 'northeast']
['yes' 'no']


In [12]:
#Encoding the object columns.
from sklearn.preprocessing import LabelEncoder

# sex_encoder = LabelEncoder().fit(['female', 'male'])
sex_encoder = LabelEncoder().fit(df['sex'])

# region_encoder = LabelEncoder().fit(['southwest', 'southeast', 'northwest','northeast'])
region_encoder = LabelEncoder().fit(df['region'])

# smoker_encoder = LabelEncoder().fit(['yes', 'no'])
smoker_encoder = LabelEncoder().fit(df['smoker'])

In [13]:
sex_encoder.classes_

array(['female', 'male'], dtype=object)

In [14]:
#Encode categorical features
df['sex'] = sex_encoder.transform(df['sex'])
df['region'] = region_encoder.transform(df['region'])
df['smoker'] = smoker_encoder.transform(df['smoker'])

In [15]:
df.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,0,27.9,0,1,3,16884.924
1,18,1,33.77,1,0,2,1725.5523
2,28,1,33.0,3,0,2,4449.462
3,33,1,22.705,0,0,1,21984.47061
4,32,1,28.88,0,0,1,3866.8552


## Train/Test Data Split

In [16]:
# drop charges no need

X = df.drop(['smoker','charges'], axis=1)
y = df['smoker']

In [17]:
# Split data into features and target
# Split data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)



## Model Selection and Hyperparameter Tuning

In [18]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score


scaler = StandardScaler()
# Standardize features (important for SVMs)
X_train_std = scaler.fit_transform(X_train) 
X_test_std = scaler.transform(X_test)

In [19]:
# Define the pipeline
pipeline = Pipeline(steps=[
    ('scaler', StandardScaler()),  # Normalization step
    ('classifier', KNeighborsClassifier())  # You can replace this with any other classifier
])

# Define the parameter grid to search
param_grid = {
    'classifier__n_neighbors': [3, 5, 7, 9 , 11 ,13],  # Example values for 'n_neighbors'
    # Add more hyperparameters to search and their possible values
    # 'classifier__parameter_name': [value1, value2, ...],
}

# Create GridSearchCV
grid_search = GridSearchCV(pipeline, param_grid, cv=5)  # You can adjust cv (cross-validation) as needed

# Fit the pipeline with GridSearchCV
grid_search.fit(X_train, y_train)

# Access the best parameters and best estimator
best_params = grid_search.best_params_
best_estimator = grid_search.best_estimator_

# Now you can use the best estimator for predictions
# Define the pipeline
model_knn = best_estimator

# Fit the pipeline
model_knn.fit(X_train, y_train)

In [20]:
pipeline = Pipeline([
    ('scaler', StandardScaler()), 
    ('model', LogisticRegression())
])

# Define the parameter grid
param_grid = {
 'model__solver': ['liblinear', 'lbfgs', 'sag', 'saga'], 
 'model__C': [0.001, 0.01, 0.1, 1, 10, 100],
}


# Create GridSearchCV object with the pipeline
grid_search = GridSearchCV(pipeline , param_grid , cv=5, scoring='accuracy')
# Fit the model to the data
grid_search.fit(X_train , y_train)

best_params = grid_search.best_params_
best_estimator = grid_search.best_estimator_

# Now you can use the best estimator for predictions
# Define the pipeline
model_logistic = best_estimator

# Fit the pipeline
model_logistic.fit(X_train, y_train)

In [21]:
# use scalar

# Define the parameter grid for grid search
param_grid = {'C': [0.001, 0.1, 1, 10, 100, ],
'gamma': [0.1, 0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9, 1, 10]}

# Create an SVM model with a sigmoid kernel
svm_model = SVC(kernel='sigmoid')

# Perform grid search
grid_search = GridSearchCV(svm_model ,param_grid ,cv=5, scoring='accuracy', n_jobs=-1)
grid_search.fit(X_train_std , y_train)

# Extract results from grid search
results = grid_search.cv_results_
c_values = np.array(results['param_C'].data, dtype=float)
gamma_values = np.array(results['param_gamma'].data, dtype=float)
accuracy_scores = results['mean_test_score']

# Reshape results for plotting
# heatmap_data = accuracy_scores.reshape(len(param_grid['C']), len( param_grid['gamma']))

# Plot the accuracy heatmap
# plt.figure(figsize=(12, 8))
# sns.heatmap(heatmap_data, annot=True, cmap='viridis', fmt=".3f",xticklabels=param_grid['gamma'],yticklabels=param_grid['C']) 
# plt.title('Accuracy Heatmap for SVM with Sigmoid Kernel')
# plt.xlabel('Gamma') 
# plt.ylabel('C') 
# plt.show()

# Get the best parameters from grid search
# best_params = grid_search.best_params_ 
# print(f"Best Parameters: {best_params}")

best_model = grid_search.best_estimator_ 
best_params = grid_search.best_params_

print(best_model)
print(best_params)

# Now you can use the best estimator for predictions
# Define the pipeline
model_svm = best_model

# Fit the pipeline
model_svm.fit(X_train_std, y_train)


# Evaluate the best model on the test set
# best_model = grid_search.best_estimator_ 
# predictions = best_model.predict(X_test) 
# accuracy = accuracy_score(y_test , predictions) 
# print(f"Accuracy on Test Set: {accuracy:.3f}")

SVC(C=0.001, gamma=0.1, kernel='sigmoid')
{'C': 0.001, 'gamma': 0.1}


## Model Evaluation

In [22]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Function to evaluate a model and print metrics
def evaluate_model(model, X_test1, y_test1):
    y_pred = model.predict(X_test1)
    accuracy = accuracy_score(y_test1, y_pred)
    precision = precision_score(y_test1, y_pred,zero_division=1)
    recall = recall_score(y_test1, y_pred,zero_division=1)
    f1 = f1_score(y_test1, y_pred,zero_division=1)
    print(f"Accuracy: {accuracy:.2f}")
    print(f"Precision: {precision:.2f}")
    print(f"Recall: {recall:.2f}")
    print(f"F1 Score: {f1:.2f}")

# Evaluate each model
print("K-Nearest Neighbors:")
evaluate_model(model_knn, X_test, y_test)
print("\nLogistic Regression:")
evaluate_model(model_logistic, X_test, y_test)
print("\nSupport Vector Machine:")
evaluate_model(model_svm, X_test_std, y_test)

K-Nearest Neighbors:
Accuracy: 0.79
Precision: 0.00
Recall: 0.00
F1 Score: 1.00

Logistic Regression:
Accuracy: 0.80
Precision: 1.00
Recall: 0.00
F1 Score: 0.00

Support Vector Machine:
Accuracy: 0.80
Precision: 1.00
Recall: 0.00
F1 Score: 0.00


## Select the Best Model

In [23]:
model = model_knn

In [24]:
# Create a tuple or any container object with your variables
data_to_save = (model, smoker_encoder, region_encoder ,sex_encoder)

# Open a file in binary write mode
with open('knn_isSmoker.pkl', 'wb') as file:
    # Save the container object
    pickle.dump(data_to_save, file)