### Import Libraries

In [1]:
import json
import os
import pickle

import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsRegressor
from sklearn.preprocessing import MinMaxScaler, StandardScaler

import config1

### Data Gathering 

In [2]:
# Read the dataset from the CSV path defined in the project config module,
# then display the resulting frame as the cell output.
csv_path = config1.CSV_FILE_PATH
df = pd.read_csv(csv_path)
df

Unnamed: 0,age,gender,bmi,children,smoker,region,charges
0,19,female,27.900,0,yes,southwest,16884.92400
1,18,male,33.770,1,no,southeast,1725.55230
2,28,male,33.000,3,no,southeast,4449.46200
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.880,0,no,northwest,3866.85520
...,...,...,...,...,...,...,...
1333,50,male,30.970,3,no,northwest,10600.54830
1334,18,female,31.920,0,no,northeast,2205.98080
1335,18,female,36.850,0,no,southeast,1629.83350
1336,21,female,25.800,0,no,southwest,2007.94500


### EDA

In [3]:
# Encode gender as integers (male -> 1, female -> 0).
# The result is assigned back to the column instead of calling
# replace(..., inplace=True) on the selection df['gender']: an in-place call on
# a selected column is deprecated chained assignment and, under pandas
# Copy-on-Write, leaves df itself unchanged.
# astype('int64') keeps the column numeric even when replace() does not
# downcast the former object column on its own.
df['gender'] = df['gender'].replace({'male': 1, 'female': 0}).astype('int64')

In [4]:
# Encode smoker as integers (no -> 0, yes -> 1).
# The result is assigned back to the column instead of calling
# replace(..., inplace=True) on the selection df['smoker']: an in-place call on
# a selected column is deprecated chained assignment and, under pandas
# Copy-on-Write, leaves df itself unchanged.
# astype('int64') keeps the column numeric even when replace() does not
# downcast the former object column on its own.
df['smoker'] = df['smoker'].replace({'no': 0, 'yes': 1}).astype('int64')

In [5]:
# Count how many rows fall in each region and show the counts as a plain dict.
region_counts = df['region'].value_counts()
region_counts.to_dict()

{'southeast': 364, 'southwest': 325, 'northwest': 325, 'northeast': 324}

In [6]:
# One-hot encode region into region_<name> indicator columns.
# dtype is pinned to uint8 so the indicators are 0/1 integers regardless of the
# installed pandas version (newer releases default to bool), matching the
# dtypes reported by df.info() below.
df = pd.get_dummies(df, columns=['region'], dtype='uint8')

In [7]:
# Print the column names, non-null counts and dtypes of the encoded frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1338 entries, 0 to 1337
Data columns (total 10 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   age               1338 non-null   int64  
 1   gender            1338 non-null   int64  
 2   bmi               1338 non-null   float64
 3   children          1338 non-null   int64  
 4   smoker            1338 non-null   int64  
 5   charges           1338 non-null   float64
 6   region_northeast  1338 non-null   uint8  
 7   region_northwest  1338 non-null   uint8  
 8   region_southeast  1338 non-null   uint8  
 9   region_southwest  1338 non-null   uint8  
dtypes: float64(2), int64(4), uint8(4)
memory usage: 68.1 KB


### Scaling

In [8]:
# Feature matrix: every column except the target 'charges'.
x = df.drop(columns=['charges'])

# Min-max scale the features. The scaler is fit on all rows of x, and the
# scaled values are wrapped back into a DataFrame with the original column names.
normal_scaler = MinMaxScaler().fit(x)
array = normal_scaler.transform(x)
dfx = pd.DataFrame(data=array, columns=x.columns)


### Model Training

#### Linear Regression

In [9]:
# Separate the target from the (unscaled) predictors.
y = df['charges']
x = df.drop(columns=['charges'])

# Hold out 20% of the rows for testing; the fixed random_state keeps the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=30)

# fit() returns the estimator itself, so the fitted model can be bound directly;
# the bare name on the last line displays it as the cell output.
linear_reg = LinearRegression().fit(x_train, y_train)
linear_reg

LinearRegression()

In [10]:
# Linear Regression Evaluation
# Score the fitted model on both splits first, then report the two values.
train_score = linear_reg.score(x_train, y_train)
test_score = linear_reg.score(x_test, y_test)

for message, value in (
    ("Training R-Squared Value :", train_score),
    ("Testing R-Squared Value :", test_score),
):
    print(message, value)

Training R-Squared Value : 0.7477342139223713
Testing R-Squared Value : 0.7598658432511417


#### KNN Regression

In [11]:

# Start from the unscaled features and split BEFORE scaling, so the scaler is
# fit on the training rows only. Scaling the whole table first lets the test
# rows influence the min/max used to build the features the model is later
# evaluated on (data leakage).
x = df.drop('charges', axis=1)
y = df['charges']
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=30)

# Rebind normal_scaler so that the scaler pickled later in the notebook is the
# one this KNN model was actually trained with.
normal_scaler = MinMaxScaler()
x_train = pd.DataFrame(normal_scaler.fit_transform(x_train), columns=x.columns, index=x_train.index)
x_test = pd.DataFrame(normal_scaler.transform(x_test), columns=x.columns, index=x_test.index)

# 7 nearest neighbours; p=1 selects the Manhattan (L1) distance.
knn_reg = KNeighborsRegressor(n_neighbors=7, p=1)
knn_reg.fit(x_train, y_train)

KNeighborsRegressor(n_neighbors=7, p=1)

In [12]:
# KNN Evaluation
# Score the fitted KNN regressor on both splits first, then report the two values.
train_score = knn_reg.score(x_train, y_train)
test_score = knn_reg.score(x_test, y_test)

for message, value in (
    ("Training R-Squared Value :", train_score),
    ("Testing R-Squared Value :", test_score),
):
    print(message, value)

Training R-Squared Value : 0.8309903950818949
Testing R-Squared Value : 0.7583719172035832


## Save Required Data

#### 1. Model

In [13]:
# Persist the fitted linear regression model.
# Create the output directory first so the write does not fail with
# FileNotFoundError when 'artifacts' does not exist yet (e.g. a fresh checkout).
os.makedirs('artifacts', exist_ok=True)
with open(r'artifacts/regression_model.pkl', 'wb') as f:
    pickle.dump(linear_reg, f)
    

In [14]:
# Number of input features the linear model saw during fit.
linear_reg.n_features_in_

9

In [15]:
# Serialize the fitted KNN regressor into the artifacts folder.
with open(r'artifacts/knn_reg_model.pkl', 'wb') as knn_file:
    pickle.Pickler(knn_file).dump(knn_reg)
    

#### 2. Save Column Names

In [16]:
# Metadata written next to the models: the feature column order plus the
# integer encodings used for the categorical columns.
column_names = x.columns
gender_encoding = {'male': 1, 'female': 0}
smoker_encoding = {'no': 0, 'yes': 1}

# NOTE(review): the "Age" entry states a log transformation, but no cell in this
# notebook transforms the age column — confirm or correct before relying on it.
project_data = {
    "Column Names": list(column_names),
    "Gender": gender_encoding,
    "Smoker": smoker_encoding,
    "Age": "Log Transformation",
}

with open(r'artifacts/project_data.json', 'w') as json_file:
    json.dump(project_data, json_file)
    

#### 3. Save Scaler

In [17]:
# Serialize the fitted scaler object into the artifacts folder.
with open(r'artifacts/normal_scaler.pkl', 'wb') as scaler_file:
    pickle.Pickler(scaler_file).dump(normal_scaler)
    

In [18]:
# Display the feature column names in the order they appear in x.
x.columns

Index(['age', 'gender', 'bmi', 'children', 'smoker', 'region_northeast',
       'region_northwest', 'region_southeast', 'region_southwest'],
      dtype='object')

In [19]:
# Re-check the number of input features the linear model saw during fit.
linear_reg.n_features_in_

9

In [43]:
# 1-D array of nine zeros.
# NOTE(review): 9 matches linear_reg.n_features_in_ shown above — presumably a
# scratch template for one input row; confirm or remove.
np.zeros(9)

array([0., 0., 0., 0., 0., 0., 0., 0., 0.])

In [45]:
# 2-D array of zeros with 4 rows and 5 columns.
# NOTE(review): looks like a scratch experiment not used by the pipeline — confirm or remove.
np.zeros((4,5))

array([[0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0.]])

In [46]:
# 2-D array of zeros with 1 row and 9 columns.
# NOTE(review): presumably the single-sample input shape for the 9-feature
# models — confirm or remove.
np.zeros((1,9))

array([[0., 0., 0., 0., 0., 0., 0., 0., 0.]])

In [47]:
# Bundle both fitted regressors (linear first, then KNN) into one list and
# pickle the list as a single artifact.
models = [linear_reg, knn_reg]
with open(r'artifacts/models.pkl', 'wb') as models_file:
    pickle.Pickler(models_file).dump(models)
    