## Importing Library

In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score


## Reading Dataset

In [4]:
df = pd.read_csv(r"c:\Users\user\Downloads\Python\insurance.csv")

## Observing the Dataset

observing the row and column

In [5]:
df.shape

(1338, 7)

observing 10 first data

In [6]:
df.head(10)

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552
5,31,female,25.74,0,no,southeast,3756.6216
6,46,female,33.44,1,no,southeast,8240.5896
7,37,female,27.74,3,no,northwest,7281.5056
8,37,male,29.83,2,no,northeast,6406.4107
9,60,female,25.84,0,no,northwest,28923.13692


observing the missing value (null data)

In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1338 entries, 0 to 1337
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       1338 non-null   int64  
 1   sex       1338 non-null   object 
 2   bmi       1338 non-null   float64
 3   children  1338 non-null   int64  
 4   smoker    1338 non-null   object 
 5   region    1338 non-null   object 
 6   charges   1338 non-null   float64
dtypes: float64(2), int64(2), object(3)
memory usage: 73.3+ KB


observing the simple statistics

In [8]:
df.describe()

Unnamed: 0,age,bmi,children,charges
count,1338.0,1338.0,1338.0,1338.0
mean,39.207025,30.663397,1.094918,13270.422265
std,14.04996,6.098187,1.205493,12110.011237
min,18.0,15.96,0.0,1121.8739
25%,27.0,26.29625,0.0,4740.28715
50%,39.0,30.4,1.0,9382.033
75%,51.0,34.69375,2.0,16639.912515
max,64.0,53.13,5.0,63770.42801


# Dataset Preprocessing

## Exclude Unwanted Columns

In [9]:
df = df.drop(columns = "children")

In [10]:
df.head(10)

Unnamed: 0,age,sex,bmi,smoker,region,charges
0,19,female,27.9,yes,southwest,16884.924
1,18,male,33.77,no,southeast,1725.5523
2,28,male,33.0,no,southeast,4449.462
3,33,male,22.705,no,northwest,21984.47061
4,32,male,28.88,no,northwest,3866.8552
5,31,female,25.74,no,southeast,3756.6216
6,46,female,33.44,no,southeast,8240.5896
7,37,female,27.74,no,northwest,7281.5056
8,37,male,29.83,no,northeast,6406.4107
9,60,female,25.84,no,northwest,28923.13692


Please keep in mind it is become number because I encoded the region by number already, need to run the cell one by one again to make it back to no number

In [41]:
df["region"].unique()

array([3, 2, 1, 0])

In [42]:
df["region"].value_counts()

region
2    364
3    325
1    325
0    324
Name: count, dtype: int64

In [43]:
df["sex"].value_counts()

sex
1    676
0    662
Name: count, dtype: int64

In [44]:
set(df["region"])

{0, 1, 2, 3}

## Label Encoding <br>

Note: we have to convert the object/string type data into some numeric values.

SEX - The encoder goes with the alphabetical order: <br>

<ul>
<li>female - 0
<li>male - 1

In [11]:
encoder = LabelEncoder()

df["sex"] = encoder.fit_transform(df["sex"])

In [12]:
df.head(10)

Unnamed: 0,age,sex,bmi,smoker,region,charges
0,19,0,27.9,yes,southwest,16884.924
1,18,1,33.77,no,southeast,1725.5523
2,28,1,33.0,no,southeast,4449.462
3,33,1,22.705,no,northwest,21984.47061
4,32,1,28.88,no,northwest,3866.8552
5,31,0,25.74,no,southeast,3756.6216
6,46,0,33.44,no,southeast,8240.5896
7,37,0,27.74,no,northwest,7281.5056
8,37,1,29.83,no,northeast,6406.4107
9,60,0,25.84,no,northwest,28923.13692


SMOKER - The encoder goes with the alphabetical order: <br>

<ul>
<li>no - 0
<li>yes - 1

In [13]:
encoder = LabelEncoder()

df["smoker"] = encoder.fit_transform(df["smoker"])

df.head(10)

Unnamed: 0,age,sex,bmi,smoker,region,charges
0,19,0,27.9,1,southwest,16884.924
1,18,1,33.77,0,southeast,1725.5523
2,28,1,33.0,0,southeast,4449.462
3,33,1,22.705,0,northwest,21984.47061
4,32,1,28.88,0,northwest,3866.8552
5,31,0,25.74,0,southeast,3756.6216
6,46,0,33.44,0,southeast,8240.5896
7,37,0,27.74,0,northwest,7281.5056
8,37,1,29.83,0,northeast,6406.4107
9,60,0,25.84,0,northwest,28923.13692


REGION - The encoder goes with the alphabetical order: <br>

In [14]:
encoder = LabelEncoder()

df["region"] = encoder.fit_transform(df["region"])

df.head(10)

Unnamed: 0,age,sex,bmi,smoker,region,charges
0,19,0,27.9,1,3,16884.924
1,18,1,33.77,0,2,1725.5523
2,28,1,33.0,0,2,4449.462
3,33,1,22.705,0,1,21984.47061
4,32,1,28.88,0,1,3866.8552
5,31,0,25.74,0,2,3756.6216
6,46,0,33.44,0,2,8240.5896
7,37,0,27.74,0,1,7281.5056
8,37,1,29.83,0,0,6406.4107
9,60,0,25.84,0,1,28923.13692


In [40]:
df["region"].unique()

array([3, 2, 1, 0])

# Splitting

## Splitting X and y

### iloc and loc functions from pandas

df.loc[starting_row_index : ending_row_index, [list of columns]] --> if using the name <br> 

df.i loc[starting_row_index, starting_column_index : ending_column_index] --> if using the index location

### 1) using loc

Below is to "INCLUSION"

In [15]:
df.loc[10:17, ["age", "sex"]]

Unnamed: 0,age,sex
10,25,1
11,62,0
12,23,1
13,56,0
14,27,1
15,19,1
16,52,0
17,23,1


Below is to "EXCLUSION"

In [16]:
df.loc[10:17, df.columns!="charges"]

Unnamed: 0,age,sex,bmi,smoker,region
10,25,1,26.22,0,0
11,62,0,26.29,1,2
12,23,1,34.4,0,3
13,56,0,39.82,0,2
14,27,1,42.13,1,2
15,19,1,24.6,0,3
16,52,0,30.78,0,0
17,23,1,23.845,0,0


### 2) using iloc

In [17]:
df.iloc[10:17, 0:3]

Unnamed: 0,age,sex,bmi
10,25,1,26.22
11,62,0,26.29
12,23,1,34.4
13,56,0,39.82
14,27,1,42.13
15,19,1,24.6
16,52,0,30.78


# <font color = yellow> <b> MLR running </font> </b>

## Defining the X and y (splitting)

In [18]:
X = df.loc[:, df.columns!="charges"]
y = df["charges"]

## Checking the data

In [20]:
X.head(10)

Unnamed: 0,age,sex,bmi,smoker,region
0,19,0,27.9,1,3
1,18,1,33.77,0,2
2,28,1,33.0,0,2
3,33,1,22.705,0,1
4,32,1,28.88,0,1
5,31,0,25.74,0,2
6,46,0,33.44,0,2
7,37,0,27.74,0,1
8,37,1,29.83,0,0
9,60,0,25.84,0,1


In [21]:
y

0       16884.92400
1        1725.55230
2        4449.46200
3       21984.47061
4        3866.85520
           ...     
1333    10600.54830
1334     2205.98080
1335     1629.83350
1336     2007.94500
1337    29141.36030
Name: charges, Length: 1338, dtype: float64

## Split the data into training and testing dataset

In [23]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state=42)

## Start  

In [24]:
from sklearn.linear_model import LinearRegression

In [25]:
X = df.loc[:, df.columns!="charges"]
y = df["charges"]

#Model Declaration
lin_model = LinearRegression()

#Training
lin_model.fit(X_train, y_train)

m_sk = lin_model.coef_[0]
c_sk = lin_model.intercept_

print(m_sk)
print(c_sk)

259.3875042472401
-11544.814191091153


## Evaluation from Training Dataset

In [30]:
y_prediction_train = lin_model.predict(X_train)

y_prediction_train

array([ 6791.19276006,  8789.13824449,  9716.05786249, ...,
       11926.94419646, 36831.8350893 , 11904.56182173], shape=(1070,))

In [31]:
y_train

560      9193.83850
1285     8534.67180
1142    27117.99378
969      8596.82780
486     12475.35130
           ...     
1095     4561.18850
1130     8582.30230
1294    11931.12525
860     46113.51100
1126    10214.63600
Name: charges, Length: 1070, dtype: float64

## Evaluation from Testing Dataset

In [33]:
y_prediction_test = lin_model.predict(X_test)
y_prediction_test 

array([ 8556.63142984,  7568.92088951, 37450.39068233,  8729.73017417,
       27462.02978858, 10810.04004535,   646.53472691, 17470.07511861,
        1507.01495887, 10493.92757682, 27246.65262812,  9054.97908471,
        5333.82778747, 37682.22011764, 40834.8574377 , 36799.88301253,
       14904.32717931, 36028.18145285,  9227.79779415, 32003.25309314,
        2114.0370884 ,  9695.91659342,  2734.01996932,  7489.08570457,
       10564.1944071 , 13403.91755846, 14944.27045668,  5361.66365966,
        9552.52009474,  2662.9895827 ,  7814.58408602, 13620.40244348,
        5026.55940669,  2640.9926921 ,  4415.80585866, 13453.27351366,
        1915.25336247,  8793.63655935, 33415.32089565, 32694.40637813,
        2133.07487614,  4381.07712076, 14141.09206038, 12006.34499811,
        8432.12327794, 12138.55199635,  4886.79625535,  2249.68851739,
       34767.92769062,  8797.26409604, 16399.62232449,  2405.18079994,
       12423.38015773,  1869.51979751, 13956.07796232, 12173.79729032,
      

In [34]:
y_test

764      9095.06825
887      5272.17580
890     29330.98315
1293     9301.89355
259     33750.29180
           ...     
109     47055.53210
575     12222.89830
535      6067.12675
543     63770.42801
846      9872.70100
Name: charges, Length: 268, dtype: float64

## <font color = yellow> <b> Mean_Absolute_Error, Mean_Squared_Error, R2 Score </b> </font>

In [35]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

mae = mean_absolute_error(y_test, y_prediction_test)
mse = mean_squared_error(y_test, y_prediction_test)
r2 = r2_score(y_test, y_prediction_test)

print("Mean Absolute Error:", mae)
print("Mean Squared Error", mse)
print("R2 Score:", r2)

Mean Absolute Error: 4227.246575990811
Mean Squared Error 34200871.974767454
R2 Score: 0.7797027283499592


## <font color = pink> <b> Saving the Model </b> </font>

In [36]:
import joblib
joblib.dump(lin_model, "MLR_Medical_Cost_SHA.pkl")

['MLR_Medical_Cost_SHA.pkl']

## Finding Prediction for New Datas

### Loading Models

In [39]:
import joblib

loaded_model = joblib.load(r"C:\Users\user\Downloads\Python\Collaboration_Project\Linear_Regression\MLR_Medical_Cost_SHA.pkl")

In [47]:
age = float(input("Input Age: "))
sex = float(input("Input Sex [Female-0, Male-1]: "))
bmi = float(input("BMI: "))
smoker = float(input("Smoking [No-0, Yes-1]: "))
region = float(input("Region [0,1,2,3]: "))

predicted_charges = loaded_model.predict([[age, sex, bmi, smoker, region]])

print("The predicted charge is ", predicted_charges)

The predicted charge is  [2584.50511837]


