# Importing Libraries

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Reading Dataset

In [41]:
df = pd.read_csv(r"C:\Users\RAZER\Downloads\archive (1)\insurance.csv")

## Observing the dataset

In [42]:
df.shape

(1338, 7)

In [43]:
df.head(10)

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552
5,31,female,25.74,0,no,southeast,3756.6216
6,46,female,33.44,1,no,southeast,8240.5896
7,37,female,27.74,3,no,northwest,7281.5056
8,37,male,29.83,2,no,northeast,6406.4107
9,60,female,25.84,0,no,northwest,28923.13692


In [44]:
df["region"].unique()

array(['southwest', 'southeast', 'northwest', 'northeast'], dtype=object)

In [47]:
set(df["region"])

{'northeast', 'northwest', 'southeast', 'southwest'}

In [45]:
df["region"].value_counts()

region
southeast    364
southwest    325
northwest    325
northeast    324
Name: count, dtype: int64

In [46]:
df["sex"].value_counts()

sex
male      676
female    662
Name: count, dtype: int64

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1338 entries, 0 to 1337
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       1338 non-null   int64  
 1   sex       1338 non-null   object 
 2   bmi       1338 non-null   float64
 3   children  1338 non-null   int64  
 4   smoker    1338 non-null   object 
 5   region    1338 non-null   object 
 6   charges   1338 non-null   float64
dtypes: float64(2), int64(2), object(3)
memory usage: 73.3+ KB


In [6]:
df.describe()

Unnamed: 0,age,bmi,children,charges
count,1338.0,1338.0,1338.0,1338.0
mean,39.207025,30.663397,1.094918,13270.422265
std,14.04996,6.098187,1.205493,12110.011237
min,18.0,15.96,0.0,1121.8739
25%,27.0,26.29625,0.0,4740.28715
50%,39.0,30.4,1.0,9382.033
75%,51.0,34.69375,2.0,16639.912515
max,64.0,53.13,5.0,63770.42801


# Dataset Preprocessing

## Exclude Unwanted Columns

In [7]:
df = df.drop(columns="children")

In [8]:
df.head(10)

Unnamed: 0,age,sex,bmi,smoker,region,charges
0,19,female,27.9,yes,southwest,16884.924
1,18,male,33.77,no,southeast,1725.5523
2,28,male,33.0,no,southeast,4449.462
3,33,male,22.705,no,northwest,21984.47061
4,32,male,28.88,no,northwest,3866.8552
5,31,female,25.74,no,southeast,3756.6216
6,46,female,33.44,no,southeast,8240.5896
7,37,female,27.74,no,northwest,7281.5056
8,37,male,29.83,no,northeast,6406.4107
9,60,female,25.84,no,northwest,28923.13692


## Label Encoding

Note: we have to convert the object/string type data into some numeric values.

In [9]:
encoder = LabelEncoder()

df["sex"] = encoder.fit_transform(df["sex"])

<ul>
<li>female - 0
<li>male - 1
</ul>
Note: The encoder will encode by the alphabetical order

In [10]:
df.head(10)

Unnamed: 0,age,sex,bmi,smoker,region,charges
0,19,0,27.9,yes,southwest,16884.924
1,18,1,33.77,no,southeast,1725.5523
2,28,1,33.0,no,southeast,4449.462
3,33,1,22.705,no,northwest,21984.47061
4,32,1,28.88,no,northwest,3866.8552
5,31,0,25.74,no,southeast,3756.6216
6,46,0,33.44,no,southeast,8240.5896
7,37,0,27.74,no,northwest,7281.5056
8,37,1,29.83,no,northeast,6406.4107
9,60,0,25.84,no,northwest,28923.13692


In [11]:
df["smoker"] = encoder.fit_transform(df["smoker"])
df["region"] = encoder.fit_transform(df["region"])

In [12]:
df.head(10)

Unnamed: 0,age,sex,bmi,smoker,region,charges
0,19,0,27.9,1,3,16884.924
1,18,1,33.77,0,2,1725.5523
2,28,1,33.0,0,2,4449.462
3,33,1,22.705,0,1,21984.47061
4,32,1,28.88,0,1,3866.8552
5,31,0,25.74,0,2,3756.6216
6,46,0,33.44,0,2,8240.5896
7,37,0,27.74,0,1,7281.5056
8,37,1,29.83,0,0,6406.4107
9,60,0,25.84,0,1,28923.13692


# Spliting

## Spliting X and y

### iloc and loc functions from pandas

df.loc[starting_row_index:ending_row_index , [list of columns]] <br>

df.iloc[starting_row_index:ending_row_index , starting_column_index:ending_column_index]

In [17]:
df.loc[10:17,["age","sex"]]

Unnamed: 0,age,sex
10,25,1
11,62,0
12,23,1
13,56,0
14,27,1
15,19,1
16,52,0
17,23,1


In [18]:
df.loc[10:17,df.columns!="charges"]

Unnamed: 0,age,sex,bmi,smoker,region
10,25,1,26.22,0,0
11,62,0,26.29,1,2
12,23,1,34.4,0,3
13,56,0,39.82,0,2
14,27,1,42.13,1,2
15,19,1,24.6,0,3
16,52,0,30.78,0,0
17,23,1,23.845,0,0


In [21]:
df.iloc[10:17, 0:3]

Unnamed: 0,age,sex,bmi
10,25,1,26.22
11,62,0,26.29
12,23,1,34.4
13,56,0,39.82
14,27,1,42.13
15,19,1,24.6
16,52,0,30.78


In [22]:
X = df.loc[:,df.columns!="charges"]
y = df["charges"]

In [23]:
X.head(10)

Unnamed: 0,age,sex,bmi,smoker,region
0,19,0,27.9,1,3
1,18,1,33.77,0,2
2,28,1,33.0,0,2
3,33,1,22.705,0,1
4,32,1,28.88,0,1
5,31,0,25.74,0,2
6,46,0,33.44,0,2
7,37,0,27.74,0,1
8,37,1,29.83,0,0
9,60,0,25.84,0,1


In [24]:
y

0       16884.92400
1        1725.55230
2        4449.46200
3       21984.47061
4        3866.85520
           ...     
1333    10600.54830
1334     2205.98080
1335     1629.83350
1336     2007.94500
1337    29141.36030
Name: charges, Length: 1338, dtype: float64

## Spliting In Training and Testing

In [30]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)

# Teaining

In [31]:
from sklearn.linear_model import LinearRegression



#Model Declaration
lin_model = LinearRegression()

#Training
lin_model.fit(X_train, y_train)

m_sk = lin_model.coef_[0]
c_sk = lin_model.intercept_

print(m_sk)
print(c_sk)

259.3875042472401
-11544.814191091153


# Evaluation

## Testing

In [32]:
y_pred = lin_model.predict(X_test)

## mean_absolute_error, mean_square_error, R2_score

In [33]:
from sklearn.metrics import  mean_absolute_error,mean_squared_error,r2_score

mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)


print("Mean Absolute Error: ",mae)
print("Mean Squared Error: ",mse)
print("R^2 Score: ",r2)

Mean Absolute Error:  4227.246575990811
Mean Squared Error:  34200871.974767454
R^2 Score:  0.7797027283499592


# Saving Models

In [34]:
import joblib

joblib.dump(lin_model, "MLR_RJ.pkl")

['MLR_RJ.pkl']

# Finding Prediction for new datas

## Loading Models

In [35]:
import joblib

loaded_model = joblib.load(r"C:\Users\RAZER\Documents\GitHub_2\Collaboration_Project\Code_From_RJ\MLR_RJ.pkl")


In [49]:
age	= float(input("Input Age: "))
sex	= float(input("Input Sex [Female-0, Male - 1]: "))
bmi	= float(input("BMI: "))
smoker	= float(input("Smoking [No-0, Yes-1]: "))
region = float(input("Region [0 , 1, 2, 3]: "))

predicted_charges = loaded_model.predict([[age,sex,bmi,smoker,region]])

print("The predicted charges is ", predicted_charges)


The predicted charges is  [2584.50511837]


