In [5]:
import pandas as pd 
import numpy as np 
import os 
import warnings 
warnings.filterwarnings("ignore")

# Raw string literal: "\R", "\Q" and "\P" are not valid escape sequences, so the
# plain literal only worked by accident and emits a SyntaxWarning on Python 3.12+.
# NOTE(review): this absolute path is machine-specific; consider a configurable data directory.
os.chdir(r"E:\Rithik\Quant Finance\Python")

# Load the raw insurance records from the working directory set above.
insurance_data = pd.read_csv("insurance.csv")

insurance_data

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19.0,female,27.900,0.0,yes,southwest,16884.924
1,18.0,male,33.770,1.0,no,Southeast,1725.5523
2,28.0,male,33.000,3.0,no,southeast,$4449.462
3,33.0,male,22.705,0.0,no,northwest,$21984.47061
4,32.0,male,28.880,0.0,no,northwest,$3866.8552
...,...,...,...,...,...,...,...
1333,50.0,male,30.970,3.0,no,Northwest,$10600.5483
1334,-18.0,female,31.920,0.0,no,Northeast,2205.9808
1335,18.0,female,36.850,0.0,no,southeast,$1629.8335
1336,21.0,female,25.800,0.0,no,southwest,2007.945


Task 1: Cleaning Data

In [6]:
# Canonical labels for the sex column.
male = "male"
female = "female"

# Maps alternative spellings onto the two canonical labels.
gender_map = {"female" : female, "woman" : female, "F": female, "man" : male , "M" : male}

# Build the cleaned frame in a single chain. dropna() followed by assign() always
# produces a new DataFrame, so no column is ever written into a possible view of
# insurance_data (the pattern that raises SettingWithCopyWarning).
insurance_filled = (
    insurance_data
    .dropna()  # discard every row that has at least one missing value
    .assign(
        # Standardize region to all lowercase.
        region=lambda d: d["region"].str.lower(),
        # Standardize sex to "male" / "female".
        sex=lambda d: d["sex"].replace(gender_map),
        # True only where the raw value is exactly "yes".
        smoker=lambda d: d["smoker"] == "yes",
        # Strip "$" from both ends and convert to float64.
        # NOTE(review): relies on charges being a string column - confirm against the CSV.
        charges=lambda d: d["charges"].str.strip("$").astype("float64"),
        # Negative ages / child counts are treated as sign errors: take the
        # absolute value and round to a whole number (dtype stays float).
        age=lambda d: d["age"].abs().round(0),
        children=lambda d: d["children"].abs().round(0),
    )
)

insurance_filled.sample(10)


Unnamed: 0,age,sex,bmi,children,smoker,region,charges
1186,20.0,male,35.625,3.0,True,northwest,37465.34375
828,41.0,male,30.78,3.0,True,northeast,39597.4072
316,50.0,male,32.205,0.0,False,northwest,8835.26495
168,19.0,female,31.825,1.0,False,northwest,2719.27975
288,59.0,female,36.765,1.0,True,northeast,47896.79135
1254,34.0,female,27.72,0.0,False,southeast,4415.1588
746,34.0,male,27.0,2.0,False,southwest,11737.84884
139,22.0,female,36.0,0.0,False,southwest,2166.732
802,21.0,male,22.3,1.0,False,southwest,2103.08
960,19.0,female,39.615,1.0,False,northwest,2730.10785


Task 2: Model Fitting


In [7]:
# Work on a copy so the cleaned frame is left untouched.
df = insurance_filled.copy()

# One-hot encode region. "region_southwest" is dropped so that southwest becomes
# the baseline category represented by all remaining region indicators being 0.
df_new = pd.get_dummies(df, prefix = ["region"], columns = ["region"])
df_new = df_new.drop(columns = ["region_southwest"])

# Convert the smoker flag to 0/1 integers.
df_new["smoker"] = df_new["smoker"].astype("int64")

# Encode sex as a single 0/1 indicator (1 = "male"), then drop the text column.
# (The earlier boolean assignment to is_male was overwritten immediately and has been removed.)
df_new["is_male"] = (df_new["sex"] == "male").astype("int64")
df_new = df_new.drop(columns = "sex")

# Drop any row that still contains a missing value.
df_new = df_new.dropna()

# Only the last expression of a cell is rendered, so the summary goes last.
df_new.info()


<class 'pandas.core.frame.DataFrame'>
Index: 1207 entries, 0 to 1337
Data columns (total 9 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   age               1207 non-null   float64
 1   bmi               1207 non-null   float64
 2   children          1207 non-null   float64
 3   smoker            1207 non-null   int64  
 4   charges           1207 non-null   float64
 5   region_northeast  1207 non-null   bool   
 6   region_northwest  1207 non-null   bool   
 7   region_southeast  1207 non-null   bool   
 8   is_male           1207 non-null   int64  
dtypes: bool(3), float64(4), int64(2)
memory usage: 69.5 KB


In [8]:
!pip install scikit-learn



In [9]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split 
from sklearn.metrics import r2_score

# Define the target (y): the charges we want to predict, and the features (x):
# the independent variables, i.e. every remaining column.
y = df_new["charges"]
x = df_new.drop(columns = ["charges"])

# Train/test split: hold out 20% of the rows; the fixed seed keeps the split reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size = 0.2, random_state = 42
)

# Create the linear regression model and fit it on the training split
# (fit() returns the fitted estimator itself).
model = LinearRegression().fit(x_train, y_train)

# Make predictions on the test set.
y_pred = model.predict(x_test)

# Calculate the R-squared score of the predictions against the held-out targets.
r2 = r2_score(y_true = y_test, y_pred = y_pred)

r2

0.7049323160872814

In [10]:
def helper (df):
    """Encode a raw insurance frame into the feature columns used for prediction.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the text columns "region", "smoker" and "sex". All other
        columns pass through unchanged.

    Returns
    -------
    pandas.DataFrame
        A new frame (``df`` itself is not modified) in which:

        * "region" is replaced by the indicator columns "region_northeast",
          "region_northwest" and "region_southeast" (southwest is the baseline),
        * "smoker" is 1 where the input equals "yes" and 0 otherwise,
        * "sex" is replaced by "is_male" (1 where the input equals "male").
    """
    # Encode against a fixed category list instead of whatever values happen to be
    # present: the same columns then come out in the same order even when a
    # region is absent from df (previously that raised KeyError on
    # "region_southwest" or produced a frame with missing columns). Lowercasing
    # matches the cleaning applied to the training data; values outside the list
    # become all-zero indicator rows.
    regions = ["northeast", "northwest", "southeast", "southwest"]
    region = df["region"].str.lower().astype(pd.CategoricalDtype(categories=regions))

    df_new = pd.get_dummies(df.assign(region=region), prefix = ["region"], columns = ["region"])
    df_new = df_new.drop(columns = ["region_southwest"])

    # Text flags -> 0/1 integers.
    df_new["smoker"] = (df_new["smoker"] == "yes").astype("int64")
    df_new["is_male"] = (df_new["sex"] == "male").astype("int64")
    df_new = df_new.drop(columns = ["sex"])
    return df_new

In [11]:
# Raw string literal: "\R", "\Q" and "\P" are not valid escape sequences, so the
# plain literal only worked by accident and emits a SyntaxWarning on Python 3.12+.
# NOTE(review): this absolute path is machine-specific; consider a configurable data directory.
os.chdir(r"E:\Rithik\Quant Finance\Python")
df_val = pd.read_csv("validation_dataset.csv")

# Encode the validation rows with helper() and predict charges with the fitted model.
input_df = helper(df_val)
prediction = model.predict(input_df)

# Predictions that are not strictly positive are replaced by this floor value.
MIN_CHARGE = 1000
prediction_new = [charge if charge > 0 else MIN_CHARGE for charge in prediction]

df_val["predicted charges"] = prediction_new

df_val.sample(10).round(2)

Unnamed: 0,age,sex,bmi,children,smoker,region,predicted charges
37,52.0,male,38.6,2.0,no,southwest,14479.92
12,60.0,female,24.53,0.0,no,southeast,11380.87
14,20.0,female,28.98,0.0,no,northwest,2695.87
42,29.0,male,27.2,0.0,no,southwest,3735.35
17,92.0,female,69.13,13.0,yes,southeast,64338.48
30,27.0,female,32.4,1.0,no,northeast,6189.17
16,83.0,male,89.1,9.0,no,northwest,42565.77
4,28.0,male,38.06,0.0,no,southeast,6685.43
3,71.0,male,65.5,13.0,yes,southeast,57194.41
1,39.0,male,26.41,0.0,yes,northeast,30956.21
