# Importing Libraries

In [None]:
pip install catboost

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns 
import itertools   
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from catboost import CatBoostRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler
%matplotlib inline


# Importing dataset

In [None]:
# Read the insurance CSV into the working DataFrame used by the rest of the notebook.
insurance_csv = '../input/insurance/insurance.csv'
df = pd.read_csv(insurance_csv)

# Inspecting data

In [None]:
# Preview the first rows of the loaded dataset.
df.head()

In [None]:
# Column names, dtypes and non-null counts.
df.info()

In [None]:
# Summary statistics (pandas' default describes only the numeric columns of a mixed-dtype frame).
df.describe()

In [None]:
# Number of missing values in each column.
df.isnull().sum()

In [None]:
# Show how often each level occurs for the discrete columns,
# with a blank line between consecutive tables.
print("The unique values of categorical variables are")
for position, column in enumerate(['sex', 'children', 'region', 'smoker']):
    if position > 0:
        print()
    print(df[column].value_counts())

# Exploratory Data Analysis

In [None]:
# Seaborn pair plot of the raw DataFrame, to eyeball pairwise relationships.
sns.pairplot(df)

In [None]:
# Correlation matrix of the numeric columns only.
# df still contains string columns (sex, smoker, region) at this point, and
# DataFrame.corr() raises on non-numeric data in pandas >= 2.0, so select the
# numeric columns explicitly (this also works on older pandas versions).
df.select_dtypes(include="number").corr()

In [None]:
# Heatmap of the correlations between numeric columns.
# The string columns (sex, smoker, region) are excluded first because
# DataFrame.corr() raises on non-numeric data in pandas >= 2.0.
sns.heatmap(data=df.select_dtypes(include="number").corr(), cmap='coolwarm')

In [None]:
# Distribution plot of every column up to and including "region"
# (label-based .loc slices include the end label, so "charges" is left out
# assuming it comes after "region" -- TODO confirm column order).
sns.displot(df.loc[:,:"region"])

In [None]:
# Box plots of charges against each categorical column,
# placed in the first four slots of a 4x2 subplot grid.
plt.figure(figsize=(20, 20))
for slot, column in enumerate(["region", "smoker", "sex", "children"], start=1):
    plt.subplot(4, 2, slot)
    sns.boxplot(x=column, y="charges", data=df)



# Data Preprocessing


In [None]:
# Reduce the influence of outliers by replacing charges with log10(charges).
# NOTE: this overwrites the column in place, so running the cell a second time
# applies the logarithm twice; re-run from the data-loading cell instead.
df["charges"] = np.log10(df["charges"])


In [None]:
# Re-draw the box plots to check the outliers on the transformed charges column.
plt.figure(figsize=(20, 20))
grouping_columns = ("region", "smoker", "sex", "children")
for index, grouping in enumerate(grouping_columns):
    plt.subplot(4, 2, index + 1)
    sns.boxplot(x=grouping, y="charges", data=df)

In [None]:
#function that will change  bmi to a category
def weightCondition(bmi):
  """Map a BMI value to a weight-category label.

  The bands are contiguous, so every number falls in exactly one of them:

    bmi < 18.5        -> "Underweight"
    18.5 <= bmi < 25  -> "Normal"
    25 <= bmi < 30    -> "Overweight"
    bmi >= 30         -> "Obese"

  Parameters:
    bmi: a single numeric BMI value.

  Returns:
    One of the four label strings above. A NaN input fails every comparison
    and therefore ends up in the final "Obese" branch.
  """
  # Each test only needs the upper bound because the lower bands have already
  # returned; this avoids gaps between bands such as 24.986 <= bmi < 25.
  if bmi < 18.5:
    return "Underweight"
  if bmi < 25:
    return "Normal"
  if bmi < 30:
    return "Overweight"
  return "Obese"


In [None]:
# Derive the weight category of every row from its bmi value and store it as a new column.
df["weight_Condition"] = df["bmi"].map(weightCondition)
df.head(5)

One-Hot Encoding (dummy variables)

In [None]:
# Columns treated as categorical features
categorical = ["sex", "children", "smoker", "region", "weight_Condition"]
# Cast those columns to the category dtype, then replace each of them with
# dummy (indicator) columns; drop_first removes the first level of every
# column so the indicators are not perfectly collinear.
df = pd.get_dummies(
    data=df.astype({column: "category" for column in categorical}),
    columns=categorical,
    drop_first=True,
)
df.head()

# Linear Regression

In [None]:
# Target: the charges column, kept as a one-column DataFrame
y = df[["charges"]]
# Features: every remaining column
X = df.drop(columns="charges")
# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=1234
)
X_train.head()

Scaling

In [None]:
# Fit the scaler on the training features only, then apply the same
# transformation to both splits so no test-set statistics are used.
scaler = StandardScaler().fit(X_train)
X_train, X_test = scaler.transform(X_train), scaler.transform(X_test)

In [None]:
# Instantiate and fit the linear model on the scaled training data
lr = LinearRegression().fit(X_train, y_train)
# Predict the (log10-scale) target for the test features
predictions = lr.predict(X_test)
# Undo the log10 transform of the target to express predictions as charges
original_predictions = np.power(10, predictions)
print("original value of charges predictions: ", original_predictions)

In [None]:
# Score the linear model on the test set.
# Both metrics are computed on the log10-transformed target, not on raw charges.
lr_mse = mean_squared_error(y_test, predictions)
lr_r2_percent = r2_score(y_test, predictions) * 100

print(f'Mean squared error: {lr_mse:.2f}')

print(f'Coefficient of determination: {lr_r2_percent:.2f}%')

# CatBoost Regression

In [None]:
# Carve a validation set out of the training data for early stopping.
# Using the test set as eval_set would let it pick the best iteration and
# would make the test score reported afterwards optimistically biased.
X_cb_fit, X_cb_val, y_cb_fit, y_cb_val = train_test_split(
    X_train, y_train, test_size=0.2, random_state=1234
)
# Instantiating model
cb = CatBoostRegressor(random_state=42)
# Fitting: stop once the validation metric has not improved for 30 rounds
# and keep the iteration that scored best on the validation set
cb.fit(
    X_cb_fit,
    y_cb_fit,
    use_best_model=True,
    eval_set=(X_cb_val, y_cb_val),
    early_stopping_rounds=30,
)
# Predicting on the untouched test set
cb_pred = cb.predict(X_test)

In [None]:
# Score the CatBoost model on the test set.
# Both metrics are computed on the log10-transformed target, not on raw charges.
cb_mse = mean_squared_error(y_test, cb_pred)
cb_r2_percent = r2_score(y_test, cb_pred) * 100

print(f'Mean squared error: {cb_mse:.2f}')

print(f'Coefficient of determination: {cb_r2_percent:.2f}%')