In [None]:
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px

import os
# Walk the Kaggle input directory and print the full path of every file found.
for folder, _, file_names in os.walk('/kaggle/input'):
    for file_name in file_names:
        print(os.path.join(folder, file_name))

## Getting to Know the Data

In [None]:
# Read the insurance CSV into `df` (the frame every later cell works on)
# and preview its first rows.
insurance_csv = "/kaggle/input/insurance-premium-prediction/insurance.csv"
df = pd.read_csv(insurance_csv)
df.head(n=5)

In [None]:
# Number of missing values in each column.
df.isna().sum()

In [None]:
# Print pandas' summary of the frame (columns, dtypes, non-null counts).
df.info()

In [None]:
# Count the rows that exactly repeat an earlier row.
is_repeat = df.duplicated()
is_repeat.sum()

In [None]:
# Show the rows flagged as duplicates of an earlier row.
df.loc[df.duplicated()]

In [None]:
# Remove duplicated rows, keeping the first occurrence of each.
# Rebinding `df` to the returned frame (instead of `inplace=True`) avoids
# mutating the object that earlier cells already displayed, and the cell
# gives the same result if it is run again.
df = df.drop_duplicates(keep="first")

In [None]:
# (rows, columns) of the frame at this point in the notebook.
df.shape

In [None]:
# Summary statistics, transposed so each described column is one row.
df.describe().transpose()

In [None]:
# Summary of the object-dtype columns, transposed so each column is one row.
df.describe(include="object").transpose()

## Data Visualization and Analysis

In [None]:
# Columns plotted as categories here and in the following box-plot cell.
df_categorical = ['sex', 'children', 'smoker', 'region']

# One count plot per column, each rendered as its own figure.
for column in df_categorical:
    sns.countplot(x=column, data=df)
    plt.show()

In [None]:
# Expenses per category level, split by sex, one figure per column.
for column in df_categorical:
    sns.boxplot(data=df, x=column, y="expenses", hue="sex")
    plt.show()

In [None]:
# Box plot of the expenses column on its own.
sns.boxplot(data=df, x="expenses")

In [None]:
# Rows whose expenses exceed 55000.
df.loc[df["expenses"].gt(55000)]

In [None]:
# The 20 rows with the highest expenses, largest first.
df.sort_values(by="expenses", ascending=False).head(20)

****

**We can easily see that being a smoker is highly correlated with expenses.**

In [None]:
# Histogram of expenses (100 bins) with a KDE overlay on a 12x8 figure.
fig, ax = plt.subplots(figsize=(12, 8))
sns.histplot(data=df, x="expenses", bins=100, kde=True, ax=ax)

In [None]:
# Histogram of age with a KDE overlay on a 12x8 figure.
fig, ax = plt.subplots(figsize=(12, 8))
sns.histplot(data=df, x="age", kde=True, ax=ax)

In [None]:
# Histogram of BMI with a KDE overlay on a 12x8 figure.
fig, ax = plt.subplots(figsize=(12, 8))
sns.histplot(data=df, x="bmi", kde=True, ax=ax)

In [None]:
# Columns to place on the x axis, one interactive histogram each.
a = ['age', 'sex', 'bmi', 'children', 'smoker', 'region']

# Each figure uses expenses on the y axis, colours by sex and adds a
# violin plot in the margin.
for feature in a:
    fig = px.histogram(
        data_frame=df,
        x=feature,
        y="expenses",
        color="sex",
        marginal="violin",
    )
    fig.show()

**According to the smoker vs. sum-of-expenses graph, total expenses are higher for non-smokers. This is because there are many more non-smokers than smokers in the data.**

In [None]:
# Count the rows in each smoker group and print both totals.
smoker_count = len(df.loc[df["smoker"] == "yes"])
non_smoker_count = len(df.loc[df["smoker"] == "no"])
print("Count of people who smoke =", smoker_count)
print("Count of people who don't smoke =", non_smoker_count)

**The number of non-smokers is about 4 times the number of smokers. Despite that, the total expenses of the two groups are fairly close.**

In [None]:
# Split the data by smoking status, then draw age vs. expenses for each
# group with one OLS trendline per sex.
df1 = df.loc[df["smoker"] == "yes"]
df2 = df.loc[df["smoker"] == "no"]

for subset, plot_title in ((df1, "Smoker"), (df2, "Non Smoker")):
    fig = px.scatter(
        data_frame=subset,
        x="age",
        y="expenses",
        color="sex",
        trendline="ols",
        title=plot_title,
    )
    fig.show()

In [None]:
# Split the data by smoking status, then draw BMI vs. expenses for each
# group with one OLS trendline per sex.
df1 = df.loc[df["smoker"] == "yes"]
df2 = df.loc[df["smoker"] == "no"]

for subset, plot_title in ((df1, "Smoker"), (df2, "Non Smoker")):
    fig = px.scatter(
        data_frame=subset,
        x="bmi",
        y="expenses",
        color="sex",
        trendline="ols",
        title=plot_title,
    )
    fig.show()

## Model

In [None]:
from sklearn.preprocessing import LabelEncoder

# Replace each text column with integer codes, using one encoder per column.
# Refitting a single shared encoder (as this cell did before) overwrites its
# fitted classes each time, so the codes of "sex" and "smoker" could no longer
# be mapped back to their labels. Keeping the encoders in a dict preserves
# every mapping, e.g. label_encoders["smoker"].classes_.
label_encoders = {}
for column in ("sex", "smoker", "region"):
    label_encoders[column] = LabelEncoder()
    df[column] = label_encoders[column].fit_transform(df[column])

# Backward-compatible alias: `le` previously ended up fitted on "region".
le = label_encoders["region"]

In [None]:
# Correlation of every column with expenses, shown as a bar chart.
# reset_index() turns the column names into an "index" column for plotly.
expenses_corr = df.corr()["expenses"]
df1 = expenses_corr.reset_index()
fig = px.bar(
    data_frame=df1,
    x="index",
    y="expenses",
    title="Correlation for Expenses",
)
fig.show()

In [None]:
# Feature matrix: every column except the target. Target: expenses.
x = df.drop(columns="expenses")
y = df["expenses"]

In [None]:
from sklearn.metrics import mean_squared_error, r2_score, make_scorer, mean_absolute_error

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

# Hold out a test split; the fixed seed makes the split reproducible.
x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=0)

# Plain least-squares baseline.
lr = LinearRegression()
lr.fit(x_train, y_train)
print("R^2 (test):", lr.score(x_test, y_test))

# `y_pred` keeps its original meaning: predictions on the training split.
y_pred = lr.predict(x_train)
print("MAE (train):", mean_absolute_error(y_train, y_pred))
print("RMSE (train):", np.sqrt(mean_squared_error(y_train, y_pred)))

# The errors above are measured on rows the model was fitted on, while R^2 is
# measured on held-out rows. Report the held-out errors too, and label every
# number, so the metrics can be compared on the same split.
y_pred_test = lr.predict(x_test)
print("MAE (test):", mean_absolute_error(y_test, y_pred_test))
print("RMSE (test):", np.sqrt(mean_squared_error(y_test, y_pred_test)))

In [None]:
from sklearn import linear_model

# Ridge regression with alpha=0.5, fitted on the existing train split.
reg = linear_model.Ridge(alpha=.5)
reg.fit(x_train, y_train)

# `y_pred` keeps its original meaning: predictions on the training split.
y_pred = reg.predict(x_train)
print("MAE (train):", mean_absolute_error(y_train, y_pred))
print("RMSE (train):", np.sqrt(mean_squared_error(y_train, y_pred)))

# Also report errors on the held-out split so they are comparable with the
# test R^2 below; every printed number is labelled with its split.
y_pred_test = reg.predict(x_test)
print("MAE (test):", mean_absolute_error(y_test, y_pred_test))
print("RMSE (test):", np.sqrt(mean_squared_error(y_test, y_pred_test)))
print("R^2 (test):", reg.score(x_test, y_test))

In [None]:
# Lasso regression with scikit-learn's default settings, fitted on the
# existing train split.
reg2 = linear_model.Lasso()
reg2.fit(x_train, y_train)

# `y_pred` keeps its original meaning: predictions on the training split.
y_pred = reg2.predict(x_train)
print("MAE (train):", mean_absolute_error(y_train, y_pred))
print("RMSE (train):", np.sqrt(mean_squared_error(y_train, y_pred)))

# Also report errors on the held-out split so they are comparable with the
# test R^2 below; every printed number is labelled with its split.
y_pred_test = reg2.predict(x_test)
print("MAE (test):", mean_absolute_error(y_test, y_pred_test))
print("RMSE (test):", np.sqrt(mean_squared_error(y_test, y_pred_test)))
print("R^2 (test):", reg2.score(x_test, y_test))

In [None]:
from sklearn.preprocessing import PolynomialFeatures

# Expand the features with all degree-2 polynomial terms.
pf = PolynomialFeatures(degree=2)
x_quad = pf.fit_transform(x)

# Split into dedicated *_quad names instead of overwriting x_train / x_test.
# Rebinding the shared names would make any earlier model cell fit on the
# quadratic features if it were re-run after this one. The seed is the one
# used for the linear models.
x_train_quad, x_test_quad, y_train_quad, y_test_quad = train_test_split(
    x_quad, y, random_state=0
)

# Ordinary least squares on the expanded features.
reg3 = LinearRegression().fit(x_train_quad, y_train_quad)

# `y_pred` keeps its original meaning: predictions on the training split.
y_pred = reg3.predict(x_train_quad)
print("MAE (train):", mean_absolute_error(y_train_quad, y_pred))
print("RMSE (train):", np.sqrt(mean_squared_error(y_train_quad, y_pred)))

# Also report errors on the held-out split so they are comparable with the
# test R^2 below; every printed number is labelled with its split.
y_pred_test = reg3.predict(x_test_quad)
print("MAE (test):", mean_absolute_error(y_test_quad, y_pred_test))
print("RMSE (test):", np.sqrt(mean_squared_error(y_test_quad, y_pred_test)))
print("R^2 (test):", reg3.score(x_test_quad, y_test_quad))