# Medical Cost 
  > insurance dataset

In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
import plotly.express as px

In [None]:
# Load the insurance dataset from the notebook's input directory.
DATA_PATH = "../input/insurance/insurance.csv"
df = pd.read_csv(DATA_PATH)

In [None]:
# Display the loaded DataFrame.
df

In [None]:
# (rows, columns) of the dataset.
df.shape

In [None]:
# Column dtypes and non-null counts.
df.info()

In [None]:
# Summary statistics for the dataset's columns.
df.describe()

In [None]:
# Number of missing values in each column.
df.isnull().sum()

In [None]:
# Distribution of charges with a KDE overlay.
# displot is a figure-level function that creates its own figure, so the
# size is set through height/aspect (6 in tall, 12 in wide) instead of a
# preceding plt.figure call, which would only leave an empty figure behind.
sns.displot(data=df, x='charges', kde=True, height=6, aspect=2)

In [None]:
# Share of each smoker category as a percentage of all rows.
smoker_pct = (df['smoker'].value_counts() / len(df) * 100).round(2)
smoker_pct.plot.pie(autopct='%1.1f%%', colors=['#33cccc', '#ff6666'])

In [None]:
# Strip plot of charges for each smoker category.
sns.catplot(x='smoker', y='charges', kind='strip', data=df)

In [None]:
# Relation between smoker and charges (sex is shown on hover).
fig = px.scatter(
    df,
    x='smoker',
    y='charges',
    opacity=0.8,
    hover_data=['sex'],
    title='smoker vs. Charges',
).update_traces(marker_size=5)
fig.show()

In [None]:
# Bar plot of charges per smoker category, with vertical tick labels.
sns.barplot(data=df, x='smoker', y='charges')
plt.xticks(rotation='vertical')
plt.show()

In [None]:
# Age histogram (47 bins requested) with a box plot in the margin.
fig = px.histogram(
    df,
    x='age',
    nbins=47,
    marginal='box',
    title='Distribution of Age',
).update_layout(bargap=0.1)
fig.show()

In [None]:
# Relation between age and charges, coloured by smoker status.
# No plt.figure call here: the chart is a plotly figure, so a matplotlib
# figure would not size it and would only be left behind empty.
fig = px.scatter(df,
                 x='age',
                 y='charges',
                 color='smoker',
                 opacity=0.8,
                 hover_data=['sex'],
                 title='Age vs. Charges')
fig.update_traces(marker_size=5)
fig.show()

In [None]:
# BMI histogram in red with a box plot in the margin.
fig = px.histogram(
    df,
    x='bmi',
    marginal='box',
    color_discrete_sequence=['red'],
    title='Distribution of BMI (Body Mass Index)',
).update_layout(bargap=0.1)
fig.show()

In [None]:
# Relation between BMI and charges, coloured by smoker status.
fig = px.scatter(
    df,
    x='bmi',
    y='charges',
    color='smoker',
    opacity=0.8,
    hover_data=['sex'],
    title='BMI vs. Charges',
).update_traces(marker_size=5)
fig.show()

In [None]:
# Number of rows for each value of the children column.
sns.catplot(data=df, x="children", kind="count", palette="ch:.25")

In [None]:
# Relation between number of children and charges, coloured by smoker status.
fig = px.scatter(
    df,
    x='children',
    y='charges',
    color='smoker',
    opacity=0.8,
    hover_data=['sex'],
    title='Children vs. Charges',
).update_traces(marker_size=5)
fig.show()

In [None]:
# Share of each sex category as a percentage of all rows.
sex_pct = (df['sex'].value_counts() / len(df) * 100).round(2)
sex_pct.plot.pie(autopct='%1.1f%%', colors=['skyblue', 'pink'])

In [None]:
# Strip plot of charges for each sex category.
sns.catplot(x='sex', y='charges', kind='strip', data=df)

In [None]:
# Strip plot of charges per sex category, split by smoker status.
sns.catplot(x='sex', y='charges', hue='smoker', kind='strip', data=df)

In [None]:
# Relation between region and charges (strip plot).
sns.catplot(x='region', y='charges', kind='strip', data=df)

In [None]:
# Strip plot of charges per region, split by smoker status.
sns.catplot(x='region', y='charges', hue='smoker', kind='strip', data=df)

In [None]:
def normalize(col, frame=None):
    """Min-max scale one column to the [0, 1] range.

    Args:
        col: Name of the column to scale.
        frame: DataFrame holding the column. Defaults to the notebook-level
            ``df`` so existing ``normalize('charges')`` calls keep working.

    Returns:
        A new Series of scaled values; the source frame is not modified.
        A constant column (max == min) maps to all zeros rather than the
        NaN values that 0/0 would produce.
    """
    values = (df if frame is None else frame)[col]
    low = values.min()
    span = values.max() - low
    if span == 0:
        # Every value equals the minimum, so each one scales to 0.
        return (values - low).astype(float)
    return (values - low) / span

In [None]:
# Show the min-max scaled charges. The returned Series is only displayed;
# it is not assigned, so df['charges'] keeps its original values.
normalize('charges')

In [None]:
# Display the charges column as stored in df (the normalize call above did
# not write its result back).
df['charges']

In [None]:
# Number of rows per region value.
df['region'].value_counts()

In [None]:
from sklearn.preprocessing import LabelEncoder

In [None]:
# Replace each listed column with integer codes, keeping the fitted encoder
# per column in LE so it can be reused afterwards.
categorical_cols = ['sex', 'smoker', 'region']
LE = {name: LabelEncoder() for name in categorical_cols}
for name, encoder in LE.items():
    df[name] = encoder.fit_transform(df[name])

In [None]:
# Display the DataFrame after label encoding.
df

In [None]:
# Annotated heatmap of the pairwise column correlations.
plt.figure(figsize=(10, 8))
corr_matrix = df.corr()
sns.heatmap(corr_matrix, annot=True, cbar=True)

    - charges & smoker have a strong positive correlation
    - charges & age have a weak positive correlation
    - charges & bmi have a weak positive correlation
    - charges & children have a very weak positive correlation
    - charges & sex have a very weak positive correlation
    - charges & region have a negligible negative correlation

# Test & Train

In [None]:
# Target is the charges column; every other column is a feature.
y = df['charges']
x = df.drop(columns='charges')

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Hold out 15% of the rows for testing; the seed is fixed so the split is
# repeatable.
X_train, X_test, Y_train, Y_test = train_test_split(
    x,
    y,
    test_size=0.15,
    random_state=0,
)

In [None]:
# (rows, columns) of the training features.
X_train.shape

In [None]:
# (rows, columns) of the test features.
X_test.shape

## Decision Tree

In [None]:
from sklearn.tree import DecisionTreeRegressor

In [None]:
# Tree limited to depth 5 that considers 5 features per split; seeded.
dt = DecisionTreeRegressor(
    max_depth=5,
    max_features=5,
    random_state=123,
)

In [None]:
# Fit the decision tree on the training split.
dt.fit(X=X_train, y=Y_train)

In [None]:
# Decision tree score on the training split, shown as a percentage.
dt_train_score = dt.score(X_train, Y_train)
print('Train score : ', dt_train_score * 100)

In [None]:
# Decision tree score on the held-out split, shown as a percentage.
dt_test_score = dt.score(X_test, Y_test)
print("Test score : ", dt_test_score * 100)

In [None]:
# One hand-written sample used to compare the models' predictions.
# It is wrapped in a DataFrame carrying the training column names so that
# estimators fitted on a DataFrame do not warn about missing feature names.
# NOTE(review): the values are matched positionally to X_train's columns;
# confirm each one is a valid value for its column (in particular that the
# label-encoded columns use codes the fitted encoders actually produced).
new = pd.DataFrame([[24, 0, 30.5, 2, 1, 4]], columns=X_train.columns)
dt.predict(new)

__________
_________
# **`The best model for this data`**
____________
_________

# Random Forest Regressor

In [None]:
from sklearn.ensemble import RandomForestRegressor

# 80 trees, each at most 6 levels deep and trying 5 features per split; seeded.
rf = RandomForestRegressor(
    n_estimators=80,
    max_depth=6,
    max_features=5,
    random_state=123,
)

In [None]:
# Fit the random forest on the training split.
rf.fit(X=X_train, y=Y_train)

In [None]:
# Random forest score on the training split, shown as a percentage.
rf_train_score = rf.score(X_train, Y_train)
print('Train score : ', rf_train_score * 100)

In [None]:
# Random forest score on the held-out split, shown as a percentage.
rf_test_score = rf.score(X_test, Y_test)
print("Test score : ", rf_test_score * 100)

In [None]:
# Random forest prediction for the sample stored in `new`.
rf.predict(new)

_____
_____

# Linear Regression

In [None]:
from sklearn.linear_model import LinearRegression

In [None]:
# Ordinary least-squares linear model with default settings.
# NOTE(review): sex, smoker and region were replaced by LabelEncoder integer
# codes earlier, so this model treats those codes as numeric magnitudes;
# consider one-hot encoding for multi-category columns.
lr=LinearRegression()

In [None]:
# Fit the linear regression on the training split.
lr.fit(X=X_train, y=Y_train)

In [None]:
# Linear regression score on the training split, shown as a percentage.
lr_train_score = lr.score(X_train, Y_train)
print('Train score : ', lr_train_score * 100)

In [None]:
# Linear regression score on the held-out split, shown as a percentage.
lr_test_score = lr.score(X_test, Y_test)
print("Test score : ", lr_test_score * 100)

In [None]:
# Linear regression prediction for the sample stored in `new`.
lr.predict(new)

# SVR

In [None]:
from sklearn.svm import SVR

In [None]:
# An RBF-kernel SVR is sensitive to the scale of both the features and the
# target, so the features are standardised inside a pipeline and the model is
# fitted on a standardised target. TransformedTargetRegressor maps predictions
# back to the original target units, so fit/score/predict are used as before.
from sklearn.compose import TransformedTargetRegressor
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

svr = TransformedTargetRegressor(
    regressor=make_pipeline(StandardScaler(), SVR(kernel='rbf')),
    transformer=StandardScaler(),
)

In [None]:
# Fit the support vector regressor on the training split.
svr.fit(X=X_train, y=Y_train)

In [None]:
# SVR score on the training split, shown as a percentage.
svr_train_score = svr.score(X_train, Y_train)
print('Train score : ', svr_train_score * 100)

In [None]:
# SVR score on the held-out split, shown as a percentage.
svr_test_score = svr.score(X_test, Y_test)
print("Test score : ", svr_test_score * 100)

In [None]:
# SVR prediction for the sample stored in `new`.
svr.predict(new)

# KNN Regressor

In [None]:
from sklearn.neighbors import KNeighborsRegressor
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

# Nearest neighbours are found by distance over the raw feature values, so
# columns with a large numeric range would dominate small-range ones such as
# the 0/1 encoded columns. Standardising the features first puts every column
# on a comparable scale. The pipeline exposes the same fit/score/predict
# methods as the bare regressor.
kreg = make_pipeline(StandardScaler(), KNeighborsRegressor(n_neighbors=25))

In [None]:
# Fit the k-nearest-neighbours regressor on the training split.
kreg.fit(X=X_train, y=Y_train)

In [None]:
# KNN score on the training split, shown as a percentage.
kreg_train_score = kreg.score(X_train, Y_train)
print('Train score : ', kreg_train_score * 100)

In [None]:
# KNN score on the held-out split, shown as a percentage.
kreg_test_score = kreg.score(X_test, Y_test)
print("Test score : ", kreg_test_score * 100)

In [None]:
# KNN prediction for the sample stored in `new`.
kreg.predict(new)

________
________
________



### **`Samsung Innovation Campus || SIC`**
_______
  >   - Hadeer Emad
  >   - Abdelrahman Ehab
  >   - Zyad Farag