In [None]:
import pandas as pd
# Read the insurance CSV into a DataFrame and preview its first rows.
csv_path = '../input/insurance/insurance.csv'
dataset = pd.read_csv(csv_path)
dataset.head()

In [None]:
# Print the DataFrame.info() summary of the freshly loaded data.
dataset.info()

In [None]:
# Show the DataFrame.describe() summary statistics for the loaded data.
dataset.describe()

In [None]:
# Skewness of each numeric column. numeric_only=True is needed because the
# frame also holds text columns (sex, smoker, region): pandas >= 2.0 no longer
# drops them silently and raises a TypeError instead.
dataset.skew(numeric_only=True)

In [None]:
# Count missing values per column and list the most affected columns first.
missing_per_column = dataset.isna().sum()
missing_per_column.sort_values(ascending=False)

In [None]:
!pip install pandas-profiling

In [None]:
from pandas_profiling import ProfileReport
# Build a profiling report over the whole dataset, titled "EDA Report".
profile = ProfileReport(dataset, title="EDA Report")
# Bare expression: hands the report object to the notebook for display.
profile

In [None]:
# Discard exact duplicate rows (the first occurrence is kept), then preview.
dataset = dataset.drop_duplicates(keep='first')
dataset.head()

In [None]:
# Hold-out split: a reproducible 99% random sample is used for modeling and
# the rows that were not sampled are kept aside as unseen data.
modeling_rows = dataset.sample(frac=0.99, random_state=42)
# The unseen rows must be selected before the sampled index is discarded.
data_unseen = dataset.drop(modeling_rows.index).reset_index(drop=True)
data = modeling_rows.reset_index(drop=True)
print(f'Data for Modeling: {data.shape}')

In [None]:
# Report how many rows/columns were held back for the final prediction check.
print(f'Unseen Data For Predictions: {data_unseen.shape}')

In [None]:
!pip install pycaret

In [None]:
from pycaret.regression import * 

In [None]:
# Print the DataFrame.info() summary of the modeling subset.
data.info()

In [None]:
# Display the column labels of the modeling subset.
data.columns

In [None]:
# Collect the names of all object-dtype columns, then report how many there
# are followed by one name per line.
categorical = [col for col in data.columns if data[col].dtype == 'object']
print("Categorical Attribute : {}\n ".format(len(categorical)))
for column_name in categorical:
    print(column_name)

In [None]:
# Number of distinct values in each of the object-dtype columns found above.
data.loc[:, categorical].nunique()

In [None]:
# Configure the PyCaret regression experiment on the modeling data, with
# 'charges' as the target and 80% of the rows used for training.
insurance = setup(
    data=data,
    target='charges',
    train_size=0.8,
    session_id=1,
    # Declare which columns are numeric and which are categorical.
    numeric_features=['age', 'bmi', 'children'],
    categorical_features=['sex', 'smoker', 'region'],
    # Rescale the values of the numeric columns, using the 'robust' method.
    normalize=True,
    normalize_method='robust',
    transform_target=True,
    # Unknown categorical levels are handled with the 'most_frequent' method.
    handle_unknown_categorical=True,
    unknown_categorical_method='most_frequent',
    combine_rare_levels=True,
    # Drop one of any two features that are highly correlated with each other.
    remove_multicollinearity=True,
    # Remove categorical features with statistically insignificant variance.
    ignore_low_variance=True,
)

In [None]:
# Run compare_models() with its default arguments and keep the result as `best`.
best=compare_models()

In [None]:
# Create the model registered under the ID 'rf'
# (presumably a random forest regressor — confirm against PyCaret's model IDs).
rf= create_model('rf')

In [None]:
# Print the created model's text representation.
print(rf)

In [None]:
# Tune the 'rf' model, asking tune_model to optimize the 'MAE' metric.
tuned_rf = tune_model(rf,optimize = 'MAE')

In [None]:
# Graphical evaluation plots for the tuned model.
evaluate_model(tuned_rf)

In [None]:
# Evaluate the tuned model on the test data; no `data` argument is passed, so
# this presumably uses the hold-out created by setup(train_size=0.8) — confirm.
predict_model(tuned_rf)

In [None]:
# Finalize the tuned model to obtain the final model.
final_rf = finalize_model(tuned_rf)
# Bare expression: displays the finalized estimator.
final_rf

In [None]:
# Evaluate the finalized model on the test data.
# NOTE(review): if finalize_model refits on the full modeling data (including
# the test split), these scores would be optimistic — confirm.
predict_model(final_rf)

In [None]:
# Score the held-back rows (data_unseen) with the finalized model and preview
# the returned frame.
unseen_predictions = predict_model(final_rf, data=data_unseen)
unseen_predictions.head()

In [None]:
from pycaret.utils import check_metric
# MAE between the actual charges and the `Label` column of the unseen-data predictions.
check_metric(unseen_predictions['charges'], unseen_predictions['Label'], 'MAE')