In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load



# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os

# Walk the Kaggle input directory and print the full path of every file found
for root, _, files in os.walk('/kaggle/input'):
    for path in (os.path.join(root, entry) for entry in files):
        print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Pandas
import pandas as pd 

# Matplotlib
import matplotlib.pyplot as plt 
plt.style.use('fivethirtyeight')

# Sklearn
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression,Lasso,Ridge
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import classification_report

# XGBOOST
from xgboost import XGBRFRegressor

# Seaborn
import seaborn as sns

# Warnings
import warnings
warnings.filterwarnings('ignore')

# Data collection

## Read data.

In [None]:
# Load the medical premium CSV into a DataFrame and preview the first rows
csv_path = '../input/medical-insurance-premium-prediction/Medicalpremium.csv'
data = pd.read_csv(csv_path)
data.head()

# Exploratory Data Analysis (or EDA)

In [None]:
# Show the column labels of the loaded frame
data.columns

## Check null values

In [None]:
# Count missing values per column (`isna` is the same method as `isnull`)
data.isna().sum()

## Inference
### There are no null records present in our dataset.

## Heatmap

In [None]:
# Pairwise correlation between the columns, drawn as an annotated heatmap
correlations = data.corr()
plt.figure(figsize=(14, 8))
sns.heatmap(
    correlations,
    annot=True,
    cmap='coolwarm',
    linewidths=0.1,
)
plt.title("Heatmap for correlation between columns")
plt.show()

## Check datatypes of columns

In [None]:
# Print the frame summary (columns, non-null counts and dtypes)
data.info()

In [None]:
# Summary statistics for the Age column
data["Age"].describe()

# Data visualizations

In [None]:
# Histogram of patient ages, drawn through the explicit figure/axes interface
fig, ax = plt.subplots(figsize=(10, 5))
ax.hist(data["Age"], edgecolor='k')
ax.set_xlabel("Age")
ax.set_ylabel("Count")
ax.set_title("Distribution of Age");

In [None]:
# Distribution plot of the Height column; the title is set on the grid's single axes
height_grid = sns.displot(data["Height"])
height_grid.ax.set_title("Distribution of height");

## Inference:
## The distribution of patient heights is right-skewed, centred around 168, with no outliers.

In [None]:
# Distribution plot of the Weight column.
# The title previously read "Distribution of height" (copy-paste slip from the
# height cell); it now names the variable that is actually plotted.
sns.displot(data.Weight)
plt.title("Distribution of weight");

## Inference:
## The distribution of patient weights is left-skewed, centred around 75.

In [None]:
# Show the column labels again (same expression as the earlier `data.columns` cell)
data.columns

## Pairplots

In [None]:

# Scatter-matrix of the columns, coloured by PremiumPrice, with KDE curves on the diagonal
pair_grid = sns.pairplot(
    data=data,
    hue='PremiumPrice',
    kind="scatter",
    diag_kind="kde",
    palette="husl",
)
plt.show()

## Dependent and Independent Features

In [None]:
# Target: the premium price column; features: every other column
y = data['PremiumPrice']
X = data.drop(columns='PremiumPrice')

# Normalization

## The code below uses `StandardScaler`, which standardizes each selected input variable separately to zero mean and unit variance (unlike min-max normalization, which rescales each variable to the range 0-1).

In [None]:
# Standardize Age, Height and Weight one column at a time. The same scaler
# object is re-fitted for each column, so after this cell it holds the
# statistics of the last column (Weight) only.
# NOTE(review): the scaler is fitted on all rows before the train/test split,
# so test rows influence the scaling statistics - confirm this is acceptable.
scalar = StandardScaler()
for column in ('Age', 'Height', 'Weight'):
    X[column] = scalar.fit_transform(X[[column]])


## To get a reliable estimate of prediction quality, divide the data into a training set and a testing set: the model is fitted on the training points and then evaluated on the held-out testing points. Repeat this train-and-test cycle until the results are satisfactory.


In [None]:

# Hold out 20% of the rows for testing; the seed is fixed so the split is repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=43,
)

# Model

## The model phase is where we implement a variety of Machine Learning algorithms to predict a certain outcome.

In [None]:
# Candidate regressors to compare.
# The dict is keyed by the estimator instance and maps to its display name;
# the evaluation cell iterates `models.items()` as (model, name) pairs, so
# this orientation must be kept.
# RandomForestRegressor is seeded so its fit (and therefore its score) is
# reproducible across Restart & Run All; 43 matches the seed used for the
# train/test split.
models = {
    LinearRegression(): 'Linear Regression',
    Lasso(): 'Lasso',
    Ridge(): 'Ridge',
    XGBRFRegressor(): 'XGBRFRegressor',
    RandomForestRegressor(random_state=43): 'RandomForest',
}

# Fit every candidate on the training split
for m in models:
    m.fit(X_train, y_train)


# iNterpret

## To determine how well a model is performing, we often validate its performance on new unseen instances that were not available to the model during training

In [None]:
# Evaluate every fitted model on the held-out test split.
# For regressors, `.score` returns the coefficient of determination (R^2),
# not a classification accuracy, and it can be negative - so it is reported
# as a plain R^2 value instead of an "accuracy" percentage.
for model, name in models.items():
    print(f"R^2 score for {name} is : {model.score(X_test, y_test):.4f}")

# Finding Important Features in Scikit-learn

## 1) Random Forest

In [None]:
# Fit a dedicated random forest and chart the importance it assigns to each feature.
# random_state is fixed so the importances (and the chart) are reproducible
# across Restart & Run All; 43 matches the seed used for the train/test split.
random_forest = RandomForestRegressor(random_state=43)
random_forest.fit(X_train, y_train)
feature_imp1 = random_forest.feature_importances_

# One horizontal bar per feature column
sns.barplot(x=feature_imp1, y=X.columns)
plt.xlabel('Feature Importance Score')
plt.ylabel('Features')
plt.title("Visualizing Important Features")
plt.show();

## 2) XGBRFRegressor (XGBoost random forest)

In [None]:
# Fit a fresh XGBRFRegressor and chart the importance it assigns to each feature
xgboost = XGBRFRegressor()
xgboost.fit(X_train, y_train)
feature_imp2 = xgboost.feature_importances_

# barplot returns the axes it drew on; label them in a single call
importance_ax = sns.barplot(x=feature_imp2, y=X.columns)
importance_ax.set(
    xlabel='Feature Importance Score',
    ylabel='Features',
    title="Visualizing Important Features",
)
plt.show();