# Data Preprocessing / Data Wrangling

In [None]:
# Import the required libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Show the present working directory.
# A bare `pwd` only works through IPython's automagic and raises NameError when
# the cell is run as plain Python; os.getcwd() returns the same path everywhere.
import os
os.getcwd()

In [None]:
#load the dataset
data=pd.read_csv('insurance.csv')

In [None]:
data.head()

In [None]:
data.tail()

In [None]:
# checking dataset size
data.shape

In [None]:
# Checking the datatype of the parameter(columns)
data.info()

In [None]:
#To Know Column names
data.columns

In [None]:
#Statistical Analysis/Descriptive Statistics
data.describe()

In [None]:
data.mean()

In [None]:
data.median()

In [None]:
data.mode()

In [None]:
data.var()

In [None]:
data.std()

In [None]:
data.head()

In [None]:
#Checking the null values in the dataset
data.isna()

In [None]:
data.isnull().any()

In [None]:
data.isna().sum()

In [None]:
data.isnull()

In [None]:
data.isnull().sum()

In [None]:
# If a numerical column has null values, we replace them with the column's mean or median.
# If a categorical column has null values, we replace them with the column's mode.

In [None]:
# In case the age column has missing values, fill them with the column mean.
# The result is assigned back to the column: calling fillna(..., inplace=True)
# on data['age'] is chained assignment, which pandas 2.x deprecates and which
# leaves `data` unchanged once Copy-on-Write is enabled.
data['age'] = data['age'].fillna(data['age'].mean())

In [None]:
#Checking the value counts for categorical Column
data.region.value_counts()

In [None]:
data.sex.value_counts()

In [None]:
data['smoker'].value_counts()

# Data Visualization

In [None]:
sns.distplot(data['age'])

In [None]:
sns.lineplot(x='age',y='charges', data=data)

In [None]:
plt.pie(data.smoker.value_counts(),colors=['green','red'], labels=['No','Yes'],autopct='%.1f%%')
plt.title('Smoker')

In [None]:
(data.region.value_counts()).index

In [None]:
data.region.value_counts()

In [None]:
a=data.region.value_counts().index

In [None]:
a

In [None]:
b=data.region.value_counts()

In [None]:
b

In [None]:
sns.barplot(x=a, y=b)

In [None]:
# Same bar chart built in one cell. x and y are passed by keyword: seaborn 0.12+
# accepts only `data` positionally, so two positional vectors raise TypeError.
region_counts = data.region.value_counts()
sns.barplot(x=region_counts.index, y=region_counts)

In [None]:
# Scatter plot of charges against age
sns.scatterplot(x='age', y='charges', data=data)
plt.title('Age/Charges')

In [None]:
plt.pie(data.sex.value_counts(),[0,0.2],colors=['orange','yellow'],labels=['Male','Female'],autopct='%.1f%%' )




In [None]:
data.hist(figsize=(10,8))

In [None]:
plt.pie(data.region.value_counts(),[0.5,0,0.2,0], colors=['red','blue','yellow','green'],labels=['southeast','southwest','northwest','norteast'],autopct='%.1f%%')

# Outlier Detection

In [None]:
sns.boxplot(data.bmi)

In [None]:
perc99=data.bmi.quantile(0.99)
perc99

In [None]:
data=data[data.bmi<=perc99]
sns.boxplot(data.bmi)

# Encoding Techniques

In [None]:
data.head()

# Label Encoding

In [None]:
from sklearn.preprocessing import LabelEncoder

In [None]:
le=LabelEncoder()

In [None]:
data.sex=le.fit_transform(data.sex)

In [None]:
data.head()

In [None]:
data.smoker=le.fit_transform(data.smoker)

In [None]:
data.head()

# One Hot Encoding

In [None]:
data_min=pd.get_dummies(data,columns=['region'])

In [None]:
data_min.head()

# Correlation

In [None]:
data_min.corr()

# Heatmap

In [None]:
plt.figure(figsize=(10,8))
sns.heatmap(data_min.corr(),annot=True)

In [None]:
data_min.corr().charges.sort_values(ascending=False)

In [None]:
data_min.head()

# X and Y Split

In [None]:
# Y is dependent variable(target)

In [None]:
y=data_min['charges']

In [None]:
y.head()

In [None]:
# X is Independent variable or predictors
X=data_min.drop(columns=['charges'],axis=1)

In [None]:
X.head()

# Scaling

In [None]:
# Standard Scaling  -mean=0 and Std=1
# Minmax Scaling ....values after scaling are between 0 and 1

In [None]:
names=X.columns

In [None]:
names

In [None]:
from sklearn.preprocessing import MinMaxScaler

In [None]:
scale=MinMaxScaler()

In [None]:
X_scaled=scale.fit_transform(X)

In [None]:
X_scaled

In [None]:
X=pd.DataFrame(X_scaled, columns=names)

In [None]:
X.head()

In [None]:
from  sklearn.model_selection import train_test_split

In [None]:
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.2,random_state=0)

In [None]:
X_train.head()

In [None]:
X_test.head()

In [None]:
y_train

In [None]:
y_test