### Step1: Import the necessary libraries

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.stats as stats
import statsmodels.stats.proportion as stats_pro
# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Apply seaborn's plot styling, with color_codes enabled, to all figures below.
sns.set(color_codes=True)

import os
# Walk the Kaggle input directory and print the full path of every file found.
for folder, _subfolders, names in os.walk('/kaggle/input'):
    for name in names:
        full_path = os.path.join(folder, name)
        print(full_path)


### Step2: Read the data as a dataframe

In [None]:
# Load the insurance CSV from the Kaggle input directory into a dataframe.
insurance=pd.read_csv('/kaggle/input/insurance/insurance.csv')
# Preview the first rows as the cell output.
insurance.head()

### Step3: Perform basic EDA which should include the following and print out your insights at every step

#### Step3a: Shape of the data

In [None]:
# (rows, columns) of the dataframe, reported in a single sentence.
shape_insurance = insurance.shape
n_rows, n_cols = shape_insurance
print(
    'The shape of the dataframe insurance is', shape_insurance,
    'which means there are', n_rows, 'rows and', n_cols, 'columns.',
)

#### step 3b. Data type of each attribute

In [None]:
# The data type of each column can be found through the info() method,
# which also prints the non-null count per column.
insurance.info()

In [None]:
# The data type can also be read per column through the dtype attribute.
# One line is printed for each attribute, in the order listed here.
for column in ('age', 'sex', 'bmi', 'children', 'smoker', 'region', 'charges'):
    print(f'The data type of attribute {column} is', insurance[column].dtype)

#### step 3c. Checking the presence of missing values.<br>
#### There are multiple ways to do it:
##### 1. Refer to the non-null count column in the output of the info() command above. It is clear that there are no null values, since there are 1338 rows in total and every column reports 1338 non-null values.
##### 2. Another way to find missing values is to call isnull() on the dataframe

In [None]:
# Count the missing entries of every column, then print them inside the fixed message.
missing_per_column = insurance.isnull().sum()
print(
    'The missing values in the dataframe Insurance are:', '\n',
    missing_per_column, '\n',
    'which means there are no null values in the dataset',
)

##### 3. Another way to find missing values is leveraging the code isnull on the individual column in the dataframe

In [None]:
# Another way to find the missing values: count the nulls of each column individually.
# The column name is interpolated into the message so that every line is labelled
# with the attribute it actually measures.
for column in ('age', 'sex', 'bmi', 'children', 'smoker', 'region', 'charges'):
    print(f'The missing values in the attribute {column} are', insurance[column].isnull().sum())


#### Step 3.d 5 point summary of numerical attributes.

In [None]:
# Summary statistics from describe(), transposed so each attribute is one row.
insurance.describe().T

In [None]:
# Keep only the five-point-summary rows of describe() for the four numerical
# attributes, then transpose so that each attribute becomes one row.
five_point_rows = ['min', '25%', '50%', '75%', 'max']
numeric_columns = ['age', 'bmi', 'children', 'charges']
insurance_5pt = insurance.describe().loc[five_point_rows, numeric_columns].T
print('The 5 point summary of numerical attribute is:', '\n', insurance_5pt)

1. The range of age is [18,64] with a median of 39. At this point, it doesn't appear that there are any outliers; we will try to confirm this with a box plot.<br>
2. The range of bmi is [15.96,53.13] with a median of 30.4. At this point, it doesn't appear that there are any outliers; we will try to confirm this with a box plot. <br>
3. The range of children is [0,5] with a median of 1. The 25th percentile is 0 and the median is 1, which suggests that few people have more than 2 children. <br>
4. The range of charges is [1121.8,63770.4] with a median of 9382. The gap between the median and the maximum value is large, which suggests that there may be outliers and that the data may be skewed. We will try to confirm this with a box plot and a skewness analysis.

#### Step 3e Distribution of ‘bmi’, ‘age’ and ‘charges’ columns.

In [None]:
# Histogram of the bmi column with matplotlib's default binning.
fig, ax = plt.subplots()
ax.hist(insurance['bmi'])
ax.set_xlabel('bmi')
ax.set_ylabel('count')
ax.set_title('Distribution of BMI')
plt.show()

In [None]:
# Histogram of the age column with matplotlib's default binning.
fig, ax = plt.subplots()
ax.hist(insurance['age'])
ax.set_xlabel('age')
ax.set_ylabel('count')
ax.set_title('Distribution of Age')
plt.show()

In [None]:
# Histogram of the charges column with matplotlib's default binning.
fig, ax = plt.subplots()
ax.hist(insurance['charges'])
ax.set_xlabel('charges')
ax.set_ylabel('count')
ax.set_title('Distribution of Charges')
plt.show()

#### 3f. Measure of skewness of ‘bmi’, ‘age’ and ‘charges’ columns

In [None]:
# Sample skewness of each attribute, rounded to 4 decimals (computed in the order bmi, age, charges).
skewness_bmi, skewness_age, skewness_charges = (
    round(stats.skew(insurance[column]), 4)
    for column in ('bmi', 'age', 'charges')
)

print(
    ' The skewness of bmi is', skewness_bmi, '\n',
    'The skewness of age is', skewness_age, '\n',
    'The skewness of charges is', skewness_charges,
)

bmi has low skewness<br>
Age has negligible skewness<br>
Charges seem highly skewed.

#### 3g. Checking the presence of outliers in ‘bmi’, ‘age’ and ‘charges columns'

In [None]:
# Box plot of bmi to check for outliers.
# The series is passed explicitly as `x`: a positional argument is interpreted as `x`
# by older seaborn releases but as `data` by newer ones, which changes how the plot is
# drawn. With `x=` the box is horizontal, matching the "on the right" wording below.
bmi_boxplot = sns.boxplot(x=insurance['bmi'])
print(' As seen in the previous step, skewness is very less for BMI', '\n','checking if there are any outliars by ploting a box plot.','\n' ,' As seen in the chart below,There are outliars on the right.','\n')
plt.show()

In [None]:
# Box plot of age to check for outliers.
# The series is passed explicitly as `x`: a positional argument is interpreted as `x`
# by older seaborn releases but as `data` by newer ones, which changes how the plot is
# drawn. With `x=` the box is horizontal on every version.
age_boxplot = sns.boxplot(x=insurance['age'])
print(' As checked in the previous step, there is negligible skewness in age.', '\n','Checking if there are any outliars by ploting a box plot.','\n' ,'As seen in the chart below, there doesnt seem to be an outliar.')
plt.show()

In [None]:
# Box plot of charges to check for outliers.
# The series is passed explicitly as `x`: a positional argument is interpreted as `x`
# by older seaborn releases but as `data` by newer ones, which changes how the plot is
# drawn. With `x=` the box is horizontal, matching the "on the right" wording below.
charges_boxplot = sns.boxplot(x=insurance['charges'])
print(' As seen in the above step, charges have high skewness.','\n' ,'Checking if there are any outliars by ploting a box plot.',  '\n' ,'There are outliars on the right.')
plt.show()

#### 3h. Distribution of categorical columns (include children) 

In [None]:
# The categorical columns are sex, smoker, region and children.
# Distribution of sex.
# The column is passed as `x=` because seaborn versions disagree on what a
# positional first argument means (`x` in older releases, `data` in newer ones).
sns.countplot(x=insurance['sex'])
plt.xlabel('Gender')
plt.ylabel('count')
plt.title('Distribution of genders')
plt.show()

In [None]:
# Distribution of smoker.
# The column is passed as `x=` because seaborn versions disagree on what a
# positional first argument means (`x` in older releases, `data` in newer ones).
sns.countplot(x=insurance['smoker'])
plt.xlabel('smoker')
plt.ylabel('count')
plt.title('Distribution of smoker')
plt.show()

In [None]:
# Distribution of region.
# The column is passed as `x=` because seaborn versions disagree on what a
# positional first argument means (`x` in older releases, `data` in newer ones).
sns.countplot(x=insurance['region'])
plt.xlabel('region')
plt.ylabel('count')
plt.title('Distribution of region')
plt.show()

In [None]:
# Distribution of children.
# The column is passed as `x=` because seaborn versions disagree on what a
# positional first argument means (`x` in older releases, `data` in newer ones).
sns.countplot(x=insurance['children'])
plt.xlabel('children')
plt.ylabel('count')
plt.title('Distribution of children')
plt.show()

#### 3i. Pair plot that includes all the columns of the data frame

In [None]:
# The pair plot is built from a numerically encoded copy of the data: the three
# text columns handled here (sex, smoker and region) are replaced by their integer
# category codes. The original `insurance` frame is left untouched.
insurance_pp = insurance.assign(**{
    column: insurance[column].astype('category').cat.codes
    for column in ('sex', 'smoker', 'region')
})

In [None]:
# Pairwise plots over every column of the encoded copy; the trailing ';' hides the returned object's repr.
sns.pairplot(insurance_pp);

1. There seems to be a correlation between charges and age, since charges seem to increase as age increases. There are a few outliers, though.
2. Charges for smokers seem to be higher than for non-smokers.

### 4. Answer the following questions with statistical evidence

#### 4.a Do charges of people who smoke differ significantly from the people who don't?

step 1: state the null and alternate hypothesis <br>
Ho = Charges of people who smoke don't differ from the people who don't smoke <br>
Ha = Charges of people who smoke differ from the people who don't smoke

step 2: Decide the significance level <br>
here we select alpha = 0.05

step 3: Identify the test statistics <br>
Since in this scenario we are comparing two independent samples against each other, we can use a two-sample t-test for this problem. 

step 4: Calculate p value and t statistics

In [None]:
# Two-column array: column 0 holds charges, column 1 holds the smoker flag.
# NOTE(review): mixing a numeric and a text column presumably yields an object-dtype array - confirm.
id_smo_charges=np.array(insurance[['charges','smoker']])
# Display the array as the cell output.
id_smo_charges

In [None]:
## separating the charges paid by smokers and non-smokers

# Select straight from the dataframe with boolean masks. Each result is a 1-D array
# that keeps the dtype of the charges column (slicing a mixed charges/smoker array
# would not), and no name is bound first to a mask and then to the selected values.
is_smoker = insurance['smoker'] == 'yes'
is_non_smoker = insurance['smoker'] == 'no'

# charges paid by smokers
smo_charges = insurance.loc[is_smoker, 'charges'].to_numpy()

# charges paid by non-smokers
non_smo_charges = insurance.loc[is_non_smoker, 'charges'].to_numpy()

In [None]:
# Independent two-sample t-test on smoker vs non-smoker charges.
# NOTE(review): ttest_ind is called with its defaults; if the two groups have clearly
# different variances, consider equal_var=False (Welch's test) - confirm with the data.
t_statistics, p_value = stats.ttest_ind(smo_charges,non_smo_charges)

In [None]:
# Show the t statistic and the p-value of the smoker vs non-smoker test.
print(t_statistics, p_value)

In [None]:
# With p_value below alpha (0.05) the null hypothesis is rejected, i.e. charges of
# smokers differ significantly from charges of non-smokers. The message text is fixed.
print(
    ' Two sample t-test p-value', p_value,
    'is significantly less than alpha (0.05).', '\n',
    'Hence the null hypothesis is rejected.', '\n',
    'Therefore charges of people who smoke differ from charges of people who dont smoke',
)

#### 4.b Does bmi of males differ significantly from that of females?

Step 1: State the null and alternate hypothesis<br>
Null Hypothesis Ho: BMI of males does not differ significantly from that of females <br>
Alternate Hypothesis Ha: BMI of males differs significantly from that of females

Step 2: Decide the significance level<br>
For this problem, the significance level (alpha) selected is 0.05

Step 3: Identify the test statistics <br>
Since in this problem we are comparing two independent samples against each other, we can use a two-sample t-test.

Step 4: Compute the test statistics and p value

In [None]:
# Two-column array: column 0 holds bmi, column 1 holds sex.
# NOTE(review): mixing a numeric and a text column presumably yields an object-dtype array - confirm.
bmi_sex=np.array(insurance[['bmi','sex']])
# Display the array as the cell output.
bmi_sex

In [None]:
# Split bmi by gender straight from the dataframe with boolean masks. Each result is
# a 1-D array that keeps the dtype of the bmi column (slicing a mixed bmi/sex array
# would not), and no name is bound first to a mask and then to the selected values.
is_male = insurance['sex'] == 'male'
is_female = insurance['sex'] == 'female'

# bmi of males
bmi_male = insurance.loc[is_male, 'bmi'].to_numpy()

# bmi of females
bmi_female = insurance.loc[is_female, 'bmi'].to_numpy()

In [None]:
# Independent two-sample t-test on male vs female bmi (ttest_ind defaults).
t_statistics, p_value = stats.ttest_ind(bmi_male,bmi_female)
# Show the t statistic and the p-value.
print(t_statistics,p_value)

In [None]:
# p-value is greater than alpha (0.05). Hence, we fail to reject the null hypothesis.
# NOTE(review): failing to reject only means there is no significant evidence of a
# difference; the closing "gender has no effect on BMI" wording overstates the result.
# The two adjacent string literals ('\n' and ' Hence, ...') are concatenated by Python.

print(' Two sample t-test p-value is',round(p_value,6),'which is more than alpha (0.05).','\n'' Hence, we fail to reject the null hypothesis; which means that gender has no effect on BMI.')

#### 4.c Is the proportion of smokers significantly different in different genders?

Step 1: Define the null hypothesis and alternate hypothesis <br>
Null hypothesis Ho: proportion of smokers is not significantly different in different genders <br>
Alternate hypothesis Ha: Proportion of smokers significantly different in different genders

Step 2: Establish the significance level <br>
For this problem, the significance level selected is 0.05 (alpha = 0.05)

Step 3: Identify the test statistics <br>
Since in this problem we are comparing the proportions of a category between two groups, we can use a two-sample test of proportions (z-test). 

Step 4: Compute the test statistic and p-value

In [None]:
# computing the number of males and females
# The counts are looked up by label. Indexing value_counts() with 0/1 depends on the
# frequency ordering of the categories and on positional fallback of integer keys on
# a label-indexed Series, which pandas has deprecated.
sex_counts = insurance['sex'].value_counts()
male_count = sex_counts['male']
female_count = sex_counts['female']
print(' The total number of males is',male_count,'\n','The total number of females is',female_count)

# computing the number of male and female smokers
# Boolean masks are combined and summed, so a group without smokers yields 0
# instead of raising, and the result does not depend on which answer is more frequent.
male = insurance['sex'] == 'male'
female = insurance['sex'] == 'female'
is_smoker = insurance['smoker'] == 'yes'
male_smoker = (male & is_smoker).sum()
female_smoker = (female & is_smoker).sum()
print(' The male smoker count is',male_smoker,'\n','The female smoker count is',female_smoker)
print(' The proportion of male smoker is',round(male_smoker/male_count,4),'\n','The proportion of female smoker is',round(female_smoker/female_count,4))

In [None]:
# Two-sample z-test of proportions: smokers per gender against the size of each gender group.
smoker_counts = [male_smoker, female_smoker]
group_sizes = [male_count, female_count]
test_statistics, p_value = stats_pro.proportions_ztest(count=smoker_counts, nobs=group_sizes)

# Not the last statement of the cell, so this bare tuple is evaluated but not displayed.
test_statistics, p_value

# Fixed conclusion text; only the rounded p-value is interpolated.
print(
    ' The p-value is', round(p_value, 4),
    'which is significantly lower than alpha(0.05).', '\n',
    'Hence, the null hypothesis is rejected.', '\n',
    'Therefore, the proportion of smokers differ significantly in genders.',
)


#### 4.d Is the distribution of bmi across women with no children, one child and two children, the same?

In [None]:
# Box plot of bmi grouped by number of children, with one box per sex within each group.
sns.boxplot(x="children", y="bmi", hue="sex", data=insurance)
plt.title('Distribution of BMI')
plt.show()

Referring to the graph, there are a few outliers. However, it is difficult to deduce whether the BMI across women with 0, 1 or 2 children is the same or not. We will try to analyse this statistically.

Step 1: Establish the null hypothesis <br>
Null hypothesis Ho= Distribution of bmi across women with no children, one child and two children is same. <br>
Alternate hypothesis Ha= Distribution of bmi across women with no children, one child and two children is not same.

Step 2: Define the significance level <br>
For this problem, the significance level selected is 0.05. Hence alpha = 0.05

Step 3: Identify the test statistics <br>
Here we have 3 groups, i.e. women with 0, 1 and 2 children, and we have to analyze whether the BMI of these 3 samples is the same or not. Analysis of variance can determine whether the means of these 3 samples are the same or different. Hence, the test identified for this problem is one-way ANOVA.

Step 4: Compute the test statistics

In [None]:
## Segregate the BMI of all females by their number of children (0, 1, 2).
# Columns of the array: 0 = sex, 1 = bmi, 2 = children.
bmi_sex = insurance[['sex', 'bmi', 'children']].to_numpy()

# Keep only the rows belonging to females.
female_rows = bmi_sex[:, 0] == 'female'
bmi_female = bmi_sex[female_rows]

# BMI values of females with zero, one and two children respectively.
children_of_females = bmi_female[:, 2]
z_bmi_female, o_bmi_female, t_bmi_female = (
    bmi_female[children_of_females == n_children, 1]
    for n_children in (0, 1, 2)
)

In [None]:
# One-way ANOVA across the three female groups (0, 1 and 2 children).
f_stat,p_value=stats.f_oneway(z_bmi_female,o_bmi_female,t_bmi_female)
# Report the F statistic and p-value rounded to 4 decimals.
print('The statistics computed is',round(f_stat,4),'and the p-value computed is',round(p_value,4))

In [None]:
# Fixed conclusion text for the ANOVA; only the rounded p-value is interpolated.
print(' The p-value is',round(p_value,4),', which is significantly larger than alpha(0.05).','\n','Hence we fail to reject the null hypothesis.','\n', 'Therefore, There is no significant evidence to conclude that BMI for women having 0,1 or 2 children is different.')