# Importing the dataset

In [2]:
import pandas as pd
# Load the insurance dataset from a CSV file (the path is relative, so it
# depends on the directory the notebook is run from).
df = pd.read_csv('../../datasets/insurance.csv')
# Preview the first rows (head() shows five by default).
df.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


## Creating a statistical summary
Using the `describe()` function, we can get a statistical summary of the dataset. This includes the count, mean, standard deviation, minimum and maximum values, and the quartiles (25%, 50%, 75%). By default only the numeric columns are summarized, which in this dataset are:
- age
- bmi
- children
- charges

In [3]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
df.describe()

Unnamed: 0,age,bmi,children,charges
count,1338.0,1338.0,1338.0,1338.0
mean,39.207025,30.663397,1.094918,13270.422265
std,14.04996,6.098187,1.205493,12110.011237
min,18.0,15.96,0.0,1121.8739
25%,27.0,26.29625,0.0,4740.28715
50%,39.0,30.4,1.0,9382.033
75%,51.0,34.69375,2.0,16639.912515
max,64.0,53.13,5.0,63770.42801


## Knowing the shape of the dataset

In [5]:
# Dimensions of the DataFrame as (number of rows, number of columns).
df.shape

(1338, 7)

In [7]:
# Number of non-null entries in the "age" column.
print(f"age: {df['age'].count()}")

age: 1338


## Getting information about the dataset

In [8]:
# Column labels of the DataFrame.
df.columns

Index(['age', 'sex', 'bmi', 'children', 'smoker', 'region', 'charges'], dtype='object')

In [10]:
# Non-null count for every column. Looping over df.columns replaces one
# hand-written print per column and keeps working if the columns change.
for column in df.columns:
    print(f"{column}: {df[column].count()}")

age: 1338
sex: 1338
bmi: 1338
children: 1338
smoker: 1338
region: 1338
charges: 1338


In [14]:
# Number of distinct values in every column. A single loop over df.columns
# replaces seven near-identical print statements.
print("Unique values in each column: ")
for column in df.columns:
    print(f"{column}: {df[column].nunique()}")

Unique values in each column: 
age: 47
sex: 2
bmi: 548
children: 6
smoker: 2
region: 4
charges: 1337


In [17]:
# Report, for every column, whether its dtype is numeric. A single loop over
# df.columns replaces seven near-identical print statements.
print("is numeric values in each column: ")
for column in df.columns:
    print(f"{column}: {pd.api.types.is_numeric_dtype(df[column])}")

is numeric values in each column: 
age: True
sex: False
bmi: True
children: True
smoker: False
region: False
charges: True


In [16]:
# dtype of every column. A single loop over df.columns replaces seven
# near-identical print statements.
print("Types of each column: ")
for column in df.columns:
    print(f"{column}: {df[column].dtype}")

Types of each column: 
age: int64
sex: object
bmi: float64
children: int64
smoker: object
region: object
charges: float64


In [18]:
# Number of missing (null/NaN) values in every column. A single loop over
# df.columns replaces seven near-identical print statements.
print("Total of null values for each column: ")
for column in df.columns:
    print(f"{column}: {df[column].isnull().sum()}")

Total of null values for each column: 
age: 0
sex: 0
bmi: 0
children: 0
smoker: 0
region: 0
charges: 0


## Knowing the data types

In [4]:
# Print a concise summary: index range, per-column non-null counts and dtypes, and memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1338 entries, 0 to 1337
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       1338 non-null   int64  
 1   sex       1338 non-null   object 
 2   bmi       1338 non-null   float64
 3   children  1338 non-null   int64  
 4   smoker    1338 non-null   object 
 5   region    1338 non-null   object 
 6   charges   1338 non-null   float64
dtypes: float64(2), int64(2), object(3)
memory usage: 73.3+ KB


### Importing numpy as np

In [19]:
import numpy as np

In [22]:
# Five-number summary of "charges": minimum, the three quartiles, maximum.
print(df["charges"].min())
for level in (0.25, 0.50, 0.75):
    # `level` is the fraction of observations lying below the printed value.
    print(df["charges"].quantile(level))
print(df["charges"].max())

# Measures of central tendency.
print(df["charges"].mean())
print(df["charges"].median())
# mode() returns every most-frequent value; only the first one is shown.
print(df["charges"].mode().values[0])

1121.8739
4740.28715
9382.033
16639.912515
63770.42801
13270.422265141257
9382.033
1639.5631


## Standard deviation
A measure of how spread out numbers are. A low standard deviation indicates that the numbers tend to be very close to the mean (also called the expected value) of the set, while a high standard deviation indicates that the numbers are spread out over a wider range.


In [23]:
# pandas: sample standard deviation (ddof=1 is the pandas default, i.e. divide by N - 1).
df["charges"].std(ddof=1)

12110.011236694001

In [24]:
# NumPy: population standard deviation (ddof=0 is the NumPy default, i.e. divide by N).
np.std(df["charges"], ddof=0)

12105.484975561612

## (AB) Normality: Skew, Kurtosis, QQ-plot
- Skewness is a measure of the asymmetry of the probability distribution of a real-valued random variable about its mean. The skewness value can be positive or negative, or even undefined. A distribution, or data set, is symmetric if it looks the same to the left and right of the center point.
    * A positive skewness indicates that there is a long tail in the distribution to the right of the mean. The mean and median will be greater than the mode.
    * A negative skewness indicates that there is a long tail in the distribution to the left of the mean. The mean and median will be less than the mode.
    * A skewness of zero indicates that there is no skew (that the distribution is perfectly symmetrical).
    * A skewness value close to zero indicates that the data are fairly symmetrical.
    * A skewness value greater than 1 or less than -1 indicates that the data are highly skewed.


- Kurtosis is a measure of whether the data are heavy-tailed or light-tailed relative to a normal distribution. That is, data sets with high kurtosis tend to have heavy tails, or outliers. Data sets with low kurtosis tend to have light tails, or lack of outliers. A uniform distribution would be the extreme case.
    * A kurtosis value greater than 3 indicates a heavy-tailed distribution. In this case, there are more outliers in the distribution.
    * A kurtosis value less than 3 indicates a light-tailed distribution. In this case, there are fewer outliers in the distribution.
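    * A kurtosis value of exactly 3 indicates that the distribution is mesokurtic. This means that the distribution is neither light-tailed nor heavy-tailed.
    * Note: the thresholds above refer to Pearson kurtosis (normal distribution = 3). pandas' `kurt()` and SciPy's `kurtosis()` report *excess* kurtosis by default (Fisher's definition, kurtosis − 3), so their results should be compared against 0 instead of 3.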


- QQ-plot is a probability plot, which is a graphical method for comparing two probability distributions by plotting their quantiles against each other. If the two distributions being compared are identical, the resulting points should fall approximately on a line.
    * If the two distributions are not identical, the resulting points will not fall on a line. The further away the points are from the line, the more different the two distributions are.
    * To check normality, the quantiles of the sample are plotted against the theoretical quantiles of a normal distribution. If the data are approximately normal, the points fall close to the straight reference line.
    * Systematic departures from the line are informative: points curving away at one end indicate skewness, while points bending away at both ends indicate tails that are heavier or lighter than those of a normal distribution.

