In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input tree and print the full path of every file in it.
for root, _subdirs, files in os.walk('/kaggle/input'):
    full_paths = [os.path.join(root, name) for name in files]
    for path in full_paths:
        print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

<div style="
    background-color: #0E8554; 
    color: white; 
    padding: 15px; 
    border-radius: 10px; 
    text-align: center; 
    font-size: 28px; 
    font-weight: bold;
">
    📊 Medical Insurance Cost Data Analysis
</div>


![Medical field ](https://img.freepik.com/premium-photo/healthcare-medical-insurance-business-virtual-graph-data-growth-with-medical-financial_103164-1162.jpg?w=2000)

### - This notebook performs an **exploratory data analysis (EDA)** on the Medical Insurance Dataset.  
  
### - The goal is to understand the **factors affecting medical insurance costs**, such as:
   - Age
   - Sex
   - BMI (Body Mass Index)
   - Smoking habits
   - Number of children
   - Region
### ------------------------
### ✨ Executive Summary
   - Smokers pay **much higher charges** than non-smokers.  
   - Charges **increase with age** (especially after 40).  
   - Higher BMI (obesity) is associated with **increased costs**.  
   - Region has a **minor effect** compared to smoking and age.  
   - The top 10% spenders are **mostly smokers**.  
 
### 👉 These insights are useful for **insurance companies** (for premium setting) and **policy makers** (for health awareness campaigns).  


<div style="
    background-color: #0E8554; 
    color: white; 
    padding: 15px; 
    border-radius: 10px; 
    text-align: center; 
    font-size: 28px; 
    font-weight: bold;
">
    1. Import Libraries and Load Dataset
    
</div>


In [None]:
# We start by importing the necessary Python libraries for data analysis.

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.stats as stats
import statsmodels.api as sm
from statsmodels.formula.api import ols

# Display / plotting defaults: show up to 200 DataFrame columns, white-grid seaborn style
pd.set_option("display.max_columns", 200)
sns.set(style="whitegrid")

# Read the insurance CSV from the Kaggle input directory
df = pd.read_csv("/kaggle/input/medical-insurance-cost-dataset/Medical-Insurance.csv")

# Report (rows, columns), then preview the first ten rows as the cell output
print("Dataset shape:", df.shape)
df.head(10)


<div style="
    background-color: #0E8554; 
    color: white; 
    padding: 15px; 
    border-radius: 10px; 
    text-align: center; 
    font-size: 28px; 
    font-weight: bold;
">
    2. Basic Data Exploration
</div>


### Let's check:  
 - Info about the dataset (columns, datatypes)  
 - Missing values  
 - Summary statistics for numerical columns  
 - Value counts for categorical columns 

In [None]:
# Print a concise summary of the frame: column names, dtypes and non-null counts
print("\n--- Info ---")
df.info()

In [None]:
# Count the missing (NaN) entries in each column
print("\n--- Missing values per column ---")
df.isna().sum()

In [None]:
# Descriptive statistics (count, mean, std, min, quartiles, max) for the numeric columns
print("\n--- Numeric summary ---")
df.describe()

In [None]:
# Re-read the CSV, treating every line as data (header=None) and supplying the
# column names explicitly. This replaces the frame loaded earlier.
# NOTE(review): this assumes the file has no header row. If it does have one,
# that header line is loaded here as an ordinary data row — confirm against the file.
column_names = ["age", "sex", "bmi", "children", "smoker", "region", "charges"]
df = pd.read_csv(r"/kaggle/input/medical-insurance-cost-dataset/Medical-Insurance.csv", header=None, names=column_names)

In [None]:
# Preview the first 10 rows
df.head(10)

In [None]:
# Show the column labels of the frame
df.columns

In [None]:
# Frequency table for each categorical / discrete column that is present in the frame
print("\n--- Categorical counts ---")
present_columns = [name for name in ("sex", "smoker", "region", "children") if name in df.columns]
for name in present_columns:
    print(f"\n{name} value counts:")
    counts = df[name].value_counts()
    print(counts)

In [None]:
# Mean charges by smoking status.
# A notebook cell only renders its LAST expression automatically, so this
# intermediate result is shown explicitly with display() (provided by IPython).
display(df.groupby('smoker')['charges'].mean())

# Median charges by sex and smoking status (rendered as the cell output)
df.groupby(['sex','smoker'])['charges'].median()

<div style="
    background-color: #0E8554; 
    color: white; 
    padding: 15px; 
    border-radius: 10px; 
    text-align: center; 
    font-size: 28px; 
    font-weight: bold;
">
    3. Data Transformation & Encoding
</div>

#### We create new variables to make analysis more meaningful:
 - **Age Groups**: 18–25, 26–35, …  
 - **BMI Categories**: Underweight, Normal, Overweight, Obese  
 - **BMI × Smoker Interaction**: helps measure combined effect  

In [None]:
# Age groups
# pd.cut uses right-closed bins by default, so these edges produce
# (17, 25], (25, 35], (35, 45], (45, 55], (55, 65] — matching the labels below.
bins = [17, 25, 35, 45, 55, 65]
labels = ["18-25", "26-35", "36-45", "46-55", "56-65"]
# Coerce age to numeric (unparseable entries become NaN and receive no group),
# then bin it into the labelled ranges so age_group holds a category rather
# than a copy of the numeric age.
df["age_group"] = pd.cut(pd.to_numeric(df["age"], errors="coerce"), bins=bins, labels=labels)

# BMI categories
def bmi_category(bmi):
    """Map a BMI value to a weight-status category.

    Thresholds (lower bound inclusive):
        bmi < 18.5        -> "Underweight"
        18.5 <= bmi < 25  -> "Normal"
        25 <= bmi < 30    -> "Overweight"
        bmi >= 30         -> "Obese"

    Parameters
    ----------
    bmi : float or None
        Body Mass Index value; may be missing (None / NaN).

    Returns
    -------
    str or float
        The category name, or ``np.nan`` when ``bmi`` is missing.
    """
    # NaN compares False against every threshold, which would otherwise fall
    # through to the final branch and mislabel a missing BMI as "Obese".
    if pd.isna(bmi):
        return np.nan
    if bmi < 18.5:
        return "Underweight"
    elif bmi < 25:
        return "Normal"
    elif bmi < 30:
        return "Overweight"
    else:
        return "Obese"

# Label every row with its BMI category
df["bmi_category"] = df["bmi"].map(bmi_category)

# BMI × Smoker interaction: the row's BMI for smokers, 0 for everyone else
smoker_flag = df["smoker"].eq("yes").astype(int)
df["bmi_smoker_interaction"] = df["bmi"] * smoker_flag

In [None]:
# Preview the first 10 rows, including the newly derived columns
df.head(10)

In [None]:
# Preview the last 10 rows
df.tail(10)

<div style="
    background-color: #0E8554; 
    color: white; 
    padding: 15px; 
    border-radius: 10px; 
    text-align: center; 
    font-size: 28px; 
    font-weight: bold;
">
    4. Data Visualization
</div>


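#### In this section we explore relationships between variables using plots:
 - Distribution of charges  
 - Charges by smoker & sex  
 - BMI vs Charges (smokers highlighted)  
 - Pairwise relationships of the numerical variables (including Age vs Charges)  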


In [None]:
# Histogram of charges (30 bins) with a KDE curve overlaid
fig, ax = plt.subplots(figsize=(8, 4))
sns.histplot(df["charges"], bins=30, kde=True, color="blue", ax=ax)
ax.set_title("Distribution of Charges")
ax.set_xlabel("Charges")
ax.set_ylabel("Count")
plt.show()

In [None]:
# Split violin plot: charge distribution per smoking status, one half per sex
fig, ax = plt.subplots(figsize=(10, 6))
sns.violinplot(data=df, x="smoker", y="charges", hue="sex", split=True, palette="Set2", ax=ax)
ax.set_title("Charges by Smoker & Sex")
plt.show()

In [None]:
 # BMI vs charges
# Scatter of BMI against charges, coloured by smoking status
fig, ax = plt.subplots(figsize=(8, 5))
sns.scatterplot(data=df, x="bmi", y="charges", hue="smoker", palette="coolwarm", ax=ax)
ax.set_title("BMI vs Charges (Smoker Highlighted)")
plt.show()

In [None]:
# Pairwise scatter plots of the numerical variables, with KDE curves on the diagonal
numeric_columns = ["age", "bmi", "children", "charges"]
sns.pairplot(df[numeric_columns], diag_kind="kde")
# y=1.02 positions the title just above the top edge of the figure
plt.suptitle("Pairwise Relationships of Numerical Variables", y=1.02)
plt.show()

In [None]:
# Preview the first five rows
df.head()

<div style="
    background-color: #0E8554; 
    color: white; 
    padding: 15px; 
    border-radius: 10px; 
    text-align: center; 
    font-size: 28px; 
    font-weight: bold;
">
    5. Exploratory Data Analysis
</div>


In [None]:
# Finer-grained histogram of charges (40 bins) with a KDE curve overlaid
fig, ax = plt.subplots(figsize=(8, 5))
sns.histplot(df["charges"], kde=True, bins=40, ax=ax)
ax.set_title("Distribution of Medical Charges")
plt.show()

In [None]:
# Box plot comparing the charge distribution of smokers and non-smokers
fig, ax = plt.subplots(figsize=(7, 5))
sns.boxplot(data=df, x="smoker", y="charges", ax=ax)
ax.set_title("Charges by Smoker Status")
plt.show()

## 🔹 Charges by Smoker Status

This **boxplot** compares medical insurance **charges** between **smokers** and **non-smokers**

> Insights:
> - Smokers have significantly **higher medical charges** compared to non-smokers.
> - This indicates that **smoking is a major factor** affecting medical insurance costs.


In [None]:
# Mean charges for every (BMI category, smoker) pair, as a flat table
bmi_smoker_charges = df.groupby(['bmi_category', 'smoker'], as_index=False)['charges'].mean()

# Grouped bar chart: one bar per smoking status within each BMI category
ax = sns.barplot(data=bmi_smoker_charges, x='bmi_category', y='charges', hue='smoker', palette='magma')
ax.set_title("Average Charges by BMI Category & Smoker")
ax.set_xlabel("BMI Category")
ax.set_ylabel("Average Charges")
plt.show()


In [None]:
from scipy import stats

# Split the charges by smoking status, discarding missing values
is_smoker = df["smoker"] == "yes"
is_non_smoker = df["smoker"] == "no"
charges_smokers = df.loc[is_smoker, "charges"].dropna()
charges_non = df.loc[is_non_smoker, "charges"].dropna()

print("Smoker count:", len(charges_smokers))
print("Non-smoker count:", len(charges_non))

# The test is only run when each group has at least two observations
if len(charges_smokers) <= 1 or len(charges_non) <= 1:
    print("Not enough data for t-test.")
else:
    # equal_var=False selects Welch's t-test, which does not assume equal group variances
    t_stat, p_val = stats.ttest_ind(charges_smokers, charges_non, equal_var=False)
    print("T-statistic:", t_stat, "P-value:", p_val)

    verdict = (
        "The difference is statistically significant at 5% level."
        if p_val < 0.05
        else "Not statistically significant."
    )
    print(verdict)


In [None]:
# Box plot of charges per BMI category, ordered from lowest to highest BMI band
bmi_order = ["Underweight", "Normal", "Overweight", "Obese"]
fig, ax = plt.subplots(figsize=(7, 5))
sns.boxplot(data=df, x="bmi_category", y="charges", order=bmi_order, ax=ax)
ax.set_title("Charges by BMI Category")
plt.show()

## 🔹 Charges by BMI Category

This **boxplot** visualizes the distribution of medical insurance **charges** across different **BMI categories**:

- **X-axis**: BMI categories (`Underweight`, `Normal`, `Overweight`, `Obese`).  
- **Y-axis**: Medical insurance charges.  
- **Boxplot** highlights:
  - Median charges (line inside the box)
  - Interquartile range (box)
  - Outliers (points outside the whiskers)

> Insights:
> - Individuals in higher BMI categories (Overweight and Obese) tend to have **higher medical charges**.
> - This shows a clear trend where BMI positively correlates with insurance costs, indicating potential health risk impact.


In [None]:
# Bar chart of the mean charge in each region
fig, ax = plt.subplots(figsize=(7, 5))
sns.barplot(data=df, x="region", y="charges", estimator=np.mean, ax=ax)
ax.set_title("Average Charges by Region")
plt.show()



In [None]:
# Plain-text preview of the first five rows
print(df.head())


In [None]:
# List the distinct values currently stored in the age_group column
print(df['age_group'].unique())

In [None]:
# Show the first five values of the charges column (and its dtype)
print(df['charges'].head())


In [None]:
# Charges vs Age Group

# Clean 'age' and 'charges':
# convert both to numbers (invalid entries become NaN), then remove the rows
# where either value is missing. assign() builds a new frame, so no column is
# written into the result of an earlier dropna().
df = df.assign(
    age=pd.to_numeric(df['age'], errors='coerce'),
    charges=pd.to_numeric(df['charges'], errors='coerce'),
).dropna(subset=['age', 'charges'])

# Create age groups.
# With right=False every bin is [lower, upper), so each upper edge is one past
# the last age named in its label: [18, 26) -> '18-25', [26, 36) -> '26-35', ...
# The final edge is infinite so that '66+' really is open-ended.
bins = [0, 18, 26, 36, 46, 56, 66, np.inf]
labels = ['0-17', '18-25', '26-35', '36-45', '46-55', '56-65', '66+']
df = df.assign(age_group=pd.cut(df['age'], bins=bins, labels=labels, right=False))

# Plot Average Charges by Age Group
plt.figure(figsize=(10,5))
sns.barplot(
    x='age_group',
    y='charges',
    data=df,
    estimator=np.mean,   # calculate average charges
    errorbar=None         # remove confidence interval lines
)
plt.title("Average Charges by Age Group")
plt.xlabel("Age Group")
plt.ylabel("Average Charges")
plt.show()

In [None]:
# Number of rows falling into each age group
print(df["age_group"].value_counts())

In [None]:
# Scatter of age against charges, coloured by smoking status
fig, ax = plt.subplots(figsize=(8, 5))
sns.scatterplot(data=df, x="age", y="charges", hue="smoker", ax=ax)
ax.set_title("Age vs Charges (Smokers vs Non-Smokers)")
plt.show()

In [None]:
# Pairwise correlations between the numeric columns only
corr_matrix = df.corr(numeric_only=True)

# Annotated heatmap of the correlation matrix, values shown to two decimals
fig, ax = plt.subplots(figsize=(7, 5))
sns.heatmap(corr_matrix, annot=True, fmt=".2f", cmap="coolwarm", ax=ax)
ax.set_title("Correlation Heatmap")
plt.show()

## 🔹 Correlation Heatmap

A **correlation heatmap** helps visualize the relationships between numerical features in the dataset.  
- Values range from **-1 to 1**, where:
  - `1` indicates a perfect positive correlation,
  - `-1` indicates a perfect negative correlation,
  - `0` indicates no correlation.  
- This heatmap uses the **"coolwarm"** color map to distinguish positive and negative correlations clearly.  
- **Annotations (`annot=True`)** display the correlation values for easy interpretation.  

> From this visualization, we can identify which features are strongly related to **medical charges** and to each other.


<div style="
    background-color: #0E8554; 
    color: white; 
    padding: 15px; 
    border-radius: 10px; 
    text-align: center; 
    font-size: 28px; 
    font-weight: bold;
">
    📈 Insights & Recommendations
</div>


### 🔹 Key Insights
1. **Smoking Status**
   - Smokers incur significantly higher medical charges compared to non-smokers.
   - The difference in charges is statistically significant, highlighting the health risk and financial impact of smoking.

2. **Age Factor**
   - Older individuals generally have higher medical charges.
   - Charges tend to increase steadily across age groups, reflecting higher health risks with age.

3. **BMI Influence**
   - Individuals in higher BMI categories (Overweight & Obese) tend to have higher medical charges.
   - The combination of high BMI and smoking leads to the highest charges.

4. **Sex Differences**
   - Minimal difference in charges between males and females when controlling for other factors like age, BMI, and smoking.
   
5. **Regional Variations**
   - Certain regions show slightly higher average charges, indicating possible regional healthcare cost differences.

### 🔹 Recommendations
1. **Preventive Health Programs**
   - Promote smoking cessation programs to reduce health risks and insurance costs.
   - Encourage healthy lifestyle interventions to manage BMI and reduce obesity-related costs.

2. **Age-Specific Insurance Plans**
   - Design insurance plans with age-based pricing to fairly reflect risk and encourage early preventive care.

3. **Targeted Awareness Campaigns**
   - Focus on regions with higher average charges to educate individuals about healthy habits and preventive measures.

4. **Regular Data Monitoring**
   - Continuously monitor medical charges across demographics to identify trends and implement data-driven health policies.

---

> 💡 These insights can help insurance providers, healthcare planners, and policymakers make informed decisions and reduce overall medical costs while improving public health outcomes.


<div style="
    background-color: #0E8554; 
    color: white; 
    padding: 15px; 
    border-radius: 10px; 
    text-align: center; 
    font-size: 28px; 
    font-weight: bold;
">
    🏁 Conclusion
</div>


## 🏁 Conclusion

- This analysis provided a detailed exploration of **medical insurance charges** and the factors that influence them, such as **age, BMI, smoking status, sex, and region**.  
- Key findings indicate that **smoking and higher BMI** significantly increase medical charges, while age also shows a clear positive trend with costs.  
- Regional differences and sex showed minor variations, but overall, lifestyle factors like **smoking and obesity** are the strongest drivers of insurance costs.  
- These insights can guide **insurance providers, healthcare policymakers, and individuals** in making data-driven decisions for better health outcomes and cost management.  



<div style="
    background-color: #0E8554; 
    color: white; 
    padding: 15px; 
    border-radius: 10px; 
    text-align: center; 
    font-size: 28px; 
    font-weight: bold;
">
    ✨ Thank You

</div>



- Thank you for visiting this project!  
- I hope this analysis provides valuable insights into **medical insurance cost trends**.  
- Feedback, suggestions, and collaborations are always welcome.  

> 💡 Feel free to explore the notebook and dataset for deeper analysis and visualization enhancements.