In [None]:
# IPython magic: render matplotlib figures inline in the notebook output (only valid under IPython/Jupyter, not plain Python)
%matplotlib inline

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
import warnings

# Suppress warnings to keep the output clean.
# NOTE(review): this is a blanket, process-wide filter with no category, so it
# also hides any warnings raised by the libraries used further down.
warnings.filterwarnings('ignore')

# # Big Data Analytics - Capstone Project
#
# ## Part 2: Python Analytics Tasks
#
# This notebook performs the following tasks on the provided dataset:
# 1.  Data Cleaning and Preprocessing
# 2.  Exploratory Data Analysis (EDA)
# 3.  Application of a Clustering Model (K-Means)
# 4.  Model Evaluation and Visualization
# 5.  Incorporating an Innovative Method (Elbow Method for optimal K)

def clean_data(df):
    """
    Cleans and preprocesses the raw dataset.

    Steps:
      1. Drop metadata columns that are not used by the analysis (columns
         that are absent are silently skipped).
      2. Rename REF_AREA / TIME_PERIOD / OBS_VALUE to
         Country / Year / Daily_Smokers_Percentage.
      3. Coerce Year and Daily_Smokers_Percentage to numeric; values that
         cannot be parsed become NaN.
      4. Drop every row where either of those two columns is NaN.
      5. Write the result to 'cleaned_data.csv' (without the index).

    The input DataFrame is not modified.

    Args:
        df (pd.DataFrame): The raw DataFrame.

    Returns:
        pd.DataFrame: The cleaned and preprocessed DataFrame.

    Raises:
        KeyError: If, after renaming, 'Year' or 'Daily_Smokers_Percentage'
            is not present in the data.
    """
    cols_to_drop = [
        'STRUCTURE', 'STRUCTURE_ID', 'STRUCTURE_NAME', 'ACTION', 'FREQ',
        'MEASURE', 'UNIT_MEASURE', 'METHODOLOGY', 'OBS_STATUS', 'UNIT_MULT',
        'DECIMALS',
    ]
    rename_dict = {
        'REF_AREA': 'Country',
        'TIME_PERIOD': 'Year',
        'OBS_VALUE': 'Daily_Smokers_Percentage',
    }
    numeric_cols = ['Year', 'Daily_Smokers_Percentage']

    # errors='ignore' already skips labels that do not exist, so no manual
    # pre-filtering of the column list is needed. drop/rename return new
    # frames, so the caller's DataFrame is left untouched.
    df = df.drop(columns=cols_to_drop, errors='ignore').rename(columns=rename_dict)

    missing = [col for col in numeric_cols if col not in df.columns]
    if missing:
        raise KeyError(f"Required column(s) missing after renaming: {missing}")

    # Coerce first, then drop once: originally-missing values and values that
    # failed to parse both end up as NaN, so a single dropna covers both.
    # Assigning before any row filtering also avoids writing into a slice.
    for col in numeric_cols:
        df[col] = pd.to_numeric(df[col], errors='coerce')
    df = df.dropna(subset=numeric_cols)

    df.to_csv('cleaned_data.csv', index=False)

    print(f"DataFrame shape after cleaning: {df.shape}")
    print("Data Cleaning Complete. Cleaned data saved to 'cleaned_data.csv'.")
    return df

def perform_eda(df):
    """
    Runs the exploratory analysis: prints summary statistics and shows two plots.

    The first figure is a line plot of Daily_Smokers_Percentage against Year
    with one hue per Country; the second is a box plot of
    Daily_Smokers_Percentage. Nothing is returned.

    Args:
        df (pd.DataFrame): Data with 'Country', 'Year' and
            'Daily_Smokers_Percentage' columns.
    """
    print("Descriptive Statistics:")
    summary = df.describe()
    print(summary)

    # Figure 1: per-country trend over time.
    _, trend_ax = plt.subplots(figsize=(15, 8))
    sns.lineplot(
        data=df,
        x='Year',
        y='Daily_Smokers_Percentage',
        hue='Country',
        marker='o',
        ax=trend_ax,
    )
    trend_ax.set_title('Daily Smokers Percentage Over Time by Country')
    trend_ax.set_xlabel('Year')
    trend_ax.set_ylabel('Daily Smokers Percentage')
    # Anchor the legend outside the axes, to the right of the plot area.
    trend_ax.legend(title='Country', bbox_to_anchor=(1.05, 1), loc='upper left')
    trend_ax.grid(True)
    plt.show()

    # Figure 2: overall distribution of the measured percentage.
    _, dist_ax = plt.subplots(figsize=(15, 8))
    sns.boxplot(data=df, x='Daily_Smokers_Percentage', ax=dist_ax)
    dist_ax.set_title('Distribution of Daily Smokers Percentage')
    dist_ax.set_xlabel('Daily Smokers Percentage')
    dist_ax.grid(True)
    plt.show()

    print("EDA Complete.")

def apply_model(df, n_clusters=3):
    """
    Applies K-Means clustering to group countries with similar smoking trends.

    The data is pivoted to one row per Country and one column per Year, gaps
    within each row are linearly interpolated (and extended to both ends),
    and the resulting matrix is standardized column-wise before clustering.

    When more than one country is present, an elbow plot of K-Means inertia
    for K = 2..min(10, number of countries) is shown. The plot is for visual
    inspection only; the final model always uses ``n_clusters``.

    On success the clustered data is also written to 'clustered_data.csv'.

    Args:
        df (pd.DataFrame): Cleaned data with 'Country', 'Year' and
            'Daily_Smokers_Percentage' columns.
        n_clusters (int): Number of clusters for the final K-Means model.
            Defaults to 3.

    Returns:
        pd.DataFrame: ``df`` with a per-country 'Cluster' column merged in,
        or ``df`` itself (unchanged) when there are fewer countries than
        ``n_clusters``.

    Raises:
        ValueError: If ``n_clusters`` is smaller than 1.
    """
    if n_clusters < 1:
        raise ValueError("n_clusters must be a positive integer.")

    df_pivot = df.pivot_table(index='Country', columns='Year', values='Daily_Smokers_Percentage')
    n_countries = df_pivot.shape[0]
    can_plot_elbow = n_countries > 1
    can_cluster = n_countries >= n_clusters

    # Nothing downstream would use the scaled features, so bail out before
    # interpolating/scaling (scaling an empty matrix would raise).
    if not (can_plot_elbow or can_cluster):
        print("Not enough data to perform clustering. Returning original DataFrame.")
        return df

    df_pivot_filled = df_pivot.apply(
        lambda row: row.interpolate(method='linear', limit_direction='both'), axis=1
    )
    scaled_features = StandardScaler().fit_transform(df_pivot_filled)

    if can_plot_elbow:
        # K cannot exceed the number of samples; cap the sweep at K = 10.
        k_values = range(2, min(11, n_countries + 1))
        inertias = [
            KMeans(n_clusters=k, random_state=42, n_init=10).fit(scaled_features).inertia_
            for k in k_values
        ]

        plt.figure(figsize=(10, 6))
        plt.plot(k_values, inertias, marker='o')
        plt.title('Elbow Method for Optimal K')
        plt.xlabel('Number of Clusters (K)')
        plt.ylabel('Inertia')
        plt.show()

    if not can_cluster:
        print("Not enough data to perform clustering. Returning original DataFrame.")
        return df

    kmeans = KMeans(n_clusters=n_clusters, random_state=42, n_init=10)
    clusters = kmeans.fit_predict(scaled_features)

    # One row per country, in the same order as the rows fed to K-Means.
    country_clusters = pd.DataFrame({'Country': df_pivot.index, 'Cluster': clusters})

    # Drop any 'Cluster' column left over from a previous run so the merge
    # does not produce 'Cluster_x' / 'Cluster_y'.
    df_clustered = df.drop(columns=['Cluster'], errors='ignore').merge(
        country_clusters, on='Country', how='left'
    )

    df_clustered.to_csv('clustered_data.csv', index=False)

    print(f"K-Means Clustering with K={n_clusters} applied. Countries grouped into clusters.")
    return df_clustered

def evaluate_model(df):
    """
    Evaluates the clustering model and visualizes the results.

    Rebuilds the Country x Year feature matrix (pivot, per-row linear
    interpolation, column-wise standardization), prints the silhouette score
    of the cluster labels stored in ``df['Cluster']``, and shows a line plot
    of Daily_Smokers_Percentage over Year with one hue per cluster.

    If ``df`` has no 'Cluster' column a message is printed and the function
    returns immediately. If the silhouette score is undefined for the data
    (a country without a label, or a number of distinct clusters outside
    2..n_countries-1), the score is skipped with a message and only the plot
    is produced.

    Args:
        df (pd.DataFrame): Clustered data with 'Country', 'Year',
            'Daily_Smokers_Percentage' and 'Cluster' columns.

    Returns:
        None
    """
    if 'Cluster' not in df.columns:
        print("No clustering performed. Cannot evaluate the model.")
        return

    df_pivot = df.pivot_table(index='Country', columns='Year', values='Daily_Smokers_Percentage')
    df_pivot_filled = df_pivot.apply(
        lambda row: row.interpolate(method='linear', limit_direction='both'), axis=1
    )

    # The feature rows follow the pivot's Country index, whereas the rows of
    # df are in file order. Look labels up by country name so that label i
    # really belongs to feature row i.
    labels = (
        df.drop_duplicates(subset=['Country'])
        .set_index('Country')['Cluster']
        .reindex(df_pivot_filled.index)
    )

    n_countries = len(labels)
    n_distinct = labels.nunique()
    # silhouette_score requires 2 <= number of labels <= n_samples - 1.
    if labels.isna().any() or not 2 <= n_distinct <= n_countries - 1:
        print(
            "Silhouette Score is undefined for "
            f"{n_distinct} cluster(s) over {n_countries} countries; skipping."
        )
    else:
        scaled_features = StandardScaler().fit_transform(df_pivot_filled)
        score = silhouette_score(scaled_features, labels.to_numpy())
        print(f"Silhouette Score for the clustering: {score:.2f}")

    plt.figure(figsize=(15, 8))
    sns.lineplot(data=df, x='Year', y='Daily_Smokers_Percentage', hue='Cluster', palette='viridis', marker='o')
    plt.title('Daily Smokers Percentage Trends by Cluster')
    plt.xlabel('Year')
    plt.ylabel('Daily Smokers Percentage')
    plt.legend(title='Cluster')
    plt.grid(True)
    plt.show()

    print("Model Evaluation Complete.")

# Main execution block: load the raw CSV, clean it, then run EDA, clustering
# and evaluation in sequence.
if __name__ == "__main__":
    file_path = 'OECD.ELS.HD,DSD_HEALTH_LVNG@DF_HEALTH_LVNG_TC,1.0+.A......csv'
    try:
        raw_df = pd.read_csv(file_path)
    except FileNotFoundError:
        print(f"Error: The file '{file_path}' was not found. Please ensure it's in the correct directory.")
        # `exit()` is a helper injected by the `site` module for interactive
        # sessions and reports success (status 0). Raise SystemExit directly
        # with a non-zero status so the failure is visible to the caller.
        raise SystemExit(1)

    cleaned_df = clean_data(raw_df)

    if not cleaned_df.empty:
        perform_eda(cleaned_df)
        clustered_df = apply_model(cleaned_df)
        evaluate_model(clustered_df)
        print("\nProject completed successfully!")
    else:
        print("Data cleaning resulted in an empty DataFrame. Cannot proceed with analysis.")