# World Population (EDA)

The following is a list of possible EDA steps for preparing data to train an ML model. This notebook covers some of them.

## Steps to do:
1. Imports
2. Load data
3. Know/understand data
  
  3.1. df.shape

  3.2. df.head(), df.tail(), df.sample(5)

  3.3. df.info()

  3.4. df.describe()

  3.5. df.select_dtypes(include/exclude="number"/"object"/"category")

  3.6. Feature analysis: Correlation, Mutual Information, Statistical tests.

4. Remove duplicates
5. Null handling
6. Skewness handling for numerical data
7. Outlier Handling
8. Filter data for analysis
9. Scale the numeric data
10. Encode the categorical data
11. Feature engineering
  
  11.1. Dimensionality reduction (PCA, tSNE, UMAP)
  
  11.2. Manual feature creation

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import RobustScaler


In [None]:
# Load the world population dataset from a CSV file hosted on GitHub
# (this cell needs network access to run).
df = pd.read_csv('https://raw.githubusercontent.com/AlexTheAnalyst/PandasYouTubeSeries/main/world_population.csv')

In [None]:
# (number of rows, number of columns) of the loaded frame.
df.shape

In [None]:
# Preview the first five rows.
df.head(5)

In [None]:
# Summary statistics, transposed so each described column becomes one row.
df.describe().T

In [None]:
# The ten rows with the largest "World Population Percentage" values.
df.sort_values(by="World Population Percentage", ascending=False).head(10)

In [None]:
# Split the column labels by dtype: numeric columns vs. everything else.
numeric_cols = df.select_dtypes(include='number').columns
non_numeric_cols = df.select_dtypes(exclude='number').columns

# Build one frame per group so the numeric part can be imputed on its own
# later; the non-numeric part is kept aside unchanged.
df_numeric = df.drop(columns=non_numeric_cols)
df_cat = df.drop(columns=numeric_cols)

In [None]:
# Pairwise correlation between the numeric columns.
df[numeric_cols].corr()

In [None]:
# Set the default figure size BEFORE drawing. rcParams are only consulted when
# a figure is created, so assigning them after sns.heatmap() (as before) left
# this heatmap at the previous default size and only affected later figures.
plt.rcParams['figure.figsize'] = (20, 7)

# Annotated heatmap of the correlations between the numeric columns.
sns.heatmap(df[numeric_cols].corr(), annot=True)
plt.show()

## Remove Duplicates

In [None]:
# Report how many rows are exact duplicates of an earlier row. A bare
# expression in the middle of a cell is evaluated and thrown away, so the
# count is printed explicitly.
print(f"Duplicate rows: {df.duplicated().sum()}")

# drop_duplicates() returns a NEW DataFrame; the result has to be assigned
# back, otherwise df is left untouched and the duplicates are never removed.
df = df.drop_duplicates()
df

## Null handling

In [None]:
# Number of missing values per column. Printed explicitly because a bare
# expression that is not the last statement of a cell is silently discarded.
print(df.isnull().sum())

#-----------------------------------------
# Option 1: drop every row that contains at least one missing value.
df1 = df.dropna()


#-----------------------------------------
# Option 2: impute the missing numeric values with KNNImputer (default settings).
from sklearn.impute import KNNImputer
df_numeric2 = KNNImputer().fit_transform(df_numeric)

# fit_transform returns a bare array, so the column labels are restored here.
# The original row index is restored as well: pd.concat(axis=1) aligns rows by
# index label, and a fresh 0..n-1 index would misalign (or add NaN rows)
# whenever df_numeric / df_cat do not carry a default RangeIndex.
df_numeric2 = pd.DataFrame(df_numeric2, columns=df_numeric.columns, index=df_numeric.index)
df_non_null = pd.concat((df_numeric2, df_cat), axis=1)

## Skewness handling

In [None]:
def visualize_numerical_distributions(df, exclude_columns='id'):
    """Plot a density histogram with a KDE overlay for each numeric column of *df*.

    The plots are arranged in a grid that is three subplots wide; unused grid
    cells are removed before the figure is shown.

    Args:
        df: DataFrame whose numeric columns are plotted. The columns are taken
            from this frame itself, not from any notebook-level variable.
        exclude_columns: A single column name, an iterable of column names, or
            None. Listed columns are left out of the plot; names that do not
            exist in *df* are ignored.

    Returns:
        None. The figure is displayed with ``plt.show()``.
    """
    # Normalise the exclusion argument to a set of labels.
    if exclude_columns is None:
        excluded = set()
    elif isinstance(exclude_columns, str):
        excluded = {exclude_columns}
    else:
        excluded = set(exclude_columns)

    columns = [
        col for col in df.select_dtypes(include='number').columns
        if col not in excluded
    ]

    # Nothing to draw: plt.subplots() cannot build a grid with zero rows.
    if not columns:
        print("No numerical columns to plot.")
        return

    # Set up the figure for multiple subplots
    num_cols = 3  # Number of columns for the subplot grid
    num_rows = (len(columns) + num_cols - 1) // num_cols  # Ceiling division

    # squeeze=False always yields a 2-D array of axes, so flatten() is safe
    # regardless of the grid shape.
    fig, axes = plt.subplots(
        num_rows, num_cols, figsize=(18, 5 * num_rows), squeeze=False
    )
    fig.suptitle('Distribution of Numerical Features', fontsize=16)
    axes = axes.flatten()

    # One histogram (with KDE) per numeric column
    for ax, col in zip(axes, columns):
        sns.histplot(df[col], kde=True, ax=ax, color="skyblue", element="step", stat="density")
        ax.set_title(f'Distribution of {col}', fontsize=14)
        ax.set_xlabel(col, fontsize=12)
        ax.set_ylabel('Density', fontsize=12)

    # Remove the grid cells that did not receive a plot
    for ax in axes[len(columns):]:
        fig.delaxes(ax)

    # Leave room at the top for the main title
    plt.tight_layout(rect=[0, 0.03, 1, 0.95])
    plt.show()
    
    
# Plot the distributions of the untransformed data.
visualize_numerical_distributions(df)

In [None]:
# Skewness of the "Growth Rate" column.
df["Growth Rate"].skew()

In [None]:
from scipy import stats

def _write_back(frame, col, row_mask, values):
    """Store float *values* in ``frame[col]`` at the rows selected by *row_mask*."""
    # Transformed values are floats; cast first so writing into an integer
    # column does not rely on implicit upcasting.
    frame[col] = frame[col].astype(float)
    frame.loc[row_mask, col] = np.asarray(values, dtype=float)


def robust_skewness_handler(df, threshold=2.0):
    """Reduce the skewness of the numeric columns of *df*.

    Each numeric column whose absolute skewness (computed on its non-null
    values) exceeds *threshold* is transformed as follows:

    * If more than 10% of the non-null values are exactly zero, ``log1p`` is
      applied to the non-zero values only; zeros are left as they are.
    * Otherwise a Box-Cox transform is fitted. Box-Cox needs strictly positive
      input, so a column whose minimum is <= 0 is first shifted by
      ``abs(min) + 1``.
    * If Box-Cox raises, a fallback is applied to the same (shifted) data:
      ``log1p`` or square root for right-skewed data, squaring for
      left-skewed data.

    Missing values stay missing: transformed values are written back only to
    the rows that held a value, so columns containing NaN are handled like any
    other column.

    Args:
        df: Input DataFrame. It is not modified.
        threshold: Absolute skewness above which a column is transformed.

    Returns:
        A tuple ``(transformed_df, transformation_dict)``. ``transformed_df``
        is a copy of *df* with the transformed columns replaced.
        ``transformation_dict`` maps each transformed column to a
        ``(method, detail)`` tuple, where ``detail`` is the fitted lambda for
        Box-Cox, ``'zero-inflated'`` for the zero-inflated branch and ``None``
        for the fallbacks. The shift applied before Box-Cox is not recorded.
    """
    transformed_df = df.copy()
    transformation_dict = {}

    for col in df.select_dtypes(include=[np.number]).columns:
        present = df[col].notna().to_numpy()
        data = df[col].dropna()
        skewness = data.skew()

        # A NaN skewness never exceeds the threshold, so such columns are
        # skipped here as well.
        if not abs(skewness) > threshold:
            continue

        print(f"Handling skewness for {col} (skewness: {skewness:.2f})")

        # Handle zero-inflated or near-zero-inflated data
        if (data == 0).sum() / len(data) > 0.1:  # If more than 10% zeros
            non_zero = data[data != 0]
            if len(non_zero) > 0:
                # Positional mask over the full column, so the write works
                # even when NaN rows were dropped from `data`.
                non_zero_rows = present & (df[col] != 0).to_numpy()
                _write_back(transformed_df, col, non_zero_rows, np.log1p(non_zero))
                transformation_dict[col] = ('log1p', 'zero-inflated')
                print(f"  Applied Log1p to non-zero values for {col}")
            continue

        # Box-Cox requires strictly positive data: shift when necessary.
        min_val = data.min()
        if min_val <= 0:
            shifted_data = data + (abs(min_val) + 1)
        else:
            shifted_data = data

        try:
            transformed_data, lambda_param = stats.boxcox(
                shifted_data.to_numpy(dtype=float)
            )
        except (ValueError, ArithmeticError):
            # Box-Cox failed; fall back to a simple transform. The shifted
            # (strictly positive) data is used so that log, sqrt and the
            # range ratio are all well defined.
            if skewness > 0:  # Right-skewed
                if shifted_data.max() / shifted_data.min() > 1000:  # Very large range
                    _write_back(transformed_df, col, present, np.log1p(shifted_data))
                    transformation_dict[col] = ('log1p', None)
                    print(f"  Applied Log1p to {col}")
                else:
                    _write_back(transformed_df, col, present, np.sqrt(shifted_data))
                    transformation_dict[col] = ('sqrt', None)
                    print(f"  Applied Square Root to {col}")
            else:  # Left-skewed
                _write_back(transformed_df, col, present, shifted_data ** 2)
                transformation_dict[col] = ('square', None)
                print(f"  Applied Square to {col}")
        else:
            # Write only to the non-null rows: the Box-Cox output has one
            # value per non-null input, not one per row of the frame.
            _write_back(transformed_df, col, present, transformed_data)
            transformation_dict[col] = ('box-cox', lambda_param)
            print(f"  Applied Box-Cox to {col} (lambda: {lambda_param:.2f})")

    return transformed_df, transformation_dict

# Example usage: transform the skewed columns of the dataset.
# `transformations` records which transform was applied to which column.
transformed_df, transformations = robust_skewness_handler(df)

In [None]:
# Plot the distributions again, this time for the skewness-handled frame.
visualize_numerical_distributions(transformed_df)

## Outlier handling

In [None]:
# Box plot of the frame's columns on a 10x5 figure.
plt.figure(figsize=(10,5))
df.boxplot()
# Rotate the x tick labels so they do not overlap.
plt.xticks(rotation=90)
plt.show()

In [None]:
def calculate_outliers_percentage(df):
    """Print the percentage of IQR outliers for each numeric column of *df*.

    A value counts as an outlier when it lies below ``Q1 - 1.5 * IQR`` or
    above ``Q3 + 1.5 * IQR``, where Q1 and Q3 are the column's 25th and 75th
    percentiles. The percentage is taken relative to the total number of rows
    in *df*.

    Args:
        df: DataFrame to inspect. It is not modified.

    Returns:
        None. One line per numeric column is printed.
    """
    total_rows = len(df)

    for column in df.select_dtypes(include=['number']).columns:
        values = df[column]
        q1 = values.quantile(0.25)
        q3 = values.quantile(0.75)
        iqr = q3 - q1
        lower_bound = q1 - 1.5 * iqr
        upper_bound = q3 + 1.5 * iqr

        # Count directly on the boolean mask instead of materialising a
        # filtered copy of the whole frame for every column.
        outlier_count = int(((values < lower_bound) | (values > upper_bound)).sum())

        # An empty frame has no outliers; avoid dividing by zero.
        percentage = (outlier_count / total_rows) * 100 if total_rows else 0.0
        print(f"Percentage of outliers in {column}: {percentage:.2f}%")

# Example usage: report the outlier percentage of every numeric column.
calculate_outliers_percentage(df)

In [None]:
def handle_outliers(df):
    """Cap the IQR outliers of every numeric column of *df*.

    For each numeric column, values below ``Q1 - 1.5 * IQR`` are raised to
    that lower bound and values above ``Q3 + 1.5 * IQR`` are lowered to that
    upper bound. Non-numeric columns are left untouched.

    The numeric columns are taken from the frame that is passed in, so the
    function works on any DataFrame and does not depend on a notebook-level
    column list.

    Args:
        df: DataFrame to cap. It is modified in place.

    Returns:
        The same DataFrame object, with its numeric columns capped.
    """
    for column in df.select_dtypes(include=['number']).columns:
        q1 = df[column].quantile(0.25)
        q3 = df[column].quantile(0.75)
        iqr = q3 - q1
        lower_bound = q1 - 1.5 * iqr
        upper_bound = q3 + 1.5 * iqr

        # Capping
        df[column] = df[column].clip(lower=lower_bound, upper=upper_bound)

    return df

# Cap the outliers of the dataset. Only `df` is processed here; no separate
# training/test split is created in the cells shown.
# NOTE: this overwrites `df`, so every later cell works on capped values.
df = handle_outliers(df)

In [None]:
# Report the outlier percentages again, now on the capped frame.
calculate_outliers_percentage(df)

In [None]:
# Violin plot of the (capped) frame.
sns.violinplot(df)
# Rotate the x tick labels so they do not overlap.
plt.xticks(rotation=90)
plt.show()

In [None]:
# Per-continent summary of the '2022 Population' column: total, mean, median
# and standard deviation, with descriptive column names.
# NOTE(review): `df` was overwritten by handle_outliers() in an earlier cell,
# so these figures are computed from capped population values - confirm that
# this is intended.
population_agg = (
    df.groupby('Continent')['2022 Population']
    .agg(['sum', 'mean', 'median', 'std'])
    .reset_index()
    .rename(columns={
        'sum': 'Total Population',
        'mean': 'Mean Population',
        'median': 'Median Population',
        'std': 'Population Std Dev',
    })
)

# Show the table, largest total population first
population_agg.sort_values(by='Total Population', ascending=False)