<a href="https://colab.research.google.com/github/shihab005963/GDP_per_capita/blob/main/GDP_per_capita.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [10]:
# Mount Google Drive at /content/drive so the dataset path below resolves.
# This must run before the CSV is read.
from google.colab import drive
drive.mount('/content/drive')

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
# `files` is used by main() to download the processed CSV.
from google.colab import files

# Load the dataset from Google Drive into the module-level DataFrame `df`,
# which main() reads and rebinds via `global df`.
file_path = '/content/drive/MyDrive/ColabDatasets/data.csv'  # Adjust the path if needed
df = pd.read_csv(file_path)


def preprocessing(df):
    """
    Clean the dataset and print an exploratory summary.

    Rows containing any missing value and exact duplicate rows are removed
    from ``df`` in place, so the caller's DataFrame is modified. The function
    then prints the frame's info report, its descriptive statistics and, when
    numeric data is present, the pairwise correlation matrix.

    Args:
        df: pandas DataFrame to clean; it is mutated in place.

    Returns:
        The same DataFrame object that was passed in, after cleaning.
    """
    df.dropna(inplace=True)
    df.drop_duplicates(inplace=True)

    print("Dataset Info:")
    # DataFrame.info() writes its report itself and returns None, so wrapping
    # it in print() would append a stray "None" line to the output.
    df.info()
    print("\nDataset Description:")
    print(df.describe())

    # Correlation is only defined for numeric columns, so drop the rest first.
    numeric_df = df.select_dtypes(include=['number'])

    # `.empty` is True when there are no numeric columns or no rows left.
    if numeric_df.empty:
        print("No numeric data available for correlation matrix.")
    else:
        print("\nCorrelation Matrix:")
        print(numeric_df.corr())

    return df


def plot_relational_plot(df):
    """
    Save a scatter plot of the first two numeric columns of ``df``.

    The first numeric column is drawn on the x-axis and the second on the
    y-axis; the figure is written to 'relational_plot.png' (a relative path).
    If ``df`` has fewer than two numeric columns a message is printed and
    nothing is drawn.

    Args:
        df: pandas DataFrame holding the data to plot.

    Returns:
        None.
    """
    numeric_df = df.select_dtypes(include=['number'])  # Use only numeric columns
    # Validate before creating the figure: pyplot keeps a figure alive until
    # it is closed, so returning early after plt.subplots() would leak one.
    if numeric_df.shape[1] < 2:
        print("Not enough numeric columns for relational plot.")
        return

    x_col, y_col = numeric_df.columns[0], numeric_df.columns[1]
    fig, ax = plt.subplots(figsize=(8, 6))
    try:
        sns.scatterplot(x=x_col, y=y_col, data=df, ax=ax)
        ax.set_title(f'Relational Plot: {x_col} vs {y_col}')
        ax.set_xlabel(x_col)
        ax.set_ylabel(y_col)
        fig.savefig('relational_plot.png')
    finally:
        # Release the figure even if plotting or saving fails.
        plt.close(fig)


def plot_categorical_plot(df):
    """
    Save a bar plot of the first two numeric columns of ``df``.

    The first numeric column is used for the x-axis and the second for the
    y-axis; the figure is written to 'categorical_plot.png' (a relative
    path). If ``df`` has fewer than two numeric columns a message is printed
    and nothing is drawn.

    Args:
        df: pandas DataFrame holding the data to plot.

    Returns:
        None.
    """
    numeric_df = df.select_dtypes(include=['number'])  # Use only numeric columns
    # Validate before creating the figure: pyplot keeps a figure alive until
    # it is closed, so returning early after plt.subplots() would leak one.
    if numeric_df.shape[1] < 2:
        print("Not enough numeric columns for categorical plot.")
        return

    x_col, y_col = numeric_df.columns[0], numeric_df.columns[1]
    fig, ax = plt.subplots(figsize=(8, 6))
    try:
        sns.barplot(x=x_col, y=y_col, data=df, ax=ax)
        ax.set_title(f'Categorical Plot: {x_col} vs {y_col}')
        ax.set_xlabel(x_col)
        ax.set_ylabel(y_col)
        fig.savefig('categorical_plot.png')
    finally:
        # Release the figure even if plotting or saving fails.
        plt.close(fig)


def plot_statistical_plot(df):
    """
    Save an annotated correlation heatmap of the numeric columns of ``df``.

    The pairwise correlation matrix of the numeric columns is drawn with the
    'coolwarm' colour map and written to 'statistical_plot.png' (a relative
    path). If there is no numeric data a message is printed and nothing is
    drawn.

    Args:
        df: pandas DataFrame holding the data to plot.

    Returns:
        None.
    """
    numeric_df = df.select_dtypes(include=['number'])  # Use only numeric columns
    # Validate before creating the figure: pyplot keeps a figure alive until
    # it is closed, so returning early after plt.subplots() would leak one.
    if numeric_df.empty:
        print("No numeric data available for heatmap.")
        return

    fig, ax = plt.subplots(figsize=(8, 6))
    try:
        sns.heatmap(numeric_df.corr(), annot=True, cmap='coolwarm', ax=ax)
        ax.set_title('Statistical Plot: Correlation Heatmap')
        fig.savefig('statistical_plot.png')
    finally:
        # Release the figure even if plotting or saving fails.
        plt.close(fig)


def statistical_analysis(df, col: str):
    """
    Return the four statistical moments of one column.

    Args:
        df: pandas DataFrame containing the column.
        col: Name of the column to summarise.

    Returns:
        A tuple ``(mean, standard deviation, skewness, excess kurtosis)``
        computed with the pandas Series methods of the same names.
    """
    series = df[col]
    return (
        series.mean(),
        series.std(),
        series.skew(),
        series.kurtosis(),
    )


def writing(moments, col):
    """
    Report the moments of an attribute and describe its distribution shape.

    Args:
        moments: Indexable holding, in order, the mean, standard deviation,
            skewness and excess kurtosis.
        col: Name of the attribute, used in the printed report.

    Returns:
        None; the report is printed.
    """
    print(f'For the attribute {col}:')

    mean, stddev, skew, excess_kurtosis = (
        moments[0], moments[1], moments[2], moments[3]
    )
    print(f'Mean = {mean:.2f}, '
          f'Standard Deviation = {stddev:.2f}, '
          f'Skewness = {skew:.2f}, '
          f'Excess Kurtosis = {excess_kurtosis:.2f}.')

    # Skewness within [-0.5, 0.5] is reported as "not skewed".
    skewness = 'right' if skew > 0.5 else 'left' if skew < -0.5 else 'not'

    # The sign of the excess kurtosis decides the tail classification.
    if excess_kurtosis > 0:
        kurtosis = 'leptokurtic'
    else:
        kurtosis = 'platykurtic' if excess_kurtosis < 0 else 'mesokurtic'

    print(f'The data was {skewness} skewed and {kurtosis}.')


def main():
    """
    Run the whole workflow on the module-level ``df``.

    Cleans the data, saves the three plots, prints the statistical moments of
    the first numeric column, writes the processed data to Google Drive and
    to a local CSV, and triggers a download of the local copy. Stops early
    with a message when the cleaned data has no numeric content.
    """
    global df
    df = preprocessing(df)

    # Everything below needs numeric data to work with.
    numeric_df = df.select_dtypes(include=['number'])
    if numeric_df.empty:
        print("No numeric columns available for analysis.")
        return

    # Column analysed for the moments; replace with your chosen column.
    col = numeric_df.columns[0]

    # Produce the plots in a fixed order.
    for make_plot in (plot_relational_plot,
                      plot_categorical_plot,
                      plot_statistical_plot):
        make_plot(df)

    # Compute the moments and print their interpretation.
    writing(statistical_analysis(numeric_df, col), col)

    # Keep a copy of the processed dataset on Google Drive.
    processed_file_path = '/content/drive/MyDrive/ColabDatasets/processed_data.csv'
    df.to_csv(processed_file_path, index=False)
    print(f"Processed data saved at: {processed_file_path}")

    # Write a local copy and download it.
    local_copy = 'processed_data.csv'
    df.to_csv(local_copy, index=False)
    files.download(local_copy)


# Run the workflow only when this module is the program's entry point
# (__name__ is '__main__').
if __name__ == '__main__':
    main()


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Dataset Info:
<class 'pandas.core.frame.DataFrame'>
Index: 4410 entries, 256 to 13758
Data columns (total 10 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   country_code         4410 non-null   object 
 1   country_name         4410 non-null   object 
 2   region               4410 non-null   object 
 3   sub_region           4410 non-null   object 
 4   intermediate_region  4410 non-null   object 
 5   indicator_code       4410 non-null   object 
 6   indicator_name       4410 non-null   object 
 7   year                 4410 non-null   int64  
 8   gdp_per_capita       4410 non-null   float64
 9   gdp_variation        4410 non-null   float64
dtypes: float64(2), int64(1), object(7)
memory usage: 379.0+ KB
None

Dataset Description:
              year  gdp_per_capita  gdp_variation
count  4410

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>