<a href="https://colab.research.google.com/github/sneha5678-cmyk/12-FEB-2025/blob/main/PCA.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install plotly



In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
import plotly.express as px
import plotly.graph_objects as go
from sklearn.preprocessing import LabelEncoder

# Location of the input CSV (Colab's /content directory); change this one
# constant to run the notebook against a file stored elsewhere.
DATA_PATH = '/content/pca.csv'

# Read the dataset
df = pd.read_csv(DATA_PATH)

# Save country names: they label the points in the plots but are not a PCA input
countries = df['country']

# Remove the 'country' column for PCA. drop() returns a new frame, so the raw
# `df` stays untouched and no explicit copy is needed.
df_pca = df.drop(columns='country')

# Standardize the features so that no single feature dominates the
# components purely because of its scale
scaler = StandardScaler()
df_scaled = scaler.fit_transform(df_pca)

# Perform PCA, keeping every component
pca = PCA()
pca_result = pca.fit_transform(df_scaled)

# Explained variance ratio per component and its running total
explained_variance_ratio = pca.explained_variance_ratio_
cumulative_variance_ratio = np.cumsum(explained_variance_ratio)

# Create a DataFrame with PCA results. Columns are named after the number of
# components PCA actually produced: with the default PCA() that is
# min(n_samples, n_features), which can be smaller than the feature count.
pca_df = pd.DataFrame(
    data=pca_result,
    columns=[f'PC{i + 1}' for i in range(pca_result.shape[1])],
)
# Assign by position so each row keeps its own country regardless of df's index
pca_df['Country'] = countries.to_numpy()

# Cumulative explained variance against the number of retained components
component_counts = range(1, len(explained_variance_ratio) + 1)
variance_axis_labels = {
    'x': 'Number of Components',
    'y': 'Cumulative Explained Variance Ratio',
}
fig_variance = px.line(
    x=component_counts,
    y=cumulative_variance_ratio,
    labels=variance_axis_labels,
    title='Cumulative Explained Variance Ratio',
)

# Dashed red reference line marking the 80% level
fig_variance.add_hline(
    y=0.8,
    line_color="red",
    line_dash="dash",
    annotation_text="80% threshold",
)
fig_variance.show()

# Axis labels shared by both scatter plots, built once so the 2D and 3D
# figures cannot drift apart. zip() stops at the shorter input, so a label is
# only created for components that actually exist.
ordinal_names = ('First', 'Second', 'Third')
pc_axis_labels = {
    f'PC{rank}': f'{ordinal} Principal Component ({ratio:.2%} variance)'
    for rank, (ordinal, ratio) in enumerate(
        zip(ordinal_names, explained_variance_ratio), start=1
    )
}

# Create 2D scatter plot (needs at least two principal components)
if len(pc_axis_labels) >= 2:
    fig_2d = px.scatter(
        pca_df, x='PC1', y='PC2',
        hover_data=['Country'],
        title='2D PCA Visualization',
        labels={axis: pc_axis_labels[axis] for axis in ('PC1', 'PC2')},
    )
    fig_2d.show()
else:
    print("Skipping 2D PCA plot: fewer than 2 principal components available.")

# Create 3D scatter plot (needs at least three principal components)
if len(pc_axis_labels) >= 3:
    fig_3d = px.scatter_3d(
        pca_df, x='PC1', y='PC2', z='PC3',
        hover_data=['Country'],
        title='3D PCA Visualization',
        labels={axis: pc_axis_labels[axis] for axis in ('PC1', 'PC2', 'PC3')},
    )
    fig_3d.show()
else:
    print("Skipping 3D PCA plot: fewer than 3 principal components available.")

# Print feature importance: one row per principal component, one column per
# original feature. Row labels follow the number of rows in pca.components_
# rather than the feature count, because the default PCA() keeps
# min(n_samples, n_features) components and the two can differ.
components_df = pd.DataFrame(
    pca.components_,
    columns=df_pca.columns,
    index=[f'PC{i + 1}' for i in range(pca.components_.shape[0])],
)
print("\nFeature importance (PCA components):")
print(components_df)

# Print explained variance ratio for each component (numbered from 1)
print("\nExplained variance ratio for each component:")
for component_number, ratio in enumerate(explained_variance_ratio, start=1):
    print(f"PC{component_number}: {ratio:.3%}")


Feature importance (PCA components):
     child_mort   exports    health   imports    income  inflation  \
PC1   -0.419519  0.283897  0.150838  0.161482  0.398441  -0.193173   
PC2    0.192884  0.613163 -0.243087  0.671821  0.022536  -0.008404   
PC3   -0.029544  0.144761 -0.596632 -0.299927  0.301548   0.642520   
PC4    0.370653  0.003091  0.461897 -0.071907  0.392159   0.150442   
PC5   -0.168970  0.057616  0.518000  0.255376 -0.247150   0.714869   
PC6   -0.200628  0.059333 -0.007276  0.030032 -0.160347  -0.066285   
PC7    0.079489  0.707303  0.249831 -0.592190 -0.095562  -0.104633   
PC8    0.682743  0.014197 -0.072497  0.028946 -0.352624   0.011538   
PC9   -0.327542  0.123082 -0.113088 -0.099037 -0.612982   0.025236   

     life_expec  total_fer      gdpp  
PC1    0.425839  -0.403729  0.392645  
PC2   -0.222707   0.155233 -0.046022  
PC3    0.113919   0.019549  0.122977  
PC4   -0.203797   0.378304  0.531995  
PC5    0.108220  -0.135262 -0.180167  
PC6    0.601127   0.750689 