Os dados utilizados neste notebook estão disponíveis para download no repositório do GitHub: https://github.com/reisaraujo-miguel/trab2-ia-censo-ed-2023/tree/main


In [None]:
import pandas as pd

In [None]:
# Load microdados-clean.csv (semicolon-separated, Latin-1 encoded) into a DataFrame.
df = pd.read_csv("microdados-clean.csv", encoding='latin1', sep=';')
# Trailing bare expression: the notebook renders the DataFrame as the cell output.
df

In [None]:
# Print a concise summary of the DataFrame (columns, dtypes, non-null counts).
df.info()

In [None]:
# Display the QT_SALAS_UTILIZADAS column as the cell output (this cell sits before the scaling step).
df['QT_SALAS_UTILIZADAS']

# Pré-processamento dos dados

Precisamos escalar os dados quantitativos entre 0 e 1, já que os dados são um misto de dados binários e escalares. Para isso vamos usar o MinMaxScaler do scikit-learn, que utiliza a seguinte equação:

X_std = (X - X.min(axis=0)) / (X.max(axis=0) - X.min(axis=0))

X_scaled = X_std * (max - min) + min

onde min e max são os limites do intervalo desejado (parâmetro feature_range; por padrão, 0 e 1).

In [None]:
from sklearn.preprocessing import MinMaxScaler

In [None]:
# Scale the quantitative ("QT"-prefixed) columns to the 0-1 range.
#
# MinMaxScaler rescales every feature independently, so fitting it once on
# all QT columns yields the same values as fitting it column by column.
# Fitting once also leaves `scaler` holding the min/max of every QT column,
# instead of only the last one, so the scaling can be inverted or reused.

scaler = MinMaxScaler()

qt_columns = [col for col in df.columns if col.startswith("QT")]

# Guard: fit_transform rejects an input with zero feature columns.
if qt_columns:
    scaled_values = scaler.fit_transform(df[qt_columns])
    # Wrap the result in a DataFrame so each column is replaced by name and
    # takes the scaled float dtype.
    df[qt_columns] = pd.DataFrame(scaled_values, index=df.index, columns=qt_columns)

# Trailing bare expression: the notebook shows the list of scaled columns.
qt_columns

In [None]:
# Display the QT_SALAS_UTILIZADAS column again (this cell sits after the scaling step).
df['QT_SALAS_UTILIZADAS']

# K-Prototypes

In [None]:
# Positional indices of every column whose name does not start with "QT".
# These positions are later passed to K-Prototypes as the categorical columns.
binary_columns = [
    position
    for position, name in enumerate(df.columns)
    if not name.startswith("QT")
]

# Trailing bare expression: the notebook shows the index list.
binary_columns

# Método do cotovelo para K-Prototypes

In [None]:
from kmodes.kprototypes import KPrototypes
from sklearn.metrics import silhouette_score
import matplotlib.pyplot as plt

In [None]:
# Elbow method: fit K-Prototypes for each candidate k and record the final
# clustering cost, then plot cost against k.
costs = []
K = range(2, 10)

# Build the feature matrix once, since it does not change between iterations.
# Any CLUSTER_LABELS column written by a clustering cell is left out, so
# re-running this cell never clusters on labels from an earlier run.
data_array = df.drop(columns='CLUSTER_LABELS', errors='ignore').values

for k in K:
    kp = KPrototypes(n_clusters=k, init='Cao', n_init=5, n_jobs=8)

    # Only cost_ is needed here, so fit() is enough; the labels returned by
    # fit_predict() were being discarded.
    kp.fit(data_array, categorical=binary_columns)
    costs.append(kp.cost_)

plt.plot(K, costs, marker='o')
plt.xlabel('Número de clusters K')
plt.ylabel('Custo (dissimilarity)')
plt.title('Método do cotovelo para K-Prototypes')
plt.show()

# 3 clusters

In [None]:
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
import numpy as np

In [None]:
# Fit K-Prototypes with 3 clusters and store the labels on the DataFrame.
kp = KPrototypes(n_clusters=3, init='Cao', verbose=1, n_jobs=8)

# Leave out any CLUSTER_LABELS column from a previous run of this or another
# clustering cell; otherwise the old labels would be fed to the model as an
# extra numeric feature. errors='ignore' makes this a no-op on the first run.
data_array = df.drop(columns='CLUSTER_LABELS', errors='ignore').values

clusters = kp.fit_predict(data_array, categorical=binary_columns)

df['CLUSTER_LABELS'] = clusters

In [None]:
# Project the clustered matrix onto its first two principal components and
# draw a scatter plot coloured by cluster.
pca = PCA(n_components=2)
principal_components = pca.fit_transform(data_array)

pca_df = pd.DataFrame(principal_components, columns=['PC1', 'PC2']).assign(Cluster=clusters)

fig, ax = plt.subplots(figsize=(10, 6))

unique_clusters = np.unique(clusters)

# One scatter call per cluster so that each one gets its own legend entry.
for cluster_id in unique_clusters:
    subset = pca_df.loc[pca_df['Cluster'].eq(cluster_id)]
    ax.scatter(subset['PC1'], subset['PC2'], label=f'Cluster {cluster_id}', alpha=0.7, s=100)

ax.set_title('Clusters da Infraestrutura das Escolas (PCA)')
ax.set_xlabel('Principal Component 1')
ax.set_ylabel('Principal Component 2')
ax.legend(title="Grupos")
ax.grid(True, linestyle='--', alpha=0.5)

# Save before show(), while the figure is still open.
fig.savefig('pca-plot-3clusters.png', bbox_inches='tight')
plt.show()

In [None]:
# Per-cluster mean of every column except the label column itself.
feature_columns = [col for col in df.columns if col != 'CLUSTER_LABELS']
profile = df.groupby('CLUSTER_LABELS')[feature_columns].mean()

# Grouped bar chart: one group per cluster, one bar per column.
ax = profile.plot.bar(figsize=(10, 5), rot=0)
ax.set_title('Composição de Infraestrutura por Cluster')
ax.set_ylabel('Porcentagem das Escolas (0.0 à 1.0)')
# Anchor the legend outside the axes, to the right of the plot.
ax.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
ax.figure.savefig('clusters-plot-3clusters.png', bbox_inches='tight')
plt.show()

# 4 clusters 

In [None]:
# Fit K-Prototypes with 4 clusters and store the labels on the DataFrame.
kp = KPrototypes(n_clusters=4, init='Cao', verbose=1, n_jobs=4)

# df already carries the CLUSTER_LABELS column written by an earlier
# clustering cell. Drop it before building the matrix, otherwise the old
# labels are fed to the model as an extra numeric feature.
# NOTE(review): binary_columns holds positional indices; this assumes they
# were computed before CLUSTER_LABELS was appended, so they still line up.
data_array = df.drop(columns='CLUSTER_LABELS', errors='ignore').values

clusters = kp.fit_predict(data_array, categorical=binary_columns)

df['CLUSTER_LABELS'] = clusters

In [None]:
# Project the clustered matrix onto its first two principal components and
# draw a scatter plot coloured by cluster.
pca = PCA(n_components=2)
principal_components = pca.fit_transform(data_array)

pca_df = pd.DataFrame(principal_components, columns=['PC1', 'PC2']).assign(Cluster=clusters)

fig, ax = plt.subplots(figsize=(10, 6))

unique_clusters = np.unique(clusters)

# One scatter call per cluster so that each one gets its own legend entry.
for cluster_id in unique_clusters:
    subset = pca_df.loc[pca_df['Cluster'].eq(cluster_id)]
    ax.scatter(subset['PC1'], subset['PC2'], label=f'Cluster {cluster_id}', alpha=0.7, s=100)

ax.set_title('Clusters da Infraestrutura das Escolas (PCA)')
ax.set_xlabel('Principal Component 1')
ax.set_ylabel('Principal Component 2')
ax.legend(title="Grupos")
ax.grid(True, linestyle='--', alpha=0.5)

# Save before show(), while the figure is still open.
fig.savefig('pca-plot-4clusters.png', bbox_inches='tight')
plt.show()

In [None]:
# Per-cluster mean of every column except the label column itself.
feature_columns = [col for col in df.columns if col != 'CLUSTER_LABELS']
profile = df.groupby('CLUSTER_LABELS')[feature_columns].mean()

# Grouped bar chart: one group per cluster, one bar per column.
ax = profile.plot.bar(figsize=(10, 5), rot=0)
ax.set_title('Composição de Infraestrutura por Cluster')
ax.set_ylabel('Porcentagem das Escolas (0.0 à 1.0)')
# Anchor the legend outside the axes, to the right of the plot.
ax.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
ax.figure.savefig('clusters-plot-4clusters.png', bbox_inches='tight')
plt.show()

# 5 clusters

In [None]:
# Fit K-Prototypes with 5 clusters and store the labels on the DataFrame.
kp = KPrototypes(n_clusters=5, init='Cao', verbose=1, n_jobs=4)

# df already carries the CLUSTER_LABELS column written by an earlier
# clustering cell. Drop it before building the matrix, otherwise the old
# labels are fed to the model as an extra numeric feature.
# NOTE(review): binary_columns holds positional indices; this assumes they
# were computed before CLUSTER_LABELS was appended, so they still line up.
data_array = df.drop(columns='CLUSTER_LABELS', errors='ignore').values

clusters = kp.fit_predict(data_array, categorical=binary_columns)

df['CLUSTER_LABELS'] = clusters

In [None]:
# Project the clustered matrix onto its first two principal components and
# draw a scatter plot coloured by cluster.
pca = PCA(n_components=2)
principal_components = pca.fit_transform(data_array)

pca_df = pd.DataFrame(principal_components, columns=['PC1', 'PC2']).assign(Cluster=clusters)

fig, ax = plt.subplots(figsize=(10, 6))

unique_clusters = np.unique(clusters)

# One scatter call per cluster so that each one gets its own legend entry.
for cluster_id in unique_clusters:
    subset = pca_df.loc[pca_df['Cluster'].eq(cluster_id)]
    ax.scatter(subset['PC1'], subset['PC2'], label=f'Cluster {cluster_id}', alpha=0.7, s=100)

ax.set_title('Clusters da Infraestrutura das Escolas (PCA)')
ax.set_xlabel('Principal Component 1')
ax.set_ylabel('Principal Component 2')
ax.legend(title="Grupos")
ax.grid(True, linestyle='--', alpha=0.5)

# Save before show(), while the figure is still open.
fig.savefig('pca-plot-5clusters.png', bbox_inches='tight')
plt.show()

In [None]:
# Per-cluster mean of every column except the label column itself.
feature_columns = [col for col in df.columns if col != 'CLUSTER_LABELS']
profile = df.groupby('CLUSTER_LABELS')[feature_columns].mean()

# Grouped bar chart: one group per cluster, one bar per column.
ax = profile.plot.bar(figsize=(10, 5), rot=0)
ax.set_title('Composição de Infraestrutura por Cluster')
ax.set_ylabel('Porcentagem das Escolas (0.0 à 1.0)')
# Anchor the legend outside the axes, to the right of the plot.
ax.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
ax.figure.savefig('clusters-plot-5clusters.png', bbox_inches='tight')
plt.show()