In [None]:
import pandas as pd
import numpy as np
from sklearn.decomposition import PCA, FactorAnalysis
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from scipy.cluster.hierarchy import dendrogram, linkage
import matplotlib.pyplot as plt
from factor_analyzer import FactorAnalyzer
from factor_analyzer.factor_analyzer import calculate_kmo

In [None]:
# Load the raw survey export; the later cells all work from this frame.
survey_df = pd.read_csv('Survey.csv')

# Positional selection of columns 19..45 (the slice end is exclusive).
# Note: a later cell reassigns sur_int to every numeric column of survey_df.
sur_int = survey_df.iloc[:, slice(19, 46)]

In [None]:
# Standardise the selected columns (StandardScaler defaults: centre and scale
# each column). A later cell reassigns both `scaler` and `sur_int_std` after
# sur_int is redefined, so this result is superseded on a top-to-bottom run.
scaler = StandardScaler()
scaler.fit(sur_int)
sur_int_std = scaler.transform(sur_int)

In [None]:
# Use every numeric column of the survey as an analysis variable.
#
# The clustering cell further down writes a numeric 'cluster' column back onto
# survey_df. Without the drop below, re-running this cell after clustering
# would silently feed the cluster labels back in as a feature (and list
# 'cluster' among the profiled columns). errors='ignore' keeps the first,
# fresh-kernel run working when the column does not exist yet.
numeric_cols = (
    survey_df.select_dtypes(include=[np.number])
    .columns
    .drop('cluster', errors='ignore')
)
sur_int = survey_df[numeric_cols]

In [None]:
# Standardise the numeric survey columns; sur_int_std is the matrix consumed by
# the PCA, factor-analysis and clustering cells below.
scaler = StandardScaler().fit(sur_int)
sur_int_std = scaler.transform(sur_int)

PRINCIPAL COMPONENT ANALYSIS

In [None]:
# Fit a PCA with no component limit on the standardised survey data.
pca = PCA()
pca_result = pca.fit_transform(sur_int_std)

# Scree plot: proportion of variance explained by each component, in order.
variance_share = pca.explained_variance_ratio_
component_ids = np.arange(1, variance_share.size + 1)

fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(component_ids, variance_share, marker='o')
ax.set_xlabel('Principal Component')
ax.set_ylabel('Variance Explained')
ax.set_title('Scree Plot')
plt.show()

In [None]:
# Kaiser-Meyer-Olkin test on the unscaled numeric columns. calculate_kmo
# returns a pair; only the second element (kmo_model) is reported here,
# rounded to three decimals.
kmo_all, kmo_model = calculate_kmo(sur_int)
print("KMO Measure: {:.3f}".format(kmo_model))

In [None]:
# Factor analysis with varimax rotation on the standardised data.
# The factor count is named once so that the loading-table labels below cannot
# drift out of sync with the fitted model (previously the 4 was repeated as a
# hand-written list of four column names, which raises a shape error as soon
# as only one of the two is edited).
n_factors = 4
fa = FactorAnalyzer(n_factors=n_factors, rotation='varimax')
fa.fit(sur_int_std)

# One label per column of the fitted loadings matrix: Factor1, Factor2, ...
factor_labels = [f'Factor{k}' for k in range(1, fa.loadings_.shape[1] + 1)]
loadings = pd.DataFrame(fa.loadings_, index=sur_int.columns,
                        columns=factor_labels)
print("\nFactor Loadings:")
print(loadings)

# Communalities reported by the fitted model, one row per input variable.
communalities = pd.DataFrame(fa.get_communalities(), index=sur_int.columns,
                             columns=['Communalities'])
print("\nCommunalities:")
print(communalities)

CLUSTER ANALYSIS

In [None]:
# Elbow method: fit k-means for k = 1..10 and record the within-cluster sum of
# squares (inertia_) of each fit.
#
# n_init is pinned explicitly. scikit-learn changed the default from 10 to
# 'auto' in 1.4 (and emits a FutureWarning in 1.2/1.3), so leaving it unset
# makes the curve depend on the installed version. 10 restarts matches the
# long-standing default.
cluster_range = range(1, 11)
wcss = []
for i in cluster_range:
    kmeans = KMeans(n_clusters=i, init='k-means++', n_init=10, random_state=123)
    kmeans.fit(sur_int_std)
    wcss.append(kmeans.inertia_)

plt.figure(figsize=(10, 6))
plt.plot(cluster_range, wcss, marker='o')
plt.xlabel('Number of clusters')
plt.ylabel('WCSS')
plt.title('Elbow Method')
plt.show()

# K-means clustering with 4 clusters.
# n_init is pinned to 10 so the assignment does not depend on the installed
# scikit-learn version (the default changed from 10 to 'auto' in 1.4).
# Side effect: adds/overwrites a 'cluster' column on survey_df.
kmeans = KMeans(n_clusters=4, init='k-means++', n_init=10, random_state=123)
survey_df['cluster'] = kmeans.fit_predict(sur_int_std)

# Hierarchical clustering (Ward linkage) on the standardised data.
ward_linkage = linkage(sur_int_std, method='ward')

plt.figure(figsize=(12, 7))
# truncate_mode='lastp' with p=12 contracts the tree to its last 12 merged
# clusters; contracted leaves are labelled with their observation count in
# parentheses rather than with a sample index, hence the x-axis label below.
dendrogram(ward_linkage, truncate_mode='lastp', p=12)
plt.title('Hierarchical Clustering Dendrogram')
plt.xlabel('Sample index or (cluster size)')
plt.ylabel('Distance')
plt.show()

# Cluster interpretation: per-cluster mean of every analysis column.
by_cluster = survey_df.groupby('cluster')
cluster_means = by_cluster[numeric_cols].mean()
print("\nCluster Profiles (Mean Values):", cluster_means, sep="\n")

# Respondent count per cluster, ordered by cluster label.
cluster_sizes = survey_df['cluster'].value_counts().sort_index()
print("\nNumber of respondents per cluster:", cluster_sizes, sep="\n")