In [None]:
import csv

In [None]:
import pandas as pd
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score


# Read the HCF scores CSV into a DataFrame.
df = pd.read_csv('/content/drive/MyDrive/finM/transformed_hcf_scores.csv')

# Model inputs, grouped by the encoding each group receives below.
categorical_cols = ['Sector', 'Industry']
numeric_cols = [
    'MarketCap',
    'Direct Management - hcf score',
    'Emotional Connection - hcf score',
    'Engagement - hcf score',
    'Extrinsic - hcf score',
    'Innovation - hcf score',
    'Organizational Alignment - hcf score',
    'Organizational Effectiveness - hcf score',
]

# Categorical columns are one-hot encoded (unknown categories are ignored at
# transform time); numeric columns are standardized.
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_cols),
        ('num', StandardScaler(), numeric_cols),
    ]
)

# Number of clusters passed to KMeans; the seed is fixed for repeatable runs.
k = 5
pipeline = Pipeline(
    steps=[
        ('prep', preprocessor),
        ('cluster', KMeans(n_clusters=k, random_state=42)),
    ]
)


# The pipeline has no imputation step, so rows missing any model input are
# dropped before fitting.
feature_cols = categorical_cols + numeric_cols
df = df.dropna(subset=feature_cols)

# Fit preprocessing + KMeans in one go and keep the assignment on the frame.
labels = pipeline.fit_predict(df[feature_cols])
df['cluster'] = labels

# Re-apply the already-fitted preprocessing step to obtain the encoded matrix,
# then score the clustering on it.
feat_matrix = pipeline['prep'].transform(df[feature_cols])
sil_score = silhouette_score(feat_matrix, labels)
print(f'Chosen k={k}, silhouette score = {sil_score:.3f}')


# Scan k = 2..10 on the preprocessed matrix and remember the k with the
# highest silhouette score (the first such k wins on ties).
best_k = 0
best_score = -1
for k_try in range(2, 11):
    km = KMeans(n_clusters=k_try, random_state=42).fit(feat_matrix)
    lbl = km.labels_
    score = silhouette_score(feat_matrix, lbl)
    if not score > best_score:
        continue
    best_k, best_score = k_try, score
print(f'Best k by silhouette: {best_k} (score={best_score:.3f})')


In [None]:
# Number of rows assigned to each cluster label, largest first.
cluster_sizes = df['cluster'].value_counts()
print(cluster_sizes)


In [None]:
# cluster_companies[i] holds the identifying columns of the rows labelled i.
# Iterate over k (the n_clusters value given to KMeans) instead of a literal 5,
# so the list stays complete if k is changed.
cluster_companies = [
    df.loc[df['cluster'] == i, ['Code', 'Name', 'Sector']]
    for i in range(k)
]

In [None]:
import pandas as pd

# Output column names of the fitted preprocessing step.
feat_names = pipeline.named_steps['prep'].get_feature_names_out()

# One row per cluster centre, in the preprocessed feature space.
centers = pipeline.named_steps['cluster'].cluster_centers_
centroids_df = pd.DataFrame(centers, columns=feat_names)

for i, row in centroids_df.iterrows():
    print(f"\nCluster {i} top features:")
    # Rank features by magnitude but print the signed coordinate: the sign
    # tells whether the centre sits above or below zero on that feature,
    # which printing row.abs() would hide.
    top = row.abs().sort_values(ascending=False).index[:5]
    print(row.loc[top])

In [None]:
# The Python parsing engine is used with on_bad_lines="skip", so malformed
# rows are dropped instead of aborting the load.
text17_path = "/content/drive/MyDrive/finM/textData2/gd_sample_sy2017.csv"
text17_df = pd.read_csv(text17_path, engine="python", on_bad_lines="skip")


In [None]:
# Display the column labels of the frame loaded above.
text17_df.columns

In [None]:
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, FunctionTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.cluster import KMeans
from sklearn.impute import SimpleImputer

# Columns treated as numeric: imputed with the column mean, then standardized.
numeric_cols = [
    'Helpful Count', 'Not Helpful Count',
    'Rating: Overall', 'Rating: Work/Life Balance',
    'Rating: Culture & Values', 'Rating: Career Opportunities',
    'Rating: Comp & Benefits', 'Rating: Senior Management',
    'Rating: Diversity & Inclusion', 'Length of Employment'
]

# Columns treated as categorical: one-hot encoded.
categorical_cols = [
    'Sector', 'Industry', 'GICS Sector', 'Exchange', 'Gender'
]

# Text columns that combine_text joins into a single string per row.
text_cols = [
    'Summary', 'Description', 'PROs', 'CONs', 'Advice to Management'
]

numeric_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler',  StandardScaler())
])

# handle_unknown='ignore' matches the encoder used for the HCF-score pipeline
# and keeps transform/predict from raising on categories not seen during fit.
categorical_transformer = Pipeline([
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])


def combine_text(X):
    """Join the ``text_cols`` values of each row into one space-separated string.

    Parameters
    ----------
    X : pandas.DataFrame
        Frame containing at least the columns listed in ``text_cols``.

    Returns
    -------
    pandas.Series
        One string per row of ``X``: the ``text_cols`` values in list order,
        separated by single spaces. Missing values contribute an empty string.
    """
    # Cast to str after filling gaps: str.join raises TypeError on any
    # non-string cell (e.g. a value that was parsed as a number).
    return X[text_cols].fillna("").astype(str).agg(" ".join, axis=1)

# Text branch: merge the text columns into one string per row, then build
# TF-IDF features (at most 5,000 terms, English stop words removed).
text_transformer = Pipeline(steps=[
    ('selector', FunctionTransformer(combine_text, validate=False)),
    ('tfidf', TfidfVectorizer(max_features=5_000, stop_words='english')),
])

# Send each column group through its own transformer; columns not listed
# in any group are dropped.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_cols),
        ('cat', categorical_transformer, categorical_cols),
        ('txt', text_transformer, text_cols),
    ],
    remainder='drop',
)

# Preprocess, reduce to 50 SVD components, then cluster into 5 groups.
pipeline = Pipeline(steps=[
    ('preproc', preprocessor),
    ('svd', TruncatedSVD(n_components=50, random_state=42)),
    ('kmeans', KMeans(n_clusters=5, random_state=42)),
])

# Fit the whole pipeline and store each row's cluster label on the frame.
cluster_labels = pipeline.fit_predict(text17_df)
text17_df['cluster'] = cluster_labels

text_cluster_sizes = text17_df['cluster'].value_counts()
print(text_cluster_sizes)


In [None]:
# t_cluster_companies[i] holds the identifying columns of the rows labelled i.
# The cluster count is read from the KMeans step of the pipeline instead of
# repeating the literal 5, so the list stays complete if n_clusters changes.
n_text_clusters = pipeline.named_steps['kmeans'].n_clusters
t_cluster_companies = [
    text17_df.loc[text17_df['cluster'] == i, ['Ticker Symbol', 'Company', 'ICB Sector']]
    for i in range(n_text_clusters)
]

In [None]:
# Count rows per distinct value of the "Company" column.
text17_df["Company"].value_counts()