In [None]:
import csv

In [None]:
import pandas as pd
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score


# --- Company clustering on sector/industry + HCF scores -----------------------
# Column groups fed to the preprocessing step.
categorical_cols = ['Sector', 'Industry']
numeric_cols = ['MarketCap'] + [
    f'{dimension} - hcf score'
    for dimension in (
        'Direct Management',
        'Emotional Connection',
        'Engagement',
        'Extrinsic',
        'Innovation',
        'Organizational Alignment',
        'Organizational Effectiveness',
    )
]
feature_cols = categorical_cols + numeric_cols

# Load the company table and keep only rows with every clustering feature present.
df = pd.read_csv('./data/transformed_hcf_scores.csv').dropna(subset=feature_cols)

# One-hot encode the categoricals, standardise the numerics.
preprocessor = ColumnTransformer(transformers=[
    ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_cols),
    ('num', StandardScaler(), numeric_cols),
])

k = 5
pipeline = Pipeline(steps=[
    ('prep', preprocessor),
    ('cluster', KMeans(n_clusters=k, random_state=42)),
])

# Fit the whole pipeline and attach the resulting label to each company.
labels = pipeline.fit_predict(df[feature_cols])
df['cluster'] = labels

# Silhouette of the chosen k, measured in the preprocessed feature space.
feat_matrix = pipeline.named_steps['prep'].transform(df[feature_cols])
sil_score = silhouette_score(feat_matrix, labels)
print(f'Chosen k={k}, silhouette score = {sil_score:.3f}')

# Sweep k = 2..10 on the same feature matrix and report the best silhouette.
# The sweep is informational only: the fitted pipeline keeps the `k` chosen above.
silhouette_by_k = {}
for k_try in range(2, 11):
    lbl = KMeans(n_clusters=k_try, random_state=42).fit_predict(feat_matrix)
    silhouette_by_k[k_try] = silhouette_score(feat_matrix, lbl)

# The (0, -1) sentinel is listed first so a candidate must beat -1 strictly.
best_k, best_score = max(
    [(0, -1)] + list(silhouette_by_k.items()),
    key=lambda item: item[1],
)
print(f'Best k by silhouette: {best_k} (score={best_score:.3f})')


In [None]:
# Number of companies assigned to each KMeans cluster label.
print(df['cluster'].value_counts())


In [None]:
# cluster_companies[i] holds (Code, Name, Sector) for the members of cluster label i.
# The number of clusters is derived from the assigned labels instead of a
# hard-coded 5, so the list stays complete if `k` is changed in the clustering cell.
n_company_clusters = int(df['cluster'].max()) + 1
cluster_companies = [
    df.loc[df['cluster'] == i, ['Code', 'Name', 'Sector']]
    for i in range(n_company_clusters)
]

In [None]:
# Display the (Code, Name, Sector) rows of the companies in cluster label 3.
cluster_companies[3]

In [None]:
import pandas as pd

# Names of the transformed columns (one-hot categories + scaled numerics); they
# label the columns of the KMeans centroid matrix produced by the same pipeline.
feat_names = pipeline.named_steps['prep'].get_feature_names_out()
centers = pipeline.named_steps['cluster'].cluster_centers_
centroids_df = pd.DataFrame(centers, columns=feat_names)

# Rank features by absolute centroid coordinate, but print the SIGNED value:
# the sign says whether the cluster sits above or below the (standardised) mean,
# which the previous `row.abs()` printout discarded.
for i, row in centroids_df.iterrows():
    print(f"\nCluster {i} top features:")
    top_features = row.abs().sort_values(ascending=False).head(5).index
    print(row.loc[top_features])

In [None]:
# Load the 2017 review sample with the pure-Python parser.
# on_bad_lines="skip" drops rows the parser cannot read instead of raising, so
# the resulting frame may contain fewer rows than the file has lines.
text17_df = pd.read_csv(
    "./data/gd_sample_sy2017.csv",
    engine="python",
    on_bad_lines="skip"
)


In [None]:
# List the column names available in the review sample.
text17_df.columns

In [None]:
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, FunctionTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.cluster import KMeans
from sklearn.impute import SimpleImputer

# --- Review clustering: numeric ratings + categoricals + free text ------------
# NOTE: this cell rebinds `numeric_cols`, `categorical_cols`, `preprocessor` and
# `pipeline`, replacing the objects created by the company-clustering cell.

# Numeric review columns (imputed and standardised below).
# presumably 'Length of Employment' is numeric in this file — TODO confirm.
numeric_cols = [
    'Helpful Count', 'Not Helpful Count',
    'Rating: Overall', 'Rating: Work/Life Balance',
    'Rating: Culture & Values', 'Rating: Career Opportunities',
    'Rating: Comp & Benefits', 'Rating: Senior Management',
    'Rating: Diversity & Inclusion', 'Length of Employment'
]

# Categorical review columns (one-hot encoded below).
categorical_cols = [
    'Sector', 'Industry', 'GICS Sector', 'Exchange', 'Gender'
]

# Free-text review columns (concatenated and TF-IDF vectorised below).
text_cols = [
    'Summary', 'Description', 'PROs', 'CONs', 'Advice to Management'
]

# Missing numerics are replaced by the column mean, then standardised.
numeric_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler',  StandardScaler())
])

# One-hot encoding with default settings (no handle_unknown override).
categorical_transformer = Pipeline([
    ('onehot', OneHotEncoder())
])


def combine_text(X):
    """Join the `text_cols` of each row into one space-separated string.

    Missing values are replaced by empty strings before joining.
    Reads the module-level `text_cols` list; returns one string per row of X.
    """
    return X[text_cols].fillna("").agg(" ".join, axis=1)

# Concatenate the text fields, then build a TF-IDF matrix capped at 5,000 terms
# with English stop words removed.
text_transformer = Pipeline([
    ('selector', FunctionTransformer(combine_text, validate=False)),
    ('tfidf', TfidfVectorizer(max_features=5_000, stop_words='english'))
])

# Apply each transformer to its column group; any other column is dropped.
preprocessor = ColumnTransformer([
    ('num', numeric_transformer, numeric_cols),
    ('cat', categorical_transformer, categorical_cols),
    ('txt', text_transformer, text_cols)
], remainder='drop')

# Preprocess -> reduce to 50 SVD components -> KMeans with 5 clusters.
pipeline = Pipeline([
    ('preproc', preprocessor),
    ('svd',     TruncatedSVD(n_components=50, random_state=42)),
    ('kmeans',  KMeans(n_clusters=5, random_state=42))
])

# Fit on every review and obtain one cluster label per row.
cluster_labels = pipeline.fit_predict(text17_df)

# Store the label on the review frame (adds/overwrites the 'cluster' column).
text17_df['cluster'] = cluster_labels

# Number of reviews per cluster.
print(text17_df['cluster'].value_counts())


In [None]:
# t_cluster_companies[i] holds (Ticker Symbol, Company, ICB Sector) for the
# reviews assigned to cluster label i. The cluster count comes from the assigned
# labels rather than a hard-coded 5, so it follows the KMeans configuration.
n_review_clusters = int(text17_df['cluster'].max()) + 1
t_cluster_companies = [
    text17_df.loc[text17_df['cluster'] == i, ['Ticker Symbol', 'Company', 'ICB Sector']]
    for i in range(n_review_clusters)
]

In [None]:
# Number of reviews per company in the review sample.
text17_df["Company"].value_counts()

In [None]:
# Load the HCF score table from the project-relative data directory (the same
# file the clustering cell reads) instead of an absolute Google Drive mount
# path, so the notebook runs outside Colab.
# NOTE(review): assumes ./data/transformed_hcf_scores.csv is the same file that
# was stored under the Drive folder — confirm before relying on the results.
hcf_df = pd.read_csv('./data/transformed_hcf_scores.csv')

In [None]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.decomposition import PCA, TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import Ridge
from sklearn.pipeline import make_pipeline
from scipy.linalg import orthogonal_procrustes
from sklearn.multioutput import MultiOutputRegressor
from sklearn.metrics import mean_squared_error, r2_score

# --- Pseudo-paired alignment of review space and HCF space (Procrustes) -------
hcf_score_cols = [
    'Direct Management - hcf score',
    'Emotional Connection - hcf score',
    'Engagement - hcf score',
    'Extrinsic - hcf score',
    'Innovation - hcf score',
    'Organizational Alignment - hcf score',
    'Organizational Effectiveness - hcf score'
]

# Filter ONCE and derive both the score matrix and the auxiliary fields from the
# same frame. Previously the scores were filtered and re-indexed on their own and
# the aux rows were then fetched with `hcf_df.loc[hcf_scores.index]`, which picks
# the FIRST n rows of hcf_df rather than the rows that survived dropna.
hcf_valid = hcf_df.dropna(subset=hcf_score_cols).reset_index(drop=True)
hcf_scores = hcf_valid[hcf_score_cols].copy()

pca_hcf = PCA(n_components=5, random_state=0)  # adjust n_components as needed
Z_hcf = pca_hcf.fit_transform(hcf_scores)  # shape (n_companies, latent_dim)

# One concatenated text string per review; missing fields count as empty.
text_fields = ['Advice to Management', 'PROs', 'CONs', 'Summary', 'Description']
text17_df['combined_text'] = text17_df[text_fields].fillna('').agg(' '.join, axis=1)

rating_cols = [
    'Rating: Overall', 'Rating: Work/Life Balance', 'Rating: Culture & Values',
    'Rating: Career Opportunities', 'Rating: Comp & Benefits', 'Rating: Senior Management',
    'Rating: Diversity & Inclusion'
]
numeric_cols = rating_cols + ['Fprob_PROs', 'Fprob_CONs', 'Fprob']

# Same guard the later cells use: create any absent numeric column as zeros so
# the selection below cannot raise KeyError.
for c in numeric_cols:
    if c not in text17_df.columns:
        text17_df[c] = 0.0
review_numeric = text17_df[numeric_cols].fillna(0)

# Text -> TF-IDF (uni- and bigrams) -> 50 SVD components.
tfidf = TfidfVectorizer(max_features=5000, ngram_range=(1,2), stop_words='english')
svd = TruncatedSVD(n_components=50, random_state=0)
text_mat = tfidf.fit_transform(text17_df['combined_text'])
text_latent = svd.fit_transform(text_mat)  # shape (n_reviews, 50)

# Review base features: text components followed by the numeric columns.
X_review_base = np.hstack([text_latent, review_numeric.to_numpy()])

pca_review = PCA(n_components=5, random_state=0)
Z_review = pca_review.fit_transform(X_review_base)  # shape (n_reviews, latent_dim)

# Shared auxiliary fields, one-hot encoded with a vocabulary covering both tables.
aux_fields = ['Sector', 'Industry']
hcf_df_aux = hcf_valid[aux_fields].fillna('missing').copy()  # row-aligned with hcf_scores
text17_df_aux = text17_df[aux_fields].fillna('missing').copy()

combined_aux = pd.concat([text17_df_aux, hcf_df_aux], axis=0).reset_index(drop=True)
ohe = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
ohe.fit(combined_aux)

X_aux_reviews = ohe.transform(text17_df_aux)    # shape (n_reviews, ...)
X_aux_hcf = ohe.transform(hcf_df_aux)           # one row per row of Z_hcf

# Regress each latent space on the shared auxiliary encoding.
reg_aux2hcf = Ridge(alpha=1.0)
reg_aux2hcf.fit(X_aux_hcf, Z_hcf)

reg_aux2rev = Ridge(alpha=1.0)
reg_aux2rev.fit(X_aux_reviews, Z_review)  # maps shared aux to review latent

# Pseudo-pairs: both latents predicted for the SAME auxiliary rows (the HCF side).
Z_hcf_from_aux = reg_aux2hcf.predict(X_aux_hcf)    # predicted HCF latent from its own aux
Z_rev_from_aux = reg_aux2rev.predict(X_aux_hcf)    # predicted review latent for same aux

# Orthogonal matrix R minimising ||Z_rev_from_aux @ R - Z_hcf_from_aux||_F.
R, scale = orthogonal_procrustes(Z_rev_from_aux, Z_hcf_from_aux)

def predict_hcf_from_review(review_row):
    """Predict the HCF scores for one review via the Procrustes alignment.

    The review is pushed through the fitted text pipeline (`tfidf` -> `svd`),
    concatenated with its numeric columns, projected with `pca_review`, rotated
    with `R` and mapped back to score space with `pca_hcf.inverse_transform`.

    Parameters
    ----------
    review_row : mapping with ``.get`` (e.g. a pandas Series or dict)
        Holds the `text_fields` and `numeric_cols` entries; absent or NaN
        entries are treated as empty text / 0.

    Returns
    -------
    pandas.Series indexed by `hcf_score_cols`.
    """
    # Mirror the training-time cleaning (fillna('') for text, fillna(0) for
    # numerics). Without it a NaN field becomes the literal token "nan" and a
    # NaN numeric is passed straight into the PCA transform.
    text_parts = []
    for f in text_fields:
        value = review_row.get(f, "")
        text_parts.append("" if pd.isna(value) else str(value))
    text = " ".join(text_parts)

    text_vec = tfidf.transform([text])
    text_red = svd.transform(text_vec)

    numeric_values = []
    for c in numeric_cols:
        value = review_row.get(c, 0)
        numeric_values.append(0.0 if pd.isna(value) else float(value))
    numeric = np.array(numeric_values, dtype=float).reshape(1, -1)

    X_base = np.hstack([text_red, numeric])
    z_rev = pca_review.transform(X_base)
    z_hcf_aligned = z_rev @ R
    hcf_pred = pca_hcf.inverse_transform(z_hcf_aligned)
    return pd.Series(hcf_pred.flatten(), index=hcf_score_cols)



# Rotate the aux-predicted review latents into HCF latent space and map them
# back to score space with the HCF PCA.
Z_rev_from_aux_aligned = Z_rev_from_aux @ R  
reconstructed_hcf_scores = pca_hcf.inverse_transform(Z_rev_from_aux_aligned)
original_hcf_scores = hcf_scores.values 


# RMSE over all scores between the observed HCF scores and the reconstruction.
# Measured on the same HCF rows that were used to fit the alignment.
rmse = np.sqrt(mean_squared_error(original_hcf_scores, reconstructed_hcf_scores))
print(f"Pseudo alignment RMSE (HCF recon from aux via review path): {rmse:.4f}")

def bootstrap_alignment(n_boot=50, seed=0):
    """Bootstrap the pseudo-alignment RMSE.

    Each replicate resamples HCF rows and review rows with replacement, refits
    both aux->latent Ridge models, re-estimates the Procrustes rotation on the
    resampled HCF aux rows and measures the reconstruction RMSE against the
    resampled HCF scores. `pca_hcf`, `Z_hcf` and `Z_review` are reused as fitted.

    Parameters
    ----------
    n_boot : int
        Number of bootstrap replicates.
    seed : int or None
        Seed for the resampling generator. The default makes repeated runs
        reproducible; pass None for fresh entropy on every call.

    Returns
    -------
    (mean_rmse, std_rmse) over the replicates.
    """
    # Local generator: results no longer depend on the global NumPy RNG state.
    rng = np.random.default_rng(seed)
    n_hcf = X_aux_hcf.shape[0]
    n_rev = X_aux_reviews.shape[0]

    rmses = []
    for _ in range(n_boot):
        idx_hcf = rng.integers(0, n_hcf, size=n_hcf)
        idx_rev = rng.integers(0, n_rev, size=n_rev)

        reg_h = Ridge(alpha=1.0).fit(X_aux_hcf[idx_hcf], Z_hcf[idx_hcf])
        reg_r = Ridge(alpha=1.0).fit(X_aux_reviews[idx_rev], Z_review[idx_rev])

        # Both latents are predicted for the same resampled HCF aux rows.
        Zh_from_aux = reg_h.predict(X_aux_hcf[idx_hcf])
        Zr_from_aux = reg_r.predict(X_aux_hcf[idx_hcf])
        R_boot, _ = orthogonal_procrustes(Zr_from_aux, Zh_from_aux)
        aligned = Zr_from_aux @ R_boot
        recon = pca_hcf.inverse_transform(aligned)
        orig = hcf_scores.values[idx_hcf]
        rmses.append(np.sqrt(mean_squared_error(orig, recon)))
    return np.mean(rmses), np.std(rmses)

# Run 30 bootstrap replicates and report the mean and standard deviation of the RMSE.
mean_rmse, std_rmse = bootstrap_alignment(30)
print(f"Bootstrap pseudo alignment RMSE: {mean_rmse:.4f} ± {std_rmse:.4f}")


In [None]:
import numpy as np
# Per-score sample variance (ddof=1) of the observed HCF scores.
var = np.var(hcf_scores.values, axis=0, ddof=1)
# Per-score RMSE of the reconstruction computed in the previous cell.
rmse_per_score = np.sqrt(mean_squared_error(original_hcf_scores, reconstructed_hcf_scores, multioutput='raw_values'))
# 1 - MSE / variance for each score; negative when the reconstruction error
# exceeds the score's own variance.
print("Explained fraction per score:", 1 - (rmse_per_score**2) / var)


In [None]:
# --- Shared configuration and features for the PCA/CCA grid search ------------
# (repeats the column lists of the previous cell so this cell is self-contained)
hcf_score_cols = [
    'Direct Management - hcf score',
    'Emotional Connection - hcf score',
    'Engagement - hcf score',
    'Extrinsic - hcf score',
    'Innovation - hcf score',
    'Organizational Alignment - hcf score',
    'Organizational Effectiveness - hcf score'
]
text_fields = ['Advice to Management', 'PROs', 'CONs', 'Summary', 'Description']
rating_cols = [
    'Rating: Overall', 'Rating: Work/Life Balance', 'Rating: Culture & Values',
    'Rating: Career Opportunities', 'Rating: Comp & Benefits', 'Rating: Senior Management',
    'Rating: Diversity & Inclusion'
]
numeric_cols = rating_cols + ['Fprob_PROs', 'Fprob_CONs', 'Fprob']
aux_fields = ['Sector', 'Industry']

# Keep only companies with all seven scores; scores and aux fields come from the
# same filtered frame, so their rows correspond.
hcf_valid = hcf_df.dropna(subset=hcf_score_cols).copy().reset_index(drop=True)
hcf_scores = hcf_valid[hcf_score_cols].copy()  # (n_hcf, 7)
hcf_df_aux = hcf_valid[aux_fields].fillna('missing').copy()  # aligned with hcf_scores

# One concatenated text string per review; missing fields count as empty.
text17_df['combined_text'] = text17_df[text_fields].fillna('').agg(' '.join, axis=1)
# Any numeric column absent from the review table is created as all zeros
# (this adds columns to text17_df in place).
for c in numeric_cols:
    if c not in text17_df.columns:
        text17_df[c] = 0.0
review_numeric = text17_df[numeric_cols].fillna(0)


# Text -> TF-IDF (uni- and bigrams, at most 5000 terms) -> 50 SVD components.
tfidf = TfidfVectorizer(max_features=5000, ngram_range=(1,2), stop_words='english')
svd = TruncatedSVD(n_components=50, random_state=0)
text_mat = tfidf.fit_transform(text17_df['combined_text'])
text_latent = svd.fit_transform(text_mat)  # (n_reviews, 50)

# Review base features: text components followed by the numeric columns.
X_review_base = np.hstack([text_latent, review_numeric.to_numpy()])  # (n_reviews, D)


# One-hot encode Sector/Industry with a vocabulary fitted on both tables.
text17_df_aux = text17_df[aux_fields].fillna('missing').copy()
combined_aux = pd.concat([text17_df_aux, hcf_df_aux], axis=0).reset_index(drop=True)
ohe = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
ohe.fit(combined_aux)

X_aux_reviews = ohe.transform(text17_df_aux)  # (n_reviews, ...)
X_aux_hcf = ohe.transform(hcf_df_aux)         # (n_hcf, ...), now aligned with hcf_scores

def explained_fraction(original, recon):
    """Return, per output column, ``1 - MSE / variance``.

    `original` and `recon` are 2-D arrays with one column per score. The
    variance is the sample variance (ddof=1) of `original`. Values can be
    negative when the reconstruction error exceeds the column's variance.
    """
    per_score_rmse = np.sqrt(
        mean_squared_error(original, recon, multioutput='raw_values')
    )
    per_score_mse = np.square(per_score_rmse)
    per_score_var = np.var(original, axis=0, ddof=1)
    return 1 - per_score_mse / per_score_var  # can be negative

# CCA is used below but was never imported in the notebook (NameError on a
# fresh kernel); import it here, next to its first use.
from sklearn.cross_decomposition import CCA

# --- Grid search over HCF / review latent sizes with a CCA bridge --------------
# NOTE: every configuration is scored on the same HCF rows it was fitted on.
best = {}
results = []

# The review-side PCA and aux->review Ridge do not depend on hcf_dim, so fit
# them once per rev_dim instead of once per (hcf_dim, rev_dim) pair.
rev_dims = [5, 10, 15]
review_models = {}
for rev_dim in rev_dims:
    pca_review = PCA(n_components=rev_dim, random_state=0)
    Z_review = pca_review.fit_transform(X_review_base)  # (n_reviews, rev_dim)

    reg_aux2rev = Ridge(alpha=1.0)
    reg_aux2rev.fit(X_aux_reviews, Z_review)
    # Review latent predicted for the HCF-side auxiliary rows (pseudo-pairs).
    review_models[rev_dim] = (pca_review, reg_aux2rev, reg_aux2rev.predict(X_aux_hcf))

for hcf_dim in [3, 5, 7]:
    pca_hcf = PCA(n_components=hcf_dim, random_state=0)
    Z_hcf = pca_hcf.fit_transform(hcf_scores)  # (n_hcf, hcf_dim)

    reg_aux2hcf = Ridge(alpha=1.0)
    reg_aux2hcf.fit(X_aux_hcf, Z_hcf)
    Z_hcf_from_aux = reg_aux2hcf.predict(X_aux_hcf)  # pseudo HCF latent

    for rev_dim in rev_dims:
        pca_review, reg_aux2rev, Z_rev_from_aux = review_models[rev_dim]

        # CCA cannot have more components than either latent space; cap at 5.
        cca_n = min(hcf_dim, rev_dim, 5)
        cca = CCA(n_components=cca_n, max_iter=500)
        U, V = cca.fit_transform(Z_rev_from_aux, Z_hcf_from_aux)  # (n_hcf, cca_n)

        # Map the review-side canonical variates to the HCF latent.
        reg_u2hcf = Ridge()
        reg_u2hcf.fit(U, Z_hcf_from_aux)

        U_full = cca.transform(Z_rev_from_aux)
        Z_hcf_pred_latent = reg_u2hcf.predict(U_full)
        hcf_reconstructed = pca_hcf.inverse_transform(Z_hcf_pred_latent)

        rmse = np.sqrt(mean_squared_error(hcf_scores.values, hcf_reconstructed))
        expl_frac = explained_fraction(hcf_scores.values, hcf_reconstructed)
        avg_expl = expl_frac.mean()

        results.append({
            'hcf_dim': hcf_dim,
            'rev_dim': rev_dim,
            'cca_n': cca_n,
            'rmse': rmse,
            'avg_explained_frac': avg_expl,
            'per_score_explained_frac': expl_frac,
            'pca_hcf': pca_hcf,
            'pca_review': pca_review,
            'reg_aux2hcf': reg_aux2hcf,
            'reg_aux2rev': reg_aux2rev,
            'cca': cca,
            'reg_u2hcf': reg_u2hcf,
        })
        # Keep the configuration with the highest mean explained fraction.
        if 'best_avg' not in best or avg_expl > best['best_avg']:
            best.update({
                'best_avg': avg_expl,
                'hcf_dim': hcf_dim,
                'rev_dim': rev_dim,
                'cca_n': cca_n,
                'pipeline': results[-1]
            })

# Rebind the module-level names to the winning configuration so the helper
# functions below use a consistent set of fitted components. reg_aux2hcf is
# restored as well; otherwise it would stay bound to the last hcf_dim tried.
pipe = best['pipeline']
pca_hcf = pipe['pca_hcf']
pca_review = pipe['pca_review']
reg_aux2hcf = pipe['reg_aux2hcf']
reg_aux2rev = pipe['reg_aux2rev']
cca = pipe['cca']
reg_u2hcf = pipe['reg_u2hcf']

def predict_hcf_from_review_latent(review_row):
    """Predict the HCF scores for a review through the aux -> CCA path.

    Only the review's Sector and Industry influence the result: they are one-hot
    encoded with `ohe`, mapped to a review latent by `reg_aux2rev`, projected
    with `cca`, mapped to the HCF latent by `reg_u2hcf` and decoded with
    `pca_hcf`. The review text and ratings are NOT used; the earlier version
    computed text/numeric features and then discarded them.

    Parameters
    ----------
    review_row : mapping with ``.get`` (e.g. a pandas Series or dict)

    Returns
    -------
    pandas.Series indexed by `hcf_score_cols`.
    """
    def _category(field):
        # Training filled missing categories with 'missing'; do the same here so
        # a NaN in the row lands in the same one-hot column.
        value = review_row.get(field, 'missing')
        return 'missing' if pd.isna(value) else value

    aux_vec = ohe.transform(pd.DataFrame([{
        'Sector': _category('Sector'),
        'Industry': _category('Industry')
    }]))
    z_rev_proxy = reg_aux2rev.predict(aux_vec)
    U_proj = cca.transform(z_rev_proxy)
    z_hcf_latent = reg_u2hcf.predict(U_proj)
    hcf_pred = pca_hcf.inverse_transform(z_hcf_latent)
    return pd.Series(hcf_pred.flatten(), index=hcf_score_cols)


# First review row, kept as a sample input (not used further in this cell).
example = text17_df.iloc[0]



def compute_explained_fraction_of_pipeline():
    """Explained fraction per HCF score for the currently bound components.

    Runs the HCF-side auxiliary matrix through aux -> review latent -> CCA ->
    HCF latent -> score space and compares with the observed scores.
    """
    rev_latent = reg_aux2rev.predict(X_aux_hcf)
    canonical = cca.transform(rev_latent)
    hcf_latent = reg_u2hcf.predict(canonical)
    hcf_recon = pca_hcf.inverse_transform(hcf_latent)
    return explained_fraction(hcf_scores.values, hcf_recon)

print("Final explained fraction per score:", compute_explained_fraction_of_pipeline())

In [None]:
# NMF is used below but was never imported in the notebook (NameError on a
# fresh kernel); import it here, next to its first use.
from sklearn.decomposition import NMF

# --- Enriched auxiliary features: categories + review length + topics ---------
hcf_score_cols = [
    'Direct Management - hcf score',
    'Emotional Connection - hcf score',
    'Engagement - hcf score',
    'Extrinsic - hcf score',
    'Innovation - hcf score',
    'Organizational Alignment - hcf score',
    'Organizational Effectiveness - hcf score'
]
text_fields = ['Advice to Management', 'PROs', 'CONs', 'Summary', 'Description']
rating_cols = [
    'Rating: Overall', 'Rating: Work/Life Balance', 'Rating: Culture & Values',
    'Rating: Career Opportunities', 'Rating: Comp & Benefits', 'Rating: Senior Management',
    'Rating: Diversity & Inclusion'
]
numeric_cols = rating_cols + ['Fprob_PROs', 'Fprob_CONs', 'Fprob']
aux_cat_fields = ['Sector', 'Industry']

has_survey = 'SurveyYear' in text17_df.columns

# Companies with all seven scores; scores and categories share one filtered frame.
hcf_valid = hcf_df.dropna(subset=hcf_score_cols).reset_index(drop=True)
hcf_scores = hcf_valid[hcf_score_cols].copy()  # (n_hcf, 7)
hcf_aux_cat = hcf_valid[aux_cat_fields].fillna('missing').copy()

# Review-side base features (text SVD components + numeric columns).
text17_df['combined_text'] = text17_df[text_fields].fillna('').agg(' '.join, axis=1)
for c in numeric_cols:
    if c not in text17_df.columns:
        text17_df[c] = 0.0  # absent numeric columns are created as zeros
review_numeric = text17_df[numeric_cols].fillna(0)

tfidf_for_review = TfidfVectorizer(max_features=5000, ngram_range=(1,2), stop_words='english')
svd = TruncatedSVD(n_components=50, random_state=0)
text_mat = tfidf_for_review.fit_transform(text17_df['combined_text'])
text_latent = svd.fit_transform(text_mat)  # (n_reviews, 50)
X_review_base = np.hstack([text_latent, review_numeric.to_numpy()])

# Topic model: unigram TF-IDF -> NMF; each review's weights are scaled to sum to ~1.
tfidf_topic = TfidfVectorizer(max_features=5000, stop_words='english')
W = tfidf_topic.fit_transform(text17_df['combined_text'])
n_topics = 10
nmf = NMF(n_components=n_topics, random_state=0, init='nndsvda', max_iter=500)
topic_weights = nmf.fit_transform(W)  # (n_reviews, n_topics)
topic_weights = topic_weights / (topic_weights.sum(axis=1, keepdims=True) + 1e-9)  # normalize

# Review length in whitespace-separated tokens, log-compressed.
text17_df['review_len'] = text17_df['combined_text'].str.split().apply(len)
text17_df['log_review_len'] = np.log1p(text17_df['review_len'])

# Review-side auxiliary table.
aux_reviews_df = pd.DataFrame({
    'Sector': text17_df['Sector'].fillna('missing'),
    'Industry': text17_df['Industry'].fillna('missing'),
    'log_review_len': text17_df['log_review_len'].fillna(0),
})
if has_survey:
    # astype(str) already turns NaN into the string 'nan', so the former
    # `.fillna('missing')` after it never had any effect and is dropped.
    # predict_hcf_for_review_enriched applies the same str() conversion.
    aux_reviews_df['SurveyYear'] = text17_df['SurveyYear'].astype(str)
else:
    aux_reviews_df['SurveyYear'] = 'missing'

# Mean topic mix per (Sector, Industry) group of reviews, used as the HCF-side
# topic proxy.
# NOTE(review): the grouping uses the raw review columns, so reviews with a
# missing Sector/Industry are left out of the groups (groupby skips NaN keys by
# default) and HCF rows keyed 'missing' receive the global mean instead.
rev_topics_df = pd.concat([
    text17_df[['Sector', 'Industry']].reset_index(drop=True),
    pd.DataFrame(topic_weights, columns=[f"topic_{i}" for i in range(n_topics)])
], axis=1)
group_mean_topics = rev_topics_df.groupby(['Sector', 'Industry']).mean().reset_index()

hcf_topic_merge = hcf_aux_cat.merge(
    group_mean_topics,
    on=['Sector', 'Industry'],
    how='left'
)

# Companies whose (Sector, Industry) has no reviews fall back to the global mean.
global_topic_mean = topic_weights.mean(axis=0)
for i in range(n_topics):
    col = f"topic_{i}"
    if col not in hcf_topic_merge:
        hcf_topic_merge[col] = global_topic_mean[i]
    hcf_topic_merge[col] = hcf_topic_merge[col].fillna(global_topic_mean[i])

hcf_topic_weights = hcf_topic_merge[[f"topic_{i}" for i in range(n_topics)]]

# HCF-side auxiliary table: companies have no review length, so every row gets
# the mean log length of the reviews.
mean_log_len = text17_df['log_review_len'].mean()
hcf_aux_df = pd.DataFrame({
    'Sector': hcf_valid['Sector'].fillna('missing'),
    'Industry': hcf_valid['Industry'].fillna('missing'),
    'log_review_len': np.full(len(hcf_valid), mean_log_len),
})
# Both branches of the former `if has_survey` assigned the same constant.
hcf_aux_df['SurveyYear'] = 'missing'

# Category encoder fitted on the union of both tables.
cat_ohe = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
cat_ohe.fit(pd.concat([aux_reviews_df[['Sector','Industry']], hcf_aux_df[['Sector','Industry']]], axis=0))

# Review-side enriched matrix: [categories | log length | topics | (survey year)].
cat_rev = cat_ohe.transform(aux_reviews_df[['Sector','Industry']])
parts_rev = [cat_rev, aux_reviews_df[['log_review_len']].to_numpy(), topic_weights]
if has_survey:
    year_ohe = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
    year_enc_rev = year_ohe.fit_transform(aux_reviews_df[['SurveyYear']])
    parts_rev.append(year_enc_rev)
X_aux_reviews_enriched = np.hstack(parts_rev)

# HCF-side enriched matrix with the same column layout.
cat_hcf = cat_ohe.transform(hcf_aux_df[['Sector','Industry']])
parts_hcf = [cat_hcf, hcf_aux_df[['log_review_len']].to_numpy(), hcf_topic_weights.to_numpy()]
if has_survey:
    # year_ohe was fitted on review years only, so the 'missing' value is
    # treated under handle_unknown='ignore'.
    year_enc_hcf = year_ohe.transform(pd.DataFrame({'SurveyYear': ['missing'] * len(hcf_valid)}))
    parts_hcf.append(year_enc_hcf)
X_aux_hcf_enriched = np.hstack(parts_hcf)


def explained_fraction(original, recon):
    """Per-column ``1 - MSE / sample variance`` of a reconstruction.

    `original` and `recon` hold one column per score; the variance uses ddof=1.
    The result is negative wherever the error exceeds the column's variance.
    """
    column_rmse = np.sqrt(mean_squared_error(original, recon, multioutput='raw_values'))
    column_var = np.var(original, axis=0, ddof=1)
    unexplained = (column_rmse**2) / column_var
    return 1 - unexplained


# CCA is used below but was never imported in the notebook (NameError on a
# fresh kernel); import it here, next to its first use.
from sklearn.cross_decomposition import CCA

# --- Latent spaces -------------------------------------------------------------
# Seven components on seven scores: the HCF PCA keeps all dimensions.
pca_hcf = PCA(n_components=7, random_state=0)
Z_hcf = pca_hcf.fit_transform(hcf_scores)

pca_review = PCA(n_components=5, random_state=0)
Z_review = pca_review.fit_transform(X_review_base)

# --- Enriched aux -> latent regressions ----------------------------------------
reg_aux2hcf = Ridge(alpha=1.0)
reg_aux2hcf.fit(X_aux_hcf_enriched, Z_hcf)

reg_aux2rev = Ridge(alpha=1.0)
reg_aux2rev.fit(X_aux_reviews_enriched, Z_review)

# Pseudo-pairs: both latents predicted for the same HCF-side auxiliary rows.
Z_hcf_from_aux = reg_aux2hcf.predict(X_aux_hcf_enriched)
Z_rev_from_aux = reg_aux2rev.predict(X_aux_hcf_enriched)

# --- Bridge 1: direct linear map from review latent to HCF latent --------------
reg_rev2hcf = Ridge()
reg_rev2hcf.fit(Z_rev_from_aux, Z_hcf_from_aux)
Z_hcf_pred_lin = reg_rev2hcf.predict(Z_rev_from_aux)
hcf_recon_lin = pca_hcf.inverse_transform(Z_hcf_pred_lin)
expl_frac_lin = explained_fraction(hcf_scores.values, hcf_recon_lin)
rmse_lin = np.sqrt(mean_squared_error(hcf_scores.values, hcf_recon_lin))

# --- Bridge 2: CCA variates followed by a Ridge map to the HCF latent ----------
# The component count is capped by both latent sizes and by 5.
cca_n = min(Z_rev_from_aux.shape[1], Z_hcf_from_aux.shape[1], 5)
cca = CCA(n_components=cca_n, max_iter=2000)
U, V = cca.fit_transform(Z_rev_from_aux, Z_hcf_from_aux)
reg_u2hcf = Ridge()
reg_u2hcf.fit(U, Z_hcf_from_aux)
U_full = cca.transform(Z_rev_from_aux)
Z_hcf_pred_cca = reg_u2hcf.predict(U_full)
hcf_recon_cca = pca_hcf.inverse_transform(Z_hcf_pred_cca)
# Both bridges are scored on the HCF rows they were fitted on.
expl_frac_cca = explained_fraction(hcf_scores.values, hcf_recon_cca)
rmse_cca = np.sqrt(mean_squared_error(hcf_scores.values, hcf_recon_cca))




def predict_hcf_for_review_enriched(review_row):
    """Predict HCF scores for one review from its enriched auxiliary features.

    Builds the same feature layout as the training matrix
    ([categories | log length | topic mix | (survey year)]), maps it to a review
    latent with `reg_aux2rev`, and decodes it through both bridges.

    Parameters
    ----------
    review_row : mapping with ``.get`` (e.g. a pandas Series or dict)
        Absent or NaN Sector/Industry are treated as 'missing'; absent or NaN
        text fields as empty text.

    Returns
    -------
    dict with keys 'hcf_pred_linear' and 'hcf_pred_cca', each a pandas.Series
    indexed by `hcf_score_cols`.
    """
    def _category(field):
        # Training used fillna('missing') for the categorical columns.
        value = review_row.get(field, 'missing')
        return 'missing' if pd.isna(value) else value

    sector = _category('Sector')
    industry = _category('Industry')
    # Training encoded SurveyYear via astype(str); keep the same str() conversion.
    survey = str(review_row.get('SurveyYear', 'missing')) if has_survey else 'missing'

    # Rebuild the text exactly as `combined_text` was built for training
    # (missing fields -> ''), so NaN does not become the literal token "nan".
    text_parts = []
    for f in text_fields:
        value = review_row.get(f, "")
        text_parts.append("" if pd.isna(value) else str(value))
    review_text = " ".join(text_parts)

    # Word count taken from the rebuilt text, so the feature no longer depends
    # on the row carrying a precomputed 'combined_text' entry.
    log_len = np.log1p(len(review_text.split()))

    topic_vec = nmf.transform(tfidf_topic.transform([review_text]))  # (1, n_topics)
    topic_vec = topic_vec / (topic_vec.sum() + 1e-9)

    cat_part = cat_ohe.transform(pd.DataFrame([{'Sector': sector, 'Industry': industry}]))
    parts = [cat_part, np.array([[log_len]]), topic_vec]
    if has_survey:
        year_part = year_ohe.transform(pd.DataFrame([{'SurveyYear': survey}]))
        parts.append(year_part)
    aux_review = np.hstack(parts)  # (1, dim)

    z_rev_proxy = reg_aux2rev.predict(aux_review)  # (1, latent_review)

    # Bridge 1: direct linear map.
    z_hcf_lin = reg_rev2hcf.predict(z_rev_proxy)
    hcf_lin = pca_hcf.inverse_transform(z_hcf_lin).flatten()

    # Bridge 2: CCA variates -> Ridge.
    U_proj = cca.transform(z_rev_proxy)
    z_hcf_cca = reg_u2hcf.predict(U_proj)
    hcf_cca = pca_hcf.inverse_transform(z_hcf_cca).flatten()

    return {
        'hcf_pred_linear': pd.Series(hcf_lin, index=hcf_score_cols),
        'hcf_pred_cca': pd.Series(hcf_cca, index=hcf_score_cols),
    }


# Run the enriched predictor on the first review and show both bridge outputs.
example = text17_df.iloc[0]
preds = predict_hcf_for_review_enriched(example)
print("Example linear-aligned HCF prediction:\n", preds['hcf_pred_linear'])
print("Example CCA-aligned HCF prediction:\n", preds['hcf_pred_cca'])


In [None]:

# torch / nn / F are used throughout this cell but were never imported in the
# notebook (NameError on a fresh kernel); import them here.
import torch
import torch.nn as nn
import torch.nn.functional as F

# --- Contrastive alignment: configuration and text embedding backend ----------
hcf_score_cols = [
    'Direct Management - hcf score',
    'Emotional Connection - hcf score',
    'Engagement - hcf score',
    'Extrinsic - hcf score',
    'Innovation - hcf score',
    'Organizational Alignment - hcf score',
    'Organizational Effectiveness - hcf score'
]
text_fields = ['Advice to Management', 'PROs', 'CONs', 'Summary', 'Description']
aux_cat_fields = ['Sector', 'Industry']

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Preferred backend: sentence-transformer embeddings. Only a missing package
# triggers the fallback; any other failure while loading the model propagates.
try:
    from sentence_transformers import SentenceTransformer
    st_model = SentenceTransformer("all-MiniLM-L6-v2")  # small and fast
    def get_text_embedding(texts):
        """Embed a list of strings with the sentence-transformer model."""
        return st_model.encode(texts, convert_to_numpy=True, show_progress_bar=False)
    print("Using sentence-transformer embeddings.")
except ImportError:
    print("sentence_transformers not installed; falling back to TF-IDF + PCA.")
    from sklearn.feature_extraction.text import TfidfVectorizer
    tfidf = TfidfVectorizer(max_features=5000, stop_words='english')

    text17_df['combined_text'] = text17_df[text_fields].fillna('').agg(' '.join, axis=1)
    tfidf_mat = tfidf.fit_transform(text17_df['combined_text'])
    # The TF-IDF matrix is densified before PCA (n_reviews x up to 5000 floats).
    pca_text = PCA(n_components=128, random_state=0)
    text_reduced = pca_text.fit_transform(tfidf_mat.toarray())
    def get_text_embedding(texts):
        """Embed a list of strings with the fitted TF-IDF + PCA fallback."""
        return pca_text.transform(tfidf.transform(texts).toarray())

# HCF side: companies with all seven scores, reduced to five PCA components.
hcf_valid = hcf_df.dropna(subset=hcf_score_cols).reset_index(drop=True)
hcf_scores = hcf_valid[hcf_score_cols].to_numpy()

pca_hcf = PCA(n_components=5, random_state=1)
Z_hcf = pca_hcf.fit_transform(hcf_scores)

# "Sector__Industry" keys on both tables, plus the concatenated review text.
hcf_valid['aux_key'] = hcf_valid[aux_cat_fields].fillna('missing').agg("__".join, axis=1)
text17_df['combined_text'] = text17_df[text_fields].fillna('').agg(' '.join, axis=1)
text17_df['aux_key'] = text17_df[aux_cat_fields].fillna('missing').agg("__".join, axis=1)

# `random` was used below without ever being imported (NameError on a fresh
# kernel). A dedicated seeded generator also makes the sampled pairs reproducible.
import random

pair_rng = random.Random(0)

# --- Pseudo-pairs: link each review to up to k HCF rows sharing sector/industry
# Normalise the HCF-side keys once instead of once per review, and score all HCF
# rows with vectorised comparisons rather than a row-wise apply for every review.
hcf_sector_keys = hcf_valid['Sector'].astype(str).str.strip()
hcf_industry_keys = hcf_valid['Industry'].astype(str).str.strip()

pairs = []
k_per_review = 3
for rev_idx, rev in text17_df.iterrows():
    rev_sector = str(rev.get('Sector', 'missing')).strip()
    rev_ind = str(rev.get('Industry', 'missing')).strip()

    # Match score per HCF row: +1 for equal sector, +1 for equal industry (0, 1 or 2).
    scores = (
        (hcf_sector_keys == rev_sector).astype(int)
        + (hcf_industry_keys == rev_ind).astype(int)
    )
    best_idxs = scores[scores > 0].sort_values(ascending=False).index.tolist()

    if len(best_idxs) == 0:
        # No sector or industry match: draw HCF rows at random, seeded per review.
        sampled_idxs = hcf_valid.sample(n=k_per_review, replace=True, random_state=rev_idx).index.tolist()
    elif len(best_idxs) > k_per_review:
        # NOTE(review): sampling is uniform over every row with score > 0, so a
        # sector+industry match is not preferred over a sector-only match.
        sampled_idxs = pair_rng.sample(best_idxs, k_per_review)
    else:
        sampled_idxs = best_idxs

    for hcf_idx in sampled_idxs:
        pairs.append({
            'review_idx': rev_idx,
            'hcf_idx': hcf_idx,
            'rev_sector': rev_sector,
            'rev_industry': rev_ind,
            'hcf_sector': hcf_sector_keys.loc[hcf_idx],
            'hcf_industry': hcf_industry_keys.loc[hcf_idx],
        })

pairs_df = pd.DataFrame(pairs)

if pairs_df.empty:
    raise RuntimeError("No pseudo-pairs could be constructed; verify Sector/Industry columns in both datasets.")



# Text of the review side of every pseudo-pair (looked up by index LABEL), embedded
# with the active backend.
# NOTE(review): later code indexes arrays with the same review_idx values by
# POSITION; the two agree only while text17_df keeps a default 0..n-1 index.
review_texts = text17_df.loc[pairs_df['review_idx'], 'combined_text'].tolist()
E_rev = get_text_embedding(review_texts)  

# HCF latent vector of the HCF side of every pseudo-pair.
E_hcf = Z_hcf[pairs_df['hcf_idx'].to_numpy()]  

# Standardise both sides; the scalers are fitted on the pair rows.
scaler_rev = StandardScaler().fit(E_rev)
scaler_hcf = StandardScaler().fit(E_hcf)
E_rev_s = scaler_rev.transform(E_rev)
E_hcf_s = scaler_hcf.transform(E_hcf)


# float32 tensors on the selected device, one row per pseudo-pair.
rev_tensor = torch.tensor(E_rev_s, dtype=torch.float32, device=device)
hcf_tensor = torch.tensor(E_hcf_s, dtype=torch.float32, device=device)


class ProjectionHead(nn.Module):
    """Two-layer MLP (Linear -> ReLU -> Linear) with L2-normalised output.

    Parameters
    ----------
    input_dim : int
        Size of the incoming feature vector.
    hidden_dim : int
        Width of the hidden layer.
    output_dim : int
        Size of the projected vector.
    """

    def __init__(self, input_dim, hidden_dim=64, output_dim=32):
        super().__init__()
        layers = [
            nn.Linear(input_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, output_dim),
        ]
        self.net = nn.Sequential(*layers)

    def forward(self, x):
        """Project `x` and scale every row to unit L2 norm along the last axis."""
        projected = self.net(x)
        return F.normalize(projected, dim=-1)

# One projection head per modality, sized to each side's feature width and moved
# to the selected device. Both use the class defaults for hidden/output size.
rev_proj = ProjectionHead(rev_tensor.shape[1]).to(device)
hcf_proj = ProjectionHead(hcf_tensor.shape[1]).to(device)


def contrastive_loss(z1, z2, temperature=0.1):
    """Symmetric cross-entropy contrastive loss over a batch of paired rows.

    Row i of `z1` is treated as the positive match of row i of `z2`; every other
    row in the batch acts as a negative. The loss averages the z1->z2 and
    z2->z1 cross-entropies.

    Parameters
    ----------
    z1, z2 : torch.Tensor of shape (B, D)
        Paired embeddings.
    temperature : float
        Divisor applied to the similarity logits.

    Returns
    -------
    Scalar torch.Tensor.
    """
    batch_size = z1.shape[0]
    logits = torch.matmul(z1, z2.T) / temperature  # (B,B)
    # Build the targets on the inputs' own device instead of the notebook-level
    # `device` global, so the function works for tensors on any device.
    labels = torch.arange(batch_size, device=z1.device)
    loss1 = F.cross_entropy(logits, labels)
    loss2 = F.cross_entropy(logits.T, labels)
    return 0.5 * (loss1 + loss2)


# --- Train both projection heads jointly on the pseudo-pairs ------------------
# Every epoch is one full-batch step over all pairs, so the logits matrix inside
# contrastive_loss is (n_pairs x n_pairs).
# NOTE(review): the same review (and the same HCF row) can occur in several
# pairs, so some in-batch "negatives" are duplicates of the positive — confirm
# this is acceptable for the intended use.
optimizer = torch.optim.Adam(list(rev_proj.parameters()) + list(hcf_proj.parameters()), lr=1e-3)
n_epochs = 300
for epoch in range(n_epochs):
    rev_z = rev_proj(rev_tensor)
    hcf_z = hcf_proj(hcf_tensor)
    loss = contrastive_loss(rev_z, hcf_z)
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    # Progress line every 60 epochs.
    if (epoch + 1) % 60 == 0:
        print(f"[Contrastive] Epoch {epoch+1}/{n_epochs} loss: {loss.item():.4f}")

# Project EVERY review into the shared space (one row per row of text17_df).
all_review_texts = text17_df['combined_text'].tolist()
E_rev_all = get_text_embedding(all_review_texts)
E_rev_all_s = scaler_rev.transform(E_rev_all)
with torch.no_grad():
    Z_rev_shared = rev_proj(torch.tensor(E_rev_all_s, device=device, dtype=torch.float32)).cpu().numpy()  # normalized shared space


# Project every HCF row into the shared space.
# Z_hcf_shared is not referenced again in the visible cells.
Z_hcf_s_all = scaler_hcf.transform(pca_hcf.transform(hcf_scores))
with torch.no_grad():
    Z_hcf_shared = hcf_proj(torch.tensor(Z_hcf_s_all, device=device, dtype=torch.float32)).cpu().numpy()


# Pair-level design matrix (shared-space review vectors, indexed by position)
# and targets (raw HCF scores of the paired company).
shared_rev_pairs = Z_rev_shared[pairs_df['review_idx'].to_numpy()]  
true_hcf_scores_pairs = hcf_scores[pairs_df['hcf_idx'].to_numpy()] 


# Linear calibrator from the shared space to the seven HCF scores.
calibrator = Ridge(alpha=1.0)
calibrator.fit(shared_rev_pairs, true_hcf_scores_pairs)


def predict_hcf_for_review_contrastive(review_row):
    """Predict HCF scores for one review via the contrastive shared space.

    The review text is embedded, standardised with `scaler_rev`, projected with
    `rev_proj` and mapped to scores by the Ridge `calibrator`.

    Parameters
    ----------
    review_row : mapping with ``.get`` (e.g. a pandas Series or dict)
        Absent or NaN text fields are treated as empty text.

    Returns
    -------
    pandas.Series indexed by `hcf_score_cols`.
    """
    # Match the training text construction (fillna('') before joining) so a NaN
    # field does not inject the literal word "nan" into the embedded text.
    text_parts = []
    for f in text_fields:
        value = review_row.get(f, "")
        text_parts.append("" if pd.isna(value) else str(value))
    text = " ".join(text_parts)

    emb = get_text_embedding([text])
    emb_s = scaler_rev.transform(emb)
    with torch.no_grad():
        shared = rev_proj(torch.tensor(emb_s, device=device, dtype=torch.float32))
        shared = shared.cpu().numpy()

    raw_pred = calibrator.predict(shared)
    return pd.Series(raw_pred.flatten(), index=hcf_score_cols)


def calibrate_with_overlap(df_overlap):
    """Fit a Ridge map from contrastive predictions to observed HCF scores.

    Parameters
    ----------
    df_overlap : pandas.DataFrame
        Rows carrying both the review fields and the `hcf_score_cols` columns.

    Returns
    -------
    The fitted Ridge model (inputs: predicted scores, targets: observed scores).
    """
    predicted_rows = [
        predict_hcf_for_review_contrastive(row).to_numpy()
        for _, row in df_overlap.iterrows()
    ]
    observed_rows = [
        row[hcf_score_cols].to_numpy()
        for _, row in df_overlap.iterrows()
    ]
    X_base = np.vstack(predicted_rows)
    y_true = np.vstack(observed_rows)
    return Ridge(alpha=1.0).fit(X_base, y_true)
    
# RMSE of the calibrator on the same pseudo-pairs it was fitted on.
preds_on_pairs = calibrator.predict(shared_rev_pairs)
rmse_pseudo = np.sqrt(mean_squared_error(true_hcf_scores_pairs, preds_on_pairs))
print("Contrastive pipeline pseudo-pair RMSE:", rmse_pseudo)

# Sample prediction for the first review.
example = text17_df.iloc[0]
print("Contrastive HCF prediction (review):\n", predict_hcf_for_review_contrastive(example))


In [None]:
# `train_test_split` was called here without ever being imported (NameError on a
# fresh kernel). A group-aware splitter is used instead: one review contributes
# up to `k_per_review` pairs with identical features, so a plain row-wise split
# would place the same review in both the training and the validation set.
from sklearn.model_selection import GroupShuffleSplit

shared_rev_pairs = Z_rev_shared[pairs_df['review_idx'].to_numpy()]
true_hcf_scores_pairs = hcf_scores[pairs_df['hcf_idx'].to_numpy()]

# Hold out ~15% of the REVIEWS (groups); all pairs of a review stay together.
pair_groups = pairs_df['review_idx'].to_numpy()
group_splitter = GroupShuffleSplit(n_splits=1, test_size=0.15, random_state=0)
train_rows, val_rows = next(
    group_splitter.split(shared_rev_pairs, true_hcf_scores_pairs, groups=pair_groups)
)

X_train, X_val = shared_rev_pairs[train_rows], shared_rev_pairs[val_rows]
y_train, y_val = true_hcf_scores_pairs[train_rows], true_hcf_scores_pairs[val_rows]

def explained_fraction(original, recon):
    """Compute ``1 - MSE / variance`` separately for each output column.

    The variance is the ddof=1 sample variance of `original`. A column scores
    below zero when its squared error is larger than its variance.
    """
    score_variance = np.var(original, axis=0, ddof=1)
    score_rmse = np.sqrt(
        mean_squared_error(original, recon, multioutput='raw_values')
    )
    score_mse = score_rmse * score_rmse
    return 1 - score_mse / score_variance  # can be negative


# MLPRegressor was used here without ever being imported (NameError on a fresh
# kernel); import it next to its only use.
from sklearn.neural_network import MLPRegressor

# Linear baseline: shared-space vector -> seven HCF scores.
ridge_cal = Ridge(alpha=1.0).fit(X_train, y_train)

# Non-linear alternative: two hidden layers (64, 32) with ReLU. early_stopping
# makes the regressor hold out part of X_train internally to decide when to stop.
mlp_cal = MLPRegressor(
    hidden_layer_sizes=(64, 32),
    activation='relu',
    early_stopping=True,
    random_state=0,
    max_iter=500,
    learning_rate_init=1e-3,
    tol=1e-4,
).fit(X_train, y_train)


def eval_calibrator(model, name):
    """Score `model` on the validation split, print a short report, return predictions.

    Prints the overall RMSE, the per-score explained fraction and its mean, each
    prefixed with `name`. Reads the module-level `X_val` / `y_val`.
    """
    pred = model.predict(X_val)
    expl = explained_fraction(y_val, pred)
    rmse = np.sqrt(mean_squared_error(y_val, pred))
    report_lines = (
        f"{name} RMSE (val): {rmse:.4f}",
        f"{name} explained fraction per score (val): {expl}",
        f"{name} avg explained frac: {expl.mean():.4f}",
    )
    for line in report_lines:
        print(line)
    return pred


# Evaluate both calibrators on the validation split.
pred_ridge = eval_calibrator(ridge_cal, "Ridge")
pred_mlp = eval_calibrator(mlp_cal, "MLP")

# Keep the model with the higher mean explained fraction; ties go to the MLP.
if explained_fraction(y_val, pred_mlp).mean() >= explained_fraction(y_val, pred_ridge).mean():
    best_calibrator, best_name = mlp_cal, "MLP"
else:
    best_calibrator, best_name = ridge_cal, "Ridge"
print(f"Selected best calibrator: {best_name}")


# Optional second-stage correction; stays None until fit_residual_overlap runs.
residual_model = None
def fit_residual_overlap(df_overlap):
    """Fit a Ridge correction on rows that carry both review text and HCF scores.

    For every row the review is embedded, projected with `rev_proj` and scored
    by `best_calibrator`; a Ridge model is then fitted from those base
    predictions to the observed scores and stored in the module-level
    `residual_model` (used by predict_hcf_with_nonlinear).

    Parameters
    ----------
    df_overlap : pandas.DataFrame
        Rows with the `text_fields` and `hcf_score_cols` columns.

    Returns
    -------
    The fitted Ridge model.

    Raises
    ------
    ValueError
        If `df_overlap` has no rows.
    """
    global residual_model
    X_overlap = []
    y_overlap = []
    for _, row in df_overlap.iterrows():
        # Same text construction as training: missing fields count as empty.
        text_parts = []
        for f in text_fields:
            value = row.get(f, "")
            text_parts.append("" if pd.isna(value) else str(value))
        text = " ".join(text_parts)

        emb_s = scaler_rev.transform(get_text_embedding([text])[0].reshape(1, -1))
        # Inference only: without no_grad the projection output requires grad
        # and the `.numpy()` conversion below raises a RuntimeError.
        with torch.no_grad():
            shared = rev_proj(torch.tensor(emb_s, device=device, dtype=torch.float32))
        shared_np = shared.cpu().numpy().reshape(1, -1)

        base_pred = best_calibrator.predict(shared_np)  # (1,7)
        X_overlap.append(base_pred.flatten())
        y_overlap.append(row[hcf_score_cols].to_numpy())

    if not X_overlap:
        raise ValueError("df_overlap has no rows; cannot fit the residual correction.")

    X_overlap = np.vstack(X_overlap)
    y_overlap = np.vstack(y_overlap)
    residual_model = Ridge(alpha=1.0).fit(X_overlap, y_overlap)
    print("Fitted residual correction on real overlap.")
    return residual_model  # final prediction: residual_model(best_calibrator(...))


def predict_hcf_with_nonlinear(review_row):
    """Predict HCF scores for one review with the selected calibrator.

    The review text is embedded, standardised, projected with `rev_proj` and
    scored by `best_calibrator`. When `residual_model` has been fitted, its
    correction is applied to the base prediction.

    Parameters
    ----------
    review_row : mapping with ``.get`` (e.g. a pandas Series or dict)
        Absent or NaN text fields are treated as empty text.

    Returns
    -------
    pandas.Series indexed by `hcf_score_cols`.
    """
    # Match the training text construction (fillna('') before joining) so a NaN
    # field does not inject the literal word "nan" into the embedded text.
    text_parts = []
    for f in text_fields:
        value = review_row.get(f, "")
        text_parts.append("" if pd.isna(value) else str(value))
    text = " ".join(text_parts)

    emb = get_text_embedding([text])
    emb_s = scaler_rev.transform(emb)
    with torch.no_grad():
        shared = rev_proj(torch.tensor(emb_s, device=device, dtype=torch.float32)).cpu().numpy()
    base_pred = best_calibrator.predict(shared.reshape(1, -1)).flatten()

    if residual_model is None:
        return pd.Series(base_pred, index=hcf_score_cols)
    corrected = residual_model.predict(base_pred.reshape(1, -1)).flatten()
    return pd.Series(corrected, index=hcf_score_cols)

# Sample prediction for the first review using the selected calibrator
# (plus the residual correction if one has been fitted).
example = text17_df.iloc[0]
print(f"Final predicted HCF scores (nonlinear calibrated) for example:\n{predict_hcf_with_nonlinear(example)}")
