In [1]:
import numpy as np
import pandas as pd
from scipy.stats import norm
from scipy.stats import pearsonr, spearmanr

## Compare the correlations of the real and pseudo pairs

In [4]:
# Load the pseudo-pair linguistic analysis tables, one CSV per corpus split
# (whole corpus, three conditions, three time thirds). The cells below draw
# random row samples from these frames and read their "<feature>_mordor" /
# "<feature>_gondor" columns.
pseudo_whole_corpus = pd.read_csv("../results/pseudo_pairs_whole_corpus_linguistic_analysis.csv")
pseudo_baseline_df = pd.read_csv('../results/pseudo_pairs_baseline_linguistic_analysis.csv')
pseudo_unimodal_df = pd.read_csv('../results/pseudo_pairs_unimodal_linguistic_analysis.csv')
pseudo_unfamiliar_df = pd.read_csv('../results/pseudo_pairs_unfamiliar_linguistic_analysis.csv')
pseudo_first_third_df  = pd.read_csv('../results/pseudo_pairs_first_third_linguistic_analysis.csv')
pseudo_second_third_df = pd.read_csv('../results/pseudo_pairs_second_third_linguistic_analysis.csv')
pseudo_third_third_df  = pd.read_csv('../results/pseudo_pairs_third_third_linguistic_analysis.csv')

In [5]:
# Load the correlation tables computed on the real pairs. compare_correlations()
# later reads their 'Feature' and 'Correlation_pearson' columns.
# The trailing notes give a count per condition (originally written "40db" etc.);
# presumably the number of real pairs — the same numbers are used as sample
# sizes for the pseudo data below. TODO confirm.
whole_corpus_correlation = pd.read_csv("../results/correlations_whole_corpus.csv")
baseline_correlation = pd.read_csv("../results/correlations_baseline.csv") # 40 items
unimodal_correlation = pd.read_csv("../results/correlations_unimodal.csv") # 41 items
unfamiliar_correlation = pd.read_csv("../results/correlations_unfamiliar.csv") # 43 items

In [38]:
# Real-pair correlation tables for the three time thirds; used as the
# `original_df` argument of compare_correlations() further down.
first_third_correlation = pd.read_csv("../results/correlations_first_third.csv")
second_third_correlation = pd.read_csv("../results/correlations_second_third.csv")
third_third_correlation = pd.read_csv("../results/correlations_third_third.csv")

## Get random samples from pseudo data

In [6]:
def random_sampling(df, num_lines, n_samples=1000, random_state=None):
    """Draw repeated random row samples from ``df``.

    Each sample is drawn independently with ``DataFrame.sample`` (without
    replacement inside one sample) and keeps all columns of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to draw rows from.
    num_lines : int
        Number of rows per sample. ``DataFrame.sample`` raises ``ValueError``
        when this exceeds ``len(df)``.
    n_samples : int, default 1000
        How many samples to draw. The default matches the previously
        hard-coded value.
    random_state : int, optional
        Base seed. When given, draw ``i`` is seeded with ``random_state + i``
        so the whole list is reproducible across kernel restarts. When
        ``None`` (default) the draws are unseeded, as before.

    Returns
    -------
    list of pandas.DataFrame
        ``n_samples`` frames with ``num_lines`` rows each.
    """
    samples_list = []
    for draw in range(n_samples):
        # A distinct seed per draw keeps the samples different from each other
        # while still being deterministic for a fixed base seed.
        seed = None if random_state is None else random_state + draw
        samples_list.append(df.sample(n=num_lines, random_state=seed))
    return samples_list


In [7]:
# Draw the random samples for every split. The second argument is the number
# of rows per sample: 40 / 41 / 43 for the three conditions and 124 (their sum)
# for the whole corpus and for each time third.
# NOTE(review): these sizes presumably equal the number of real pairs in each
# split — confirm against the real data.
pseudo_whole_corpus_sampled_dfs = random_sampling(pseudo_whole_corpus, 124)
pseudo_baseline_sampled_dfs = random_sampling(pseudo_baseline_df, 40)
pseudo_unimodal_sampled_dfs = random_sampling(pseudo_unimodal_df, 41)
pseudo_unfamiliar_sampled_dfs = random_sampling(pseudo_unfamiliar_df, 43)
pseudo_first_third_sampled_dfs = random_sampling(pseudo_first_third_df, 124)
pseudo_second_third_sampled_dfs = random_sampling(pseudo_second_third_df, 124)
pseudo_third_third_sampled_dfs = random_sampling(pseudo_third_third_df, 124)

## Calculate correlation of pseudo data

In [None]:
def Correlations(df_list):
    """Compute per-feature Pearson correlations between the two speakers.

    For every frame in ``df_list`` and every feature in the fixed feature
    list, the ``"<feature>_mordor"`` column is correlated with the
    ``"<feature>_gondor"`` column.

    Parameters
    ----------
    df_list : iterable of pandas.DataFrame
        Frames that contain both speaker columns for each listed feature.

    Returns
    -------
    list of pandas.DataFrame
        One frame per input frame with the columns ``Feature``,
        ``Correlation_pearson`` (rounded to 2 decimals), ``P-Value``
        (rounded to 2 decimals) and ``P-Value < 0.005`` (evaluated on the
        unrounded p-value).
    """
    columns = [
        "lexical_diversity", "speech_ratio", "word_avg", "hes", "laugh", "question", "hum",
        "s_first_person", "s_second_person", "s_third_person", "p_first_person", "p_second_person",
        "p_third_person", "present", "past", "future", "cond", "verb", "noun", "adj", "adv", "num",
        "article", "preverb", "postverb", "pronoun", "conjunction", "interjection", "det", "funct_words",
        "negation", "happiness", "sadness", "anger", "fear", "disgust", "no_emotion", "positive",
        "negative", "neutral"
    ]
    correlation_dfs = []

    for df in df_list:
        results = []
        for col in columns:
            # Only Pearson is reported; the Spearman coefficient that used to be
            # computed here was never stored, so the call has been dropped.
            corr_pearson, p_value_pearson = pearsonr(df[f"{col}_mordor"], df[f"{col}_gondor"])
            results.append({
                "Feature": f"{col}",
                "Correlation_pearson": round(corr_pearson, 2),
                "P-Value": round(p_value_pearson, 2),
                "P-Value < 0.005": p_value_pearson < 0.005
            })
        correlation_dfs.append(pd.DataFrame(results))

    return correlation_dfs

In [None]:
# One correlation table per random sample of the whole pseudo corpus.
whole_pseudo_corpus_correlation_dfs = Correlations(pseudo_whole_corpus_sampled_dfs)

In [None]:
# One correlation table per random sample of the baseline pseudo pairs.
baseline_pseudo_correlation_dfs = Correlations(pseudo_baseline_sampled_dfs)

In [None]:
# One correlation table per random sample of the unimodal pseudo pairs.
unimodal_pseudo_correlation_dfs = Correlations(pseudo_unimodal_sampled_dfs)

In [None]:
# One correlation table per random sample of the unfamiliar pseudo pairs.
unfamiliar_pseudo_correlation_dfs = Correlations(pseudo_unfamiliar_sampled_dfs)

In [None]:
# One correlation table per random sample of the first-third pseudo pairs.
first_third_pseudo_correlation_dfs = Correlations(pseudo_first_third_sampled_dfs)

In [None]:
# One correlation table per random sample of the second-third pseudo pairs.
second_third_pseudo_correlation_dfs = Correlations(pseudo_second_third_sampled_dfs)

In [None]:
# One correlation table per random sample of the third-third pseudo pairs.
third_third_pseudo_correlation_dfs = Correlations(pseudo_third_third_sampled_dfs)

## Calculate the mean of the correlations

In [None]:
def calculate_mean_correlation(correlation_dfs):
    """Average the absolute Pearson correlation of each feature over many tables.

    Parameters
    ----------
    correlation_dfs : iterable of pandas.DataFrame
        Tables with a ``Feature`` and a ``Correlation_pearson`` column.

    Returns
    -------
    pandas.DataFrame
        Columns ``Feature`` (in order of first appearance) and ``Mean``, the
        mean of ``abs(Correlation_pearson)`` rounded to 2 decimals.
    """
    magnitudes = {}
    for table in correlation_dfs:
        pairs = zip(table['Feature'], table['Correlation_pearson'])
        for name, coefficient in pairs:
            # The sign is dropped on purpose: only the strength is averaged.
            magnitudes.setdefault(name, []).append(abs(coefficient))

    summary = {
        'Feature': list(magnitudes.keys()),
        'Mean': [round(sum(vals) / len(vals), 2) for vals in magnitudes.values()],
    }
    return pd.DataFrame(summary)

In [None]:
# Per-feature mean absolute pseudo correlation for every split, averaged over
# all random samples of that split.
whole_pseudo_corpus_correlation_means = calculate_mean_correlation(whole_pseudo_corpus_correlation_dfs)
pseudo_baseline_correlation_means = calculate_mean_correlation(baseline_pseudo_correlation_dfs)
pseudo_unimodal_correlation_means = calculate_mean_correlation(unimodal_pseudo_correlation_dfs)
pseudo_unfamiliar_correlation_means = calculate_mean_correlation(unfamiliar_pseudo_correlation_dfs)
pseudo_first_third_correlation_means = calculate_mean_correlation(first_third_pseudo_correlation_dfs)
pseudo_second_third_correlation_means = calculate_mean_correlation(second_third_pseudo_correlation_dfs)
pseudo_third_third_correlation_means = calculate_mean_correlation(third_third_pseudo_correlation_dfs)

In [None]:
# Persist the mean pseudo correlations, one CSV per split, without the index column.
whole_pseudo_corpus_correlation_means.to_csv("../results/whole_pseudo_corpus_correlation_means.csv", index=False)
pseudo_baseline_correlation_means.to_csv("../results/baseline_pseudo_correlation_means.csv", index=False)
pseudo_unimodal_correlation_means.to_csv("../results/unimodal_pseudo_correlation_means.csv", index=False)
pseudo_unfamiliar_correlation_means.to_csv("../results/unfamiliar_pseudo_correlation_means.csv", index=False)
pseudo_first_third_correlation_means.to_csv("../results/first_third_pseudo_correlation_means.csv", index=False)
pseudo_second_third_correlation_means.to_csv("../results/second_third_pseudo_correlation_means.csv", index=False)
pseudo_third_third_correlation_means.to_csv("../results/third_third_pseudo_correlation_means.csv", index=False)

## Compare the correlation values of the real and of the pseudo data

In [None]:
def compare_correlations(original_df, pseudo_dfs_list):
    """Count how often a pseudo correlation is at least as strong as the real one.

    For each feature of ``original_df`` the absolute real correlation is
    compared with the absolute correlation of the same feature in every pseudo
    table; the counter is incremented when ``abs(pseudo) >= abs(original)``.

    Parameters
    ----------
    original_df : pandas.DataFrame
        Real-data table with ``Feature`` and ``Correlation_pearson`` columns.
        Its index may be arbitrary (e.g. after filtering); rows are addressed
        by position.
    pseudo_dfs_list : iterable of pandas.DataFrame
        Pseudo-data tables with the same two columns. If a feature occurs more
        than once in a table, its first occurrence is used.

    Returns
    -------
    pandas.DataFrame
        Columns ``Feature`` and ``Pseudo_higher_than_original`` with a fresh
        default index, one row per row of ``original_df``.

    Raises
    ------
    IndexError
        If a feature of ``original_df`` is missing from a pseudo table (the
        exception type of the previous positional lookup is kept).
    """
    features = original_df['Feature'].tolist()
    original_values = original_df['Correlation_pearson'].tolist()
    counts = [0] * len(features)

    for pseudo_df in pseudo_dfs_list:
        # Build the feature -> correlation lookup once per pseudo table instead
        # of running a boolean mask for every feature.
        first_seen = {}
        for name, value in zip(pseudo_df['Feature'], pseudo_df['Correlation_pearson']):
            first_seen.setdefault(name, value)

        # enumerate() gives the list position; the frame's index labels are not
        # used because they need not be 0..n-1.
        for position, (name, orig_corr) in enumerate(zip(features, original_values)):
            if name not in first_seen:
                raise IndexError(f"feature {name!r} not found in pseudo correlation table")
            if abs(first_seen[name]) >= abs(orig_corr):
                counts[position] += 1

    return pd.DataFrame({
        'Feature': features,
        'Pseudo_higher_than_original': counts
    })


In [45]:
comparison_whole_corpus = compare_correlations(whole_corpus_correlation, whole_pseudo_corpus_correlation_dfs)
# Keep only the features for which fewer than 50 pseudo samples reached the
# real correlation (50 of the 1000 samples drawn by random_sampling = 5 %).
comparison_whole_corpus = comparison_whole_corpus[comparison_whole_corpus['Pseudo_higher_than_original'] < 50]

Unnamed: 0,Feature,Pseudo_higher_than_original
1,speech_ratio,0
2,word_avg,0
3,hes,0
4,laugh,0
7,s_first_person,0
9,s_third_person,0
10,p_first_person,0
12,p_third_person,0
13,present,0
14,past,0


In [None]:
# Baseline condition: count pseudo samples that reach the real correlation,
# then keep the features where fewer than 50 samples did, and display them.
comparison_baseline_corpus = compare_correlations(baseline_correlation, baseline_pseudo_correlation_dfs)
comparison_baseline_corpus = comparison_baseline_corpus[comparison_baseline_corpus['Pseudo_higher_than_original'] < 50]
comparison_baseline_corpus

In [None]:
# Unimodal condition: count pseudo samples that reach the real correlation,
# then keep the features where fewer than 50 samples did, and display them.
comparison_unimodal_corpus = compare_correlations(unimodal_correlation, unimodal_pseudo_correlation_dfs)
comparison_unimodal_corpus = comparison_unimodal_corpus[comparison_unimodal_corpus['Pseudo_higher_than_original'] < 50]
comparison_unimodal_corpus

In [None]:
# Unfamiliar condition: count pseudo samples that reach the real correlation,
# then keep the features where fewer than 50 samples did.
comparison_unfamiliar_corpus = compare_correlations(unfamiliar_correlation, unfamiliar_pseudo_correlation_dfs)
comparison_unfamiliar_corpus = comparison_unfamiliar_corpus[comparison_unfamiliar_corpus['Pseudo_higher_than_original'] < 50]
# Display the result as the cell output, like the baseline and unimodal cells do.
comparison_unfamiliar_corpus

In [None]:
# First time third: count pseudo samples that reach the real correlation,
# then keep the features where fewer than 50 samples did.
comparison_first_third_corpus = compare_correlations(first_third_correlation, first_third_pseudo_correlation_dfs)
comparison_first_third_corpus = comparison_first_third_corpus[comparison_first_third_corpus['Pseudo_higher_than_original'] < 50]
# Show this cell's own result; the bare expression that used to sit mid-cell
# referred to another split and was never rendered.
comparison_first_third_corpus

In [None]:
# Second time third: count pseudo samples that reach the real correlation,
# then keep the features where fewer than 50 samples did.
comparison_second_third_corpus = compare_correlations(second_third_correlation, second_third_pseudo_correlation_dfs)
comparison_second_third_corpus = comparison_second_third_corpus[comparison_second_third_corpus['Pseudo_higher_than_original'] < 50]
# Show this cell's own result; the bare expression that used to sit mid-cell
# referred to another split and was never rendered.
comparison_second_third_corpus

In [None]:
# Third time third: count pseudo samples that reach the real correlation,
# then keep the features where fewer than 50 samples did.
comparison_third_third_corpus = compare_correlations(third_third_correlation, third_third_pseudo_correlation_dfs)
comparison_third_third_corpus = comparison_third_third_corpus[comparison_third_third_corpus['Pseudo_higher_than_original'] < 50]
# Show this cell's own result; the bare expression that used to sit mid-cell
# referred to another split and was never rendered.
comparison_third_third_corpus

In [49]:
# For every retained feature, attach the real Pearson correlation and the mean
# absolute pseudo correlation (whole corpus).
whole_corpus_compare = (
    comparison_whole_corpus
    .merge(whole_corpus_correlation[['Feature', 'Correlation_pearson']], on='Feature')
    .rename(columns={'Correlation_pearson': 'Original_Correlation_pearson'})
    .merge(whole_pseudo_corpus_correlation_means[['Feature', 'Mean']], on='Feature')
)

In [50]:
# For every retained feature, attach the real Pearson correlation and the mean
# absolute pseudo correlation (baseline condition).
baseline_compare = (
    comparison_baseline_corpus
    .merge(baseline_correlation[['Feature', 'Correlation_pearson']], on='Feature')
    .rename(columns={'Correlation_pearson': 'Original_Correlation_pearson'})
    .merge(pseudo_baseline_correlation_means[['Feature', 'Mean']], on='Feature')
)

In [51]:
# For every retained feature, attach the real Pearson correlation and the mean
# absolute pseudo correlation (unimodal condition).
unimodal_compare = (
    comparison_unimodal_corpus
    .merge(unimodal_correlation[['Feature', 'Correlation_pearson']], on='Feature')
    .rename(columns={'Correlation_pearson': 'Original_Correlation_pearson'})
    .merge(pseudo_unimodal_correlation_means[['Feature', 'Mean']], on='Feature')
)

In [52]:
# For every retained feature, attach the real Pearson correlation and the mean
# absolute pseudo correlation (unfamiliar condition).
unfamiliar_compare = (
    comparison_unfamiliar_corpus
    .merge(unfamiliar_correlation[['Feature', 'Correlation_pearson']], on='Feature')
    .rename(columns={'Correlation_pearson': 'Original_Correlation_pearson'})
    .merge(pseudo_unfamiliar_correlation_means[['Feature', 'Mean']], on='Feature')
)

In [53]:
# For every retained feature, attach the real Pearson correlation and the mean
# absolute pseudo correlation (first time third).
first_third_compare = (
    comparison_first_third_corpus
    .merge(first_third_correlation[['Feature', 'Correlation_pearson']], on='Feature')
    .rename(columns={'Correlation_pearson': 'Original_Correlation_pearson'})
    .merge(pseudo_first_third_correlation_means[['Feature', 'Mean']], on='Feature')
)

In [54]:
# For every retained feature, attach the real Pearson correlation and the mean
# absolute pseudo correlation (second time third).
second_third_compare = (
    comparison_second_third_corpus
    .merge(second_third_correlation[['Feature', 'Correlation_pearson']], on='Feature')
    .rename(columns={'Correlation_pearson': 'Original_Correlation_pearson'})
    .merge(pseudo_second_third_correlation_means[['Feature', 'Mean']], on='Feature')
)

In [56]:
# For every retained feature, attach the real Pearson correlation and the mean
# absolute pseudo correlation (third time third).
third_third_compare = (
    comparison_third_third_corpus
    .merge(third_third_correlation[['Feature', 'Correlation_pearson']], on='Feature')
    .rename(columns={'Correlation_pearson': 'Original_Correlation_pearson'})
    .merge(pseudo_third_third_correlation_means[['Feature', 'Mean']], on='Feature')
)

In [57]:
# Persist the real-vs-pseudo comparison tables, one CSV per split, without the index column.
whole_corpus_compare.to_csv("../results/whole_corpus_compare.csv", index=False)
baseline_compare.to_csv("../results/baseline_compare.csv", index=False)
unimodal_compare.to_csv("../results/unimodal_compare.csv", index=False)
unfamiliar_compare.to_csv("../results/unfamiliar_compare.csv", index=False)
first_third_compare.to_csv("../results/first_third_compare.csv", index=False)
second_third_compare.to_csv("../results/second_third_compare.csv", index=False)
third_third_compare.to_csv("../results/third_third_compare.csv", index=False)

## Compare the significant p-values of the real dataframes


In [None]:
# Load the correlation summary and keep only the feature name plus the Pearson
# correlation and p-value columns of the three time thirds; the last line
# displays the reduced table.
correlation_summary = pd.read_csv("../results/correlation_summary.csv")
correlation_time = correlation_summary.copy()
correlation_time = correlation_time[["Feature", "Correlation_pearson_first_third", "P-Value_first_third", "Correlation_pearson_second_third", "P-Value_second_third", "Correlation_pearson_third_third", "P-Value_third_third"]]
correlation_time


## Test whether the differences between the correlations are significant

In [None]:
def compare_correlations_steiger(correlation_time, n=124):
    """Test pairwise differences between the three time-third correlations.

    Every row of ``correlation_time`` is expected to hold one feature with its
    Pearson correlation in the first, second and third time third. Each pair
    of thirds is passed to ``compare_correlations_steiger_test``.

    Parameters
    ----------
    correlation_time : pandas.DataFrame
        Needs the columns ``Feature``, ``Correlation_pearson_first_third``,
        ``Correlation_pearson_second_third`` and
        ``Correlation_pearson_third_third``.
    n : int, default 124
        Sample size behind each correlation, forwarded to the test. The
        default is the value that used to be hard-coded here.

    Returns
    -------
    pandas.DataFrame
        One row per feature with three significance flags
        (``First_vs_Second_Significant``, ``Second_vs_Third_Significant``,
        ``First_vs_Third_Significant``) followed by the three correlations.
    """
    first_col = 'Correlation_pearson_first_third'
    second_col = 'Correlation_pearson_second_third'
    third_col = 'Correlation_pearson_third_third'

    comparison_results = []
    for _, row in correlation_time.iterrows():
        r_first, r_second, r_third = row[first_col], row[second_col], row[third_col]
        comparison_results.append({
            'Feature': row['Feature'],
            'First_vs_Second_Significant': compare_correlations_steiger_test(r_first, r_second, n),
            'Second_vs_Third_Significant': compare_correlations_steiger_test(r_second, r_third, n),
            'First_vs_Third_Significant': compare_correlations_steiger_test(r_first, r_third, n),
            'First_Third_Correlation': r_first,
            'Second_Third_Correlation': r_second,
            'Third_Third_Correlation': r_third
        })

    return pd.DataFrame(comparison_results)

def compare_correlations_steiger_test(corr1, corr2, n, alpha=0.05):
    """Two-sided z-test for the difference between two correlations.

    Both coefficients are Fisher z-transformed (``arctanh``); their difference
    is divided by ``sqrt(2 / (n - 3))``, i.e. the standard error for two
    correlations that each rest on ``n`` observations.

    NOTE(review): this standard error has no covariance term, so it treats the
    two correlations as coming from independent samples. If the compared
    correlations are computed on the same pairs, confirm that this is the
    intended test.

    Parameters
    ----------
    corr1, corr2 : float
        Correlation coefficients to compare. A value of exactly +/-1
        transforms to +/-inf.
    n : int
        Sample size behind each correlation; must be greater than 3.
    alpha : float, default 0.05
        Significance level.

    Returns
    -------
    bool
        ``True`` when the two-sided p-value is below ``alpha``.

    Raises
    ------
    ValueError
        If ``n <= 3``, for which the standard error is undefined.
    """
    if n <= 3:
        raise ValueError("n must be greater than 3 for the Fisher z standard error")
    z1 = np.arctanh(corr1)
    z2 = np.arctanh(corr2)
    se = np.sqrt(1/(n-3))
    z = (z1 - z2) / (np.sqrt(2*se**2))
    # Survival function instead of 1 - cdf: same value, but it does not lose
    # precision for very small p-values.
    p_value = 2 * norm.sf(abs(z))
    return p_value < alpha

# Run the pairwise significance comparison of the three time thirds for every feature.
significance_df = compare_correlations_steiger(correlation_time)

In [None]:
# Persist the significance flags and correlations without the index column.
significance_df.to_csv("../results/significance_df.csv", index=False)

## Compare LSM

In [8]:
# Load the LSM tables of the real pairs, one CSV per split.
# NOTE(review): the reshaping cells below rename a "pair_no" entry, so these
# files presumably carry a pair_no column next to the lsm_* scores — confirm.
whole_data_lsm = pd.read_csv("../results/whole_data_lsm.csv")
baseline_lsm = pd.read_csv("../results/baseline_lsm.csv")
unimodal_lsm = pd.read_csv("../results/unimodal_lsm.csv")
unfamiliar_lsm = pd.read_csv("../results/unfamiliar_lsm.csv")
first_third_lsm = pd.read_csv("../results/first_third_lsm.csv")
second_third_lsm = pd.read_csv("../results/second_third_lsm.csv")
third_third_lsm = pd.read_csv("../results/third_third_lsm.csv")

In [9]:
# Load the "adapted" LSM tables of the real pairs, one CSV per split.
whole_data_lsm_adapted = pd.read_csv("../results/whole_data_lsm_adapted.csv")
baseline_lsm_adapted = pd.read_csv("../results/baseline_lsm_adapted.csv")
unimodal_lsm_adapted = pd.read_csv("../results/unimodal_lsm_adapted.csv")
unfamiliar_lsm_adapted = pd.read_csv("../results/unfamiliar_lsm_adapted.csv")
first_third_lsm_adapted = pd.read_csv("../results/first_third_lsm_adapted.csv")
second_third_lsm_adapted = pd.read_csv("../results/second_third_lsm_adapted.csv")
third_third_lsm_adapted = pd.read_csv("../results/third_third_lsm_adapted.csv")

In [10]:
# Column-wise means of the real LSM tables, rounded to two decimals
# (each result is a Series indexed by column name).
whole_data_lsm_means = whole_data_lsm.mean().round(2)
baseline_lsm_means = baseline_lsm.mean().round(2)
unimodal_lsm_means = unimodal_lsm.mean().round(2)
unfamiliar_lsm_means = unfamiliar_lsm.mean().round(2)
first_third_lsm_means = first_third_lsm.mean().round(2)
second_third_lsm_means = second_third_lsm.mean().round(2)
third_third_lsm_means = third_third_lsm.mean().round(2)

In [11]:
# Column-wise means of the real adapted LSM tables, rounded to two decimals
# (each result is a Series indexed by column name).
whole_data_lsm_means_adapted = whole_data_lsm_adapted.mean().round(2)
baseline_lsm_means_adapted = baseline_lsm_adapted.mean().round(2)
unimodal_lsm_means_adapted = unimodal_lsm_adapted.mean().round(2)
unfamiliar_lsm_means_adapted = unfamiliar_lsm_adapted.mean().round(2)
first_third_lsm_means_adapted = first_third_lsm_adapted.mean().round(2)
second_third_lsm_means_adapted = second_third_lsm_adapted.mean().round(2)
third_third_lsm_means_adapted = third_third_lsm_adapted.mean().round(2)

In [119]:
# Long table (Feature, Real_Mean) of the rounded column means. It is rebuilt
# from the raw frame rather than by converting the existing variable in place,
# so the cell can be re-run; the in-place form raised a length-mismatch error
# on a second run.
whole_data_lsm_means = (
    round(whole_data_lsm.mean(), 2)
    .rename_axis('Feature')
    .reset_index(name='Real_Mean')
)
# One-row wide table: one column per feature holding its real mean.
whole_data_lsm_means_transformed = pd.DataFrame({
    feature: [real_mean]
    for feature, real_mean in zip(whole_data_lsm_means['Feature'], whole_data_lsm_means['Real_Mean'])
}).rename(columns={"pair_no": "sample_no"})

In [120]:
# Long table (Feature, Real_Mean) of the rounded column means. It is rebuilt
# from the raw frame rather than by converting the existing variable in place,
# so the cell can be re-run; the in-place form raised a length-mismatch error
# on a second run.
baseline_lsm_means = (
    round(baseline_lsm.mean(), 2)
    .rename_axis('Feature')
    .reset_index(name='Real_Mean')
)
# One-row wide table: one column per feature holding its real mean.
baseline_lsm_means_transformed = pd.DataFrame({
    feature: [real_mean]
    for feature, real_mean in zip(baseline_lsm_means['Feature'], baseline_lsm_means['Real_Mean'])
}).rename(columns={"pair_no": "sample_no"})

In [121]:
# Long table (Feature, Real_Mean) of the rounded column means. It is rebuilt
# from the raw frame rather than by converting the existing variable in place,
# so the cell can be re-run; the in-place form raised a length-mismatch error
# on a second run.
unimodal_lsm_means = (
    round(unimodal_lsm.mean(), 2)
    .rename_axis('Feature')
    .reset_index(name='Real_Mean')
)
# One-row wide table: one column per feature holding its real mean.
unimodal_lsm_means_transformed = pd.DataFrame({
    feature: [real_mean]
    for feature, real_mean in zip(unimodal_lsm_means['Feature'], unimodal_lsm_means['Real_Mean'])
}).rename(columns={"pair_no": "sample_no"})

In [122]:
# Long table (Feature, Real_Mean) of the rounded column means. It is rebuilt
# from the raw frame rather than by converting the existing variable in place,
# so the cell can be re-run; the in-place form raised a length-mismatch error
# on a second run.
unfamiliar_lsm_means = (
    round(unfamiliar_lsm.mean(), 2)
    .rename_axis('Feature')
    .reset_index(name='Real_Mean')
)
# One-row wide table: one column per feature holding its real mean.
unfamiliar_lsm_means_transformed = pd.DataFrame({
    feature: [real_mean]
    for feature, real_mean in zip(unfamiliar_lsm_means['Feature'], unfamiliar_lsm_means['Real_Mean'])
}).rename(columns={"pair_no": "sample_no"})

In [123]:
# Long table (Feature, Real_Mean) of the rounded column means. It is rebuilt
# from the raw frame rather than by converting the existing variable in place,
# so the cell can be re-run; the in-place form raised a length-mismatch error
# on a second run.
first_third_lsm_means = (
    round(first_third_lsm.mean(), 2)
    .rename_axis('Feature')
    .reset_index(name='Real_Mean')
)
# One-row wide table: one column per feature holding its real mean.
first_third_lsm_means_transformed = pd.DataFrame({
    feature: [real_mean]
    for feature, real_mean in zip(first_third_lsm_means['Feature'], first_third_lsm_means['Real_Mean'])
}).rename(columns={"pair_no": "sample_no"})


In [124]:
# Long table (Feature, Real_Mean) of the rounded column means. It is rebuilt
# from the raw frame rather than by converting the existing variable in place,
# so the cell can be re-run; the in-place form raised a length-mismatch error
# on a second run.
second_third_lsm_means = (
    round(second_third_lsm.mean(), 2)
    .rename_axis('Feature')
    .reset_index(name='Real_Mean')
)
# One-row wide table: one column per feature holding its real mean.
second_third_lsm_means_transformed = pd.DataFrame({
    feature: [real_mean]
    for feature, real_mean in zip(second_third_lsm_means['Feature'], second_third_lsm_means['Real_Mean'])
}).rename(columns={"pair_no": "sample_no"})

In [12]:
# Long table (Feature, Real_Mean) of the rounded column means. It is rebuilt
# from the raw frame rather than by converting the existing variable in place,
# so the cell can be re-run; the in-place form raised a length-mismatch error
# on a second run.
third_third_lsm_means = (
    round(third_third_lsm.mean(), 2)
    .rename_axis('Feature')
    .reset_index(name='Real_Mean')
)
# One-row wide table: one column per feature holding its real mean.
third_third_lsm_means_transformed = pd.DataFrame({
    feature: [real_mean]
    for feature, real_mean in zip(third_third_lsm_means['Feature'], third_third_lsm_means['Real_Mean'])
}).rename(columns={"pair_no": "sample_no"})

In [126]:
# Long table (Feature, Real_Mean) of the rounded column means. It is rebuilt
# from the raw frame rather than by converting the existing variable in place,
# so the cell can be re-run; the in-place form raised a length-mismatch error
# on a second run.
whole_data_lsm_means_adapted = (
    round(whole_data_lsm_adapted.mean(), 2)
    .rename_axis('Feature')
    .reset_index(name='Real_Mean')
)
# One-row wide table: one column per feature holding its real mean.
whole_data_lsm_means_adapted_transformed = pd.DataFrame({
    feature: [real_mean]
    for feature, real_mean in zip(whole_data_lsm_means_adapted['Feature'], whole_data_lsm_means_adapted['Real_Mean'])
}).rename(columns={"pair_no": "sample_no"})

In [127]:
# Long table (Feature, Real_Mean) of the rounded column means. It is rebuilt
# from the raw frame rather than by converting the existing variable in place,
# so the cell can be re-run; the in-place form raised a length-mismatch error
# on a second run.
baseline_lsm_means_adapted = (
    round(baseline_lsm_adapted.mean(), 2)
    .rename_axis('Feature')
    .reset_index(name='Real_Mean')
)
# One-row wide table: one column per feature holding its real mean.
baseline_lsm_means_adapted_transformed = pd.DataFrame({
    feature: [real_mean]
    for feature, real_mean in zip(baseline_lsm_means_adapted['Feature'], baseline_lsm_means_adapted['Real_Mean'])
}).rename(columns={"pair_no": "sample_no"})

In [128]:
# Long table (Feature, Real_Mean) of the rounded column means. It is rebuilt
# from the raw frame rather than by converting the existing variable in place,
# so the cell can be re-run; the in-place form raised a length-mismatch error
# on a second run.
unimodal_lsm_means_adapted = (
    round(unimodal_lsm_adapted.mean(), 2)
    .rename_axis('Feature')
    .reset_index(name='Real_Mean')
)
# One-row wide table: one column per feature holding its real mean.
unimodal_lsm_means_adapted_transformed = pd.DataFrame({
    feature: [real_mean]
    for feature, real_mean in zip(unimodal_lsm_means_adapted['Feature'], unimodal_lsm_means_adapted['Real_Mean'])
}).rename(columns={"pair_no": "sample_no"})

In [129]:
# Long table (Feature, Real_Mean) of the rounded column means. It is rebuilt
# from the raw frame rather than by converting the existing variable in place,
# so the cell can be re-run; the in-place form raised a length-mismatch error
# on a second run.
unfamiliar_lsm_means_adapted = (
    round(unfamiliar_lsm_adapted.mean(), 2)
    .rename_axis('Feature')
    .reset_index(name='Real_Mean')
)
# One-row wide table: one column per feature holding its real mean.
unfamiliar_lsm_means_adapted_transformed = pd.DataFrame({
    feature: [real_mean]
    for feature, real_mean in zip(unfamiliar_lsm_means_adapted['Feature'], unfamiliar_lsm_means_adapted['Real_Mean'])
}).rename(columns={"pair_no": "sample_no"})

In [130]:
# Long table (Feature, Real_Mean) of the rounded column means. It is rebuilt
# from the raw frame rather than by converting the existing variable in place,
# so the cell can be re-run; the in-place form raised a length-mismatch error
# on a second run.
first_third_lsm_means_adapted = (
    round(first_third_lsm_adapted.mean(), 2)
    .rename_axis('Feature')
    .reset_index(name='Real_Mean')
)
# One-row wide table: one column per feature holding its real mean.
first_third_lsm_means_adapted_transformed = pd.DataFrame({
    feature: [real_mean]
    for feature, real_mean in zip(first_third_lsm_means_adapted['Feature'], first_third_lsm_means_adapted['Real_Mean'])
}).rename(columns={"pair_no": "sample_no"})

In [131]:
# Long table (Feature, Real_Mean) of the rounded column means. It is rebuilt
# from the raw frame rather than by converting the existing variable in place,
# so the cell can be re-run; the in-place form raised a length-mismatch error
# on a second run.
second_third_lsm_means_adapted = (
    round(second_third_lsm_adapted.mean(), 2)
    .rename_axis('Feature')
    .reset_index(name='Real_Mean')
)
# One-row wide table: one column per feature holding its real mean.
second_third_lsm_means_adapted_transformed = pd.DataFrame({
    feature: [real_mean]
    for feature, real_mean in zip(second_third_lsm_means_adapted['Feature'], second_third_lsm_means_adapted['Real_Mean'])
}).rename(columns={"pair_no": "sample_no"})

In [13]:
# Long table (Feature, Real_Mean) of the rounded column means. It is rebuilt
# from the raw frame rather than by converting the existing variable in place,
# so the cell can be re-run; the in-place form raised a length-mismatch error
# on a second run.
third_third_lsm_means_adapted = (
    round(third_third_lsm_adapted.mean(), 2)
    .rename_axis('Feature')
    .reset_index(name='Real_Mean')
)
# One-row wide table: one column per feature holding its real mean.
third_third_lsm_means_adapted_transformed = pd.DataFrame({
    feature: [real_mean]
    for feature, real_mean in zip(third_third_lsm_means_adapted['Feature'], third_third_lsm_means_adapted['Real_Mean'])
}).rename(columns={"pair_no": "sample_no"})



In [133]:
# Persist the one-row real-mean tables (standard LSM first, adapted LSM second),
# one CSV per split, without the index column.
whole_data_lsm_means_transformed.to_csv("../results/whole_data_lsm_means.csv", index=False)
baseline_lsm_means_transformed.to_csv("../results/baseline_lsm_means.csv", index=False)
unimodal_lsm_means_transformed.to_csv("../results/unimodal_lsm_means.csv", index=False)
unfamiliar_lsm_means_transformed.to_csv("../results/unfamiliar_lsm_means.csv", index=False)
first_third_lsm_means_transformed.to_csv("../results/first_third_lsm_means.csv", index=False)
second_third_lsm_means_transformed.to_csv("../results/second_third_lsm_means.csv", index=False)
third_third_lsm_means_transformed.to_csv("../results/third_third_lsm_means.csv", index=False)
whole_data_lsm_means_adapted_transformed.to_csv("../results/whole_data_lsm_means_adapted.csv", index=False)
baseline_lsm_means_adapted_transformed.to_csv("../results/baseline_lsm_means_adapted.csv", index=False)
unimodal_lsm_means_adapted_transformed.to_csv("../results/unimodal_lsm_means_adapted.csv", index=False)
unfamiliar_lsm_means_adapted_transformed.to_csv("../results/unfamiliar_lsm_means_adapted.csv", index=False)
first_third_lsm_means_adapted_transformed.to_csv("../results/first_third_lsm_means_adapted.csv", index=False)
second_third_lsm_means_adapted_transformed.to_csv("../results/second_third_lsm_means_adapted.csv", index=False)
third_third_lsm_means_adapted_transformed.to_csv("../results/third_third_lsm_means_adapted.csv", index=False)


## Calculate pseudo LSM

In [14]:
def lsm_count(df_list):
    """Compute language style matching (LSM) scores for every row of every frame.

    For each word category the score is
    ``1 - |gondor - mordor| / (gondor + mordor + 0.00001)``; the small constant
    keeps the denominator non-zero when both speakers have a value of 0.
    ``lsm_avg`` is the mean of the eight category scores.

    The computation is vectorised per column instead of looping over rows.

    Parameters
    ----------
    df_list : iterable of pandas.DataFrame
        Frames with ``"<category>_gondor"`` and ``"<category>_mordor"`` columns
        for adv, article, preverb, postverb, pronoun, conjunction,
        interjection and det.

    Returns
    -------
    list of pandas.DataFrame
        One frame per input frame, with one row per input row (default
        0..n-1 index) and the eight ``lsm_*`` columns followed by ``lsm_avg``.
        An empty input frame yields an empty frame that still has these
        columns.
    """
    # (output column, input column stem) in the output column order.
    categories = [
        ("lsm_article", "article"),
        ("lsm_preverb", "preverb"),
        ("lsm_postverb", "postverb"),
        ("lsm_pronoun", "pronoun"),
        ("lsm_conjunction", "conjunction"),
        ("lsm_interjection", "interjection"),
        ("lsm_adverb", "adv"),
        ("lsm_det", "det"),
    ]
    # Order in which the scores are added for the average; kept fixed so the
    # floating-point result does not depend on the column order above.
    average_order = [
        "lsm_adverb", "lsm_article", "lsm_interjection", "lsm_conjunction",
        "lsm_pronoun", "lsm_postverb", "lsm_preverb", "lsm_det",
    ]

    all_results = []
    for df in df_list:
        scores = {}
        for out_name, stem in categories:
            # Plain float arrays drop the sample's index, so the result gets a
            # fresh 0..n-1 index.
            gondor = df[f"{stem}_gondor"].to_numpy(dtype=float)
            mordor = df[f"{stem}_mordor"].to_numpy(dtype=float)
            scores[out_name] = 1 - (abs(gondor - mordor) / (gondor + mordor + 0.00001))

        lsm_frame = pd.DataFrame(scores)
        total = lsm_frame[average_order[0]]
        for name in average_order[1:]:
            total = total + lsm_frame[name]
        lsm_frame["lsm_avg"] = total / len(average_order)
        all_results.append(lsm_frame)

    return all_results

## Calculate the adapted LSM for pseudo data

In [15]:
def lsm_count_adapted(df_list):
    """Compute the adapted LSM scores for every row of every frame.

    Same formula as the standard LSM,
    ``1 - |gondor - mordor| / (gondor + mordor + 0.00001)``, applied to the
    eight word categories plus the six grammatical-person categories.
    ``lsm_avg`` is the mean of all fourteen scores.

    The computation is vectorised per column instead of looping over rows.

    Parameters
    ----------
    df_list : iterable of pandas.DataFrame
        Frames with ``"<category>_gondor"`` and ``"<category>_mordor"`` columns
        for adv, article, preverb, postverb, pronoun, conjunction,
        interjection, det and s_/p_ first/second/third_person.

    Returns
    -------
    list of pandas.DataFrame
        One frame per input frame, with one row per input row (default
        0..n-1 index) and the fourteen ``lsm_*`` columns followed by
        ``lsm_avg``. An empty input frame yields an empty frame that still has
        these columns.
    """
    # (output column, input column stem) in the output column order.
    categories = [
        ("lsm_article", "article"),
        ("lsm_preverb", "preverb"),
        ("lsm_postverb", "postverb"),
        ("lsm_pronoun", "pronoun"),
        ("lsm_conjunction", "conjunction"),
        ("lsm_interjection", "interjection"),
        ("lsm_adverb", "adv"),
        ("lsm_s_first_person", "s_first_person"),
        ("lsm_s_second_person", "s_second_person"),
        ("lsm_s_third_person", "s_third_person"),
        ("lsm_p_first_person", "p_first_person"),
        ("lsm_p_second_person", "p_second_person"),
        ("lsm_p_third_person", "p_third_person"),
        ("lsm_det", "det"),
    ]
    # Order in which the scores are added for the average; kept fixed so the
    # floating-point result does not depend on the column order above.
    average_order = [
        "lsm_adverb", "lsm_article", "lsm_interjection", "lsm_conjunction",
        "lsm_pronoun", "lsm_postverb", "lsm_preverb", "lsm_det",
        "lsm_p_first_person", "lsm_p_second_person", "lsm_p_third_person",
        "lsm_s_first_person", "lsm_s_second_person", "lsm_s_third_person",
    ]

    all_results = []
    for df in df_list:
        scores = {}
        for out_name, stem in categories:
            # Plain float arrays drop the sample's index, so the result gets a
            # fresh 0..n-1 index.
            gondor = df[f"{stem}_gondor"].to_numpy(dtype=float)
            mordor = df[f"{stem}_mordor"].to_numpy(dtype=float)
            scores[out_name] = 1 - (abs(gondor - mordor) / (gondor + mordor + 0.00001))

        lsm_frame = pd.DataFrame(scores)
        total = lsm_frame[average_order[0]]
        for name in average_order[1:]:
            total = total + lsm_frame[name]
        lsm_frame["lsm_avg"] = total / len(average_order)
        all_results.append(lsm_frame)

    return all_results

In [16]:
# Standard LSM scores for every random pseudo sample of every split
# (each variable is a list with one score table per sample).
pseudo_whole_corpus_lsm = lsm_count(pseudo_whole_corpus_sampled_dfs)
pseudo_baseline_lsm = lsm_count(pseudo_baseline_sampled_dfs)
pseudo_unimodal_lsm = lsm_count(pseudo_unimodal_sampled_dfs)
pseudo_unfamiliar_lsm = lsm_count(pseudo_unfamiliar_sampled_dfs)
pseudo_first_third_lsm = lsm_count(pseudo_first_third_sampled_dfs)
pseudo_second_third_lsm = lsm_count(pseudo_second_third_sampled_dfs)
pseudo_third_third_lsm = lsm_count(pseudo_third_third_sampled_dfs)

In [17]:
# Adapted LSM scores (word categories plus grammatical person) for every
# random pseudo sample of every split.
pseudo_whole_corpus_lsm_adapted = lsm_count_adapted(pseudo_whole_corpus_sampled_dfs)
pseudo_baseline_lsm_adapted = lsm_count_adapted(pseudo_baseline_sampled_dfs)
pseudo_unimodal_lsm_adapted = lsm_count_adapted(pseudo_unimodal_sampled_dfs)
pseudo_unfamiliar_lsm_adapted = lsm_count_adapted(pseudo_unfamiliar_sampled_dfs)
pseudo_first_third_lsm_adapted = lsm_count_adapted(pseudo_first_third_sampled_dfs)
pseudo_second_third_lsm_adapted = lsm_count_adapted(pseudo_second_third_sampled_dfs)
pseudo_third_third_lsm_adapted = lsm_count_adapted(pseudo_third_third_sampled_dfs)


## Calculate the means of LSM

In [18]:
def calculate_means(dataframe_list):
    """Summarise each DataFrame in a list by its per-column mean.

    Parameters
    ----------
    dataframe_list : iterable of pandas.DataFrame
        The frames to summarise; each is reduced with ``DataFrame.mean()``.

    Returns
    -------
    pandas.DataFrame
        One row per input frame. The first column, ``sample_no``, is the
        1-based position of the frame in ``dataframe_list``; the remaining
        columns hold the column means rounded to 2 decimals. An empty input
        yields an empty frame that has only the ``sample_no`` column.
    """
    all_means = []

    for sample_no, df in enumerate(dataframe_list, start=1):
        means = df.mean().round(2)
        # Tag the row with its position so samples stay identifiable.
        means['sample_no'] = sample_no
        all_means.append(means)

    if not all_means:
        # Without rows there is no 'sample_no' column to select, so the
        # column reordering below would raise KeyError.
        return pd.DataFrame(columns=['sample_no'])

    result_df = pd.DataFrame(all_means)

    # Move 'sample_no' to the front, keeping the other columns in order.
    ordered_cols = ['sample_no'] + [col for col in result_df.columns if col != 'sample_no']
    return result_df[ordered_cols]

In [20]:
# Per-sample column means for each list of pseudo LSM result frames.
# Result names and inputs are listed in the same order.
(
    pseudo_whole_corpus_lsm_means,
    pseudo_baseline_lsm_means,
    pseudo_unimodal_lsm_means,
    pseudo_unfamiliar_lsm_means,
    pseudo_first_third_lsm_means,
    pseudo_second_third_lsm_means,
    pseudo_third_third_lsm_means,
) = [
    calculate_means(lsm_frames)
    for lsm_frames in (
        pseudo_whole_corpus_lsm,
        pseudo_baseline_lsm,
        pseudo_unimodal_lsm,
        pseudo_unfamiliar_lsm,
        pseudo_first_third_lsm,
        pseudo_second_third_lsm,
        pseudo_third_third_lsm,
    )
]

In [35]:
# Per-sample column means for each list of adapted pseudo LSM result frames.
# Result names and inputs are listed in the same order.
(
    pseudo_whole_corpus_lsm_adapted_means,
    pseudo_baseline_lsm_adapted_means,
    pseudo_unimodal_lsm_adapted_means,
    pseudo_unfamiliar_lsm_adapted_means,
    pseudo_first_third_lsm_adapted_means,
    pseudo_second_third_lsm_adapted_means,
    pseudo_third_third_lsm_adapted_means,
) = [
    calculate_means(lsm_frames)
    for lsm_frames in (
        pseudo_whole_corpus_lsm_adapted,
        pseudo_baseline_lsm_adapted,
        pseudo_unimodal_lsm_adapted,
        pseudo_unfamiliar_lsm_adapted,
        pseudo_first_third_lsm_adapted,
        pseudo_second_third_lsm_adapted,
        pseudo_third_third_lsm_adapted,
    )
]

In [25]:
# Mean over all samples of the per-sample means, rounded to 2 decimals.
# Every column of the means table is averaged, 'sample_no' included.
mean_pseudo_whole_corpus_lsm_means = pseudo_whole_corpus_lsm_means.mean().round(2)
mean_pseudo_baseline_lsm_means = pseudo_baseline_lsm_means.mean().round(2)
mean_pseudo_unimodal_lsm_means = pseudo_unimodal_lsm_means.mean().round(2)
mean_pseudo_unfamiliar_lsm_means = pseudo_unfamiliar_lsm_means.mean().round(2)
mean_pseudo_first_third_lsm_means = pseudo_first_third_lsm_means.mean().round(2)
mean_pseudo_second_third_lsm_means = pseudo_second_third_lsm_means.mean().round(2)
mean_pseudo_third_third_lsm_means = pseudo_third_third_lsm_means.mean().round(2)

In [36]:
# Mean over all samples of the adapted per-sample means, rounded to 2 decimals.
# Every column of the means table is averaged, 'sample_no' included.
mean_pseudo_whole_corpus_lsm_adapted_means = pseudo_whole_corpus_lsm_adapted_means.mean().round(2)
mean_pseudo_baseline_lsm_adapted_means = pseudo_baseline_lsm_adapted_means.mean().round(2)
mean_pseudo_unimodal_lsm_adapted_means = pseudo_unimodal_lsm_adapted_means.mean().round(2)
mean_pseudo_unfamiliar_lsm_adapted_means = pseudo_unfamiliar_lsm_adapted_means.mean().round(2)
mean_pseudo_first_third_lsm_adapted_means = pseudo_first_third_lsm_adapted_means.mean().round(2)
mean_pseudo_second_third_lsm_adapted_means = pseudo_second_third_lsm_adapted_means.mean().round(2)
mean_pseudo_third_third_lsm_adapted_means = pseudo_third_third_lsm_adapted_means.mean().round(2)


In [65]:
# Turn each mean-of-means Series into a two-column table:
# 'Feature' (the former index labels) and 'Mean' (the values).
mean_of_means_whole_corpus = (
    mean_pseudo_whole_corpus_lsm_means.rename_axis('Feature').reset_index(name='Mean')
)
mean_of_means_baseline = (
    mean_pseudo_baseline_lsm_means.rename_axis('Feature').reset_index(name='Mean')
)
mean_of_means_unimodal = (
    mean_pseudo_unimodal_lsm_means.rename_axis('Feature').reset_index(name='Mean')
)
mean_of_means_unfamiliar = (
    mean_pseudo_unfamiliar_lsm_means.rename_axis('Feature').reset_index(name='Mean')
)
mean_of_means_first_third = (
    mean_pseudo_first_third_lsm_means.rename_axis('Feature').reset_index(name='Mean')
)
mean_of_means_second_third = (
    mean_pseudo_second_third_lsm_means.rename_axis('Feature').reset_index(name='Mean')
)
mean_of_means_third_third = (
    mean_pseudo_third_third_lsm_means.rename_axis('Feature').reset_index(name='Mean')
)

In [67]:
# Turn each adapted mean-of-means Series into a two-column table:
# 'Feature' (the former index labels) and 'Mean' (the values).
mean_of_means_whole_corpus_adapted = (
    mean_pseudo_whole_corpus_lsm_adapted_means.rename_axis('Feature').reset_index(name='Mean')
)
mean_of_means_baseline_adapted = (
    mean_pseudo_baseline_lsm_adapted_means.rename_axis('Feature').reset_index(name='Mean')
)
mean_of_means_unimodal_adapted = (
    mean_pseudo_unimodal_lsm_adapted_means.rename_axis('Feature').reset_index(name='Mean')
)
mean_of_means_unfamiliar_adapted = (
    mean_pseudo_unfamiliar_lsm_adapted_means.rename_axis('Feature').reset_index(name='Mean')
)
mean_of_means_first_third_adapted = (
    mean_pseudo_first_third_lsm_adapted_means.rename_axis('Feature').reset_index(name='Mean')
)
mean_of_means_second_third_adapted = (
    mean_pseudo_second_third_lsm_adapted_means.rename_axis('Feature').reset_index(name='Mean')
)
mean_of_means_third_third_adapted = (
    mean_pseudo_third_third_lsm_adapted_means.rename_axis('Feature').reset_index(name='Mean')
)

In [68]:
# Write the adapted mean-of-means tables to ../results, one CSV per condition.
for _frame, _csv_path in (
    (mean_of_means_whole_corpus_adapted, "../results/mean_of_means_whole_corpus_adapted.csv"),
    (mean_of_means_baseline_adapted, "../results/mean_of_means_baseline_adapted.csv"),
    (mean_of_means_unimodal_adapted, "../results/mean_of_means_unimodal_adapted.csv"),
    (mean_of_means_unfamiliar_adapted, "../results/mean_of_means_unfamiliar_adapted.csv"),
    (mean_of_means_first_third_adapted, "../results/mean_of_means_first_third_adapted.csv"),
    (mean_of_means_second_third_adapted, "../results/mean_of_means_second_third_adapted.csv"),
    (mean_of_means_third_third_adapted, "../results/mean_of_means_third_third_adapted.csv"),
):
    _frame.to_csv(_csv_path, index=False)

In [69]:
# Write the mean-of-means tables to ../results, one CSV per condition.
for _frame, _csv_path in (
    (mean_of_means_whole_corpus, "../results/mean_of_means_whole_corpus.csv"),
    (mean_of_means_baseline, "../results/mean_of_means_baseline.csv"),
    (mean_of_means_unimodal, "../results/mean_of_means_unimodal.csv"),
    (mean_of_means_unfamiliar, "../results/mean_of_means_unfamiliar.csv"),
    (mean_of_means_first_third, "../results/mean_of_means_first_third.csv"),
    (mean_of_means_second_third, "../results/mean_of_means_second_third.csv"),
    (mean_of_means_third_third, "../results/mean_of_means_third_third.csv"),
):
    _frame.to_csv(_csv_path, index=False)

## Compare the real and the pseudo LSM values

In [54]:
def compare_with_original(original_df, pseudo_df):
    """Count, per feature, how many pseudo rows reach or exceed the original value.

    Parameters
    ----------
    original_df : pandas.DataFrame
        Exactly one row; its columns define the features that are compared.
    pseudo_df : pandas.DataFrame
        One row per pseudo sample. It must contain every column of
        ``original_df``; extra columns are ignored.

    Returns
    -------
    pandas.DataFrame
        Columns ``Feature`` (the column names of ``original_df``, in order)
        and ``Higher_than_original`` (number of rows of ``pseudo_df`` whose
        value is greater than or equal to the original value).

    Raises
    ------
    ValueError
        If ``original_df`` does not contain exactly one row.
    """
    if len(original_df) != 1:
        raise ValueError("Original DataFrame must contain exactly one row")

    features = list(original_df.columns)
    original_values = original_df.iloc[0].to_numpy()

    # One vectorised comparison instead of a nested Python loop over rows and
    # columns. NaN on either side compares as False, exactly as the
    # element-wise `>=` did.
    reached = pseudo_df[features].to_numpy() >= original_values
    higher_counts = reached.sum(axis=0).tolist()

    return pd.DataFrame({
        'Feature': features,
        'Higher_than_original': higher_counts
    })

In [32]:
# For every condition, count how many pseudo samples reach the one-row
# reference frame. Result names and (reference, pseudo) pairs share one order.
(
    whole_corpus_lsm_compare,
    baseline_lsm_compare,
    unimodal_lsm_compare,
    unfamiliar_lsm_compare,
    first_third_lsm_compare,
    second_third_lsm_compare,
    third_third_lsm_compare,
) = [
    compare_with_original(reference_row, pseudo_means)
    for reference_row, pseudo_means in (
        (whole_data_lsm_means_transformed, pseudo_whole_corpus_lsm_means),
        (baseline_lsm_means_transformed, pseudo_baseline_lsm_means),
        (unimodal_lsm_means_transformed, pseudo_unimodal_lsm_means),
        (unfamiliar_lsm_means_transformed, pseudo_unfamiliar_lsm_means),
        (first_third_lsm_means_transformed, pseudo_first_third_lsm_means),
        (second_third_lsm_means_transformed, pseudo_second_third_lsm_means),
        (third_third_lsm_means_transformed, pseudo_third_third_lsm_means),
    )
]

In [53]:
# Load the adapted LSM results and reduce them to column means (2 decimals).
proba = pd.read_csv('../results/lsm_results_adapted.csv')
proba_mean = proba.mean().round(2)

# Long form: one row per feature with its mean.
proba_means = proba_mean.rename_axis('Feature').reset_index(name='Real_Mean')

# Wide form: a single row with one column per feature; the 'pair_no' column
# is renamed to 'sample_no'.
proba_means_transformed = pd.DataFrame({
    feature: [real_mean]
    for feature, real_mean in zip(proba_means['Feature'], proba_means['Real_Mean'])
}).rename(columns={"pair_no": "sample_no"})

In [56]:
# For every condition, count how many adapted pseudo samples reach the
# one-row reference frame.
whole_corpus_lsm_adapted_compare = compare_with_original(whole_data_lsm_means_adapted_transformed, pseudo_whole_corpus_lsm_adapted_means)
baseline_lsm_adapted_compare = compare_with_original(baseline_lsm_means_adapted_transformed, pseudo_baseline_lsm_adapted_means)
unimodal_lsm_adapted_compare = compare_with_original(unimodal_lsm_means_adapted_transformed, pseudo_unimodal_lsm_adapted_means)
unfamiliar_lsm_adapted_compare = compare_with_original(unfamiliar_lsm_means_adapted_transformed, pseudo_unfamiliar_lsm_adapted_means)
first_third_lsm_adapted_compare = compare_with_original(first_third_lsm_means_adapted_transformed, pseudo_first_third_lsm_adapted_means)
# Backward-compatible alias: other cells refer to this result under the
# misspelled name ("apated"), so keep it bound to the same object.
first_third_lsm_apated_compare = first_third_lsm_adapted_compare
second_third_lsm_adapted_compare = compare_with_original(second_third_lsm_means_adapted_transformed, pseudo_second_third_lsm_adapted_means)
third_third_lsm_adapted_compare = compare_with_original(third_third_lsm_means_adapted_transformed, pseudo_third_third_lsm_adapted_means)


In [40]:
# Write the per-condition comparison tables to ../results.
for _frame, _csv_path in (
    (whole_corpus_lsm_compare, "../results/whole_corpus_lsm_compare.csv"),
    (baseline_lsm_compare, "../results/baseline_lsm_compare.csv"),
    (unimodal_lsm_compare, "../results/unimodal_lsm_compare.csv"),
    (unfamiliar_lsm_compare, "../results/unfamiliar_lsm_compare.csv"),
    (first_third_lsm_compare, "../results/first_third_lsm_compare.csv"),
    (second_third_lsm_compare, "../results/second_third_lsm_compare.csv"),
    (third_third_lsm_compare, "../results/third_third_lsm_compare.csv"),
):
    _frame.to_csv(_csv_path, index=False)



In [159]:
# Write the per-condition adapted comparison tables to ../results.
# NOTE: the "apated" spelling in the first-third file name is kept on purpose;
# a later cell reads the file back under exactly this name.
for _frame, _csv_path in (
    (whole_corpus_lsm_adapted_compare, "../results/whole_corpus_lsm_adapted_compare.csv"),
    (baseline_lsm_adapted_compare, "../results/baseline_lsm_adapted_compare.csv"),
    (unimodal_lsm_adapted_compare, "../results/unimodal_lsm_adapted_compare.csv"),
    (unfamiliar_lsm_adapted_compare, "../results/unfamiliar_lsm_adapted_compare.csv"),
    (first_third_lsm_apated_compare, "../results/first_third_lsm_apated_compare.csv"),
    (second_third_lsm_adapted_compare, "../results/second_third_lsm_adapted_compare.csv"),
    (third_third_lsm_adapted_compare, "../results/third_third_lsm_adapted_compare.csv"),
):
    _frame.to_csv(_csv_path, index=False)

In [139]:
# Whole corpus: reload the comparison counts and the pseudo mean-of-means,
# then left-join both onto 'Feature' together with whole_data_lsm_means
# (presumably the real-data means, defined in another cell).
whole_compare = pd.read_csv("../results/whole_corpus_lsm_compare.csv")
whole_pseudo_mean = pd.read_csv("../results/mean_of_means_whole_corpus.csv")
LSM_whole = whole_compare.merge(whole_data_lsm_means, how='left', on='Feature')
LSM_whole_compare = LSM_whole.merge(whole_pseudo_mean, how='left', on='Feature')
LSM_whole_compare.to_csv("../results/LSM_whole_compare.csv", index=False)

In [140]:
# Baseline: reload the comparison counts and the pseudo mean-of-means,
# then left-join both onto 'Feature' together with baseline_lsm_means
# (presumably the real-data means, defined in another cell).
baseline_compare = pd.read_csv("../results/baseline_lsm_compare.csv")
baseline_pseudo_mean = pd.read_csv("../results/mean_of_means_baseline.csv")
LSM_baseline = baseline_compare.merge(baseline_lsm_means, how='left', on='Feature')
LSM_baseline_compare = LSM_baseline.merge(baseline_pseudo_mean, how='left', on='Feature')
LSM_baseline_compare.to_csv("../results/LSM_baseline_compare.csv", index=False)


In [141]:
# Unimodal: reload the comparison counts and the pseudo mean-of-means,
# then left-join both onto 'Feature' together with unimodal_lsm_means
# (presumably the real-data means, defined in another cell).
unimodal_compare = pd.read_csv("../results/unimodal_lsm_compare.csv")
unimodal_pseudo_mean = pd.read_csv("../results/mean_of_means_unimodal.csv")
LSM_unimodal = unimodal_compare.merge(unimodal_lsm_means, how='left', on='Feature')
LSM_unimodal_compare = LSM_unimodal.merge(unimodal_pseudo_mean, how='left', on='Feature')
LSM_unimodal_compare.to_csv("../results/LSM_unimodal_compare.csv", index=False)

In [142]:
# Unfamiliar: reload the comparison counts and the pseudo mean-of-means,
# then left-join both onto 'Feature' together with unfamiliar_lsm_means
# (presumably the real-data means, defined in another cell).
unfamiliar_compare = pd.read_csv("../results/unfamiliar_lsm_compare.csv")
unfamiliar_pseudo_mean = pd.read_csv("../results/mean_of_means_unfamiliar.csv")
LSM_unfamiliar = unfamiliar_compare.merge(unfamiliar_lsm_means, how='left', on='Feature')
LSM_unfamiliar_compare = LSM_unfamiliar.merge(unfamiliar_pseudo_mean, how='left', on='Feature')
LSM_unfamiliar_compare.to_csv("../results/LSM_unfamiliar_compare.csv", index=False)


In [143]:
# First third: reload the comparison counts and the pseudo mean-of-means,
# then left-join both onto 'Feature' together with first_third_lsm_means
# (presumably the real-data means, defined in another cell).
first_third_compare = pd.read_csv("../results/first_third_lsm_compare.csv")
first_third_pseudo_mean = pd.read_csv("../results/mean_of_means_first_third.csv")
LSM_first_third = first_third_compare.merge(first_third_lsm_means, how='left', on='Feature')
LSM_first_third_compare = LSM_first_third.merge(first_third_pseudo_mean, how='left', on='Feature')
LSM_first_third_compare.to_csv("../results/LSM_first_third_compare.csv", index=False)

In [153]:
# Second third: reload the comparison counts and the pseudo mean-of-means,
# then left-join both onto 'Feature' together with second_third_lsm_means
# (presumably the real-data means, defined in another cell).
second_third_compare = pd.read_csv("../results/second_third_lsm_compare.csv")
second_third_pseudo_mean = pd.read_csv("../results/mean_of_means_second_third.csv")
LSM_second_third = second_third_compare.merge(second_third_lsm_means, how='left', on='Feature')
LSM_second_third_compare = LSM_second_third.merge(second_third_pseudo_mean, how='left', on='Feature')
LSM_second_third_compare.to_csv("../results/LSM_second_third_compare.csv", index=False)


In [144]:
# Third third: reload the comparison counts and the pseudo mean-of-means,
# then left-join both onto 'Feature' together with third_third_lsm_means
# (presumably the real-data means, defined in another cell).
third_third_compare = pd.read_csv("../results/third_third_lsm_compare.csv")
third_third_pseudo_mean = pd.read_csv("../results/mean_of_means_third_third.csv")
LSM_third_third = third_third_compare.merge(third_third_lsm_means, how='left', on='Feature')
LSM_third_third_compare = LSM_third_third.merge(third_third_pseudo_mean, how='left', on='Feature')
LSM_third_third_compare.to_csv("../results/LSM_third_third_compare.csv", index=False)

In [160]:
# Whole corpus (adapted LSM): reload the comparison counts and the pseudo
# mean-of-means, then left-join both onto 'Feature' together with
# whole_data_lsm_means_adapted (presumably the real-data means, defined in
# another cell).
whole_compare_adapted = pd.read_csv("../results/whole_corpus_lsm_adapted_compare.csv")
whole_pseudo_mean_adapted = pd.read_csv("../results/mean_of_means_whole_corpus_adapted.csv")
LSM_whole_adapted = whole_compare_adapted.merge(whole_data_lsm_means_adapted, how='left', on='Feature')
LSM_whole_adapted_compare = LSM_whole_adapted.merge(whole_pseudo_mean_adapted, how='left', on='Feature')
LSM_whole_adapted_compare.to_csv("../results/LSM_whole_adapted_compare.csv", index=False)

In [161]:
# Baseline (adapted LSM): reload the comparison counts and the pseudo
# mean-of-means, then left-join both onto 'Feature' together with
# baseline_lsm_means_adapted (presumably the real-data means, defined in
# another cell).
baseline_compare_adapted = pd.read_csv("../results/baseline_lsm_adapted_compare.csv")
baseline_pseudo_mean_adapted = pd.read_csv("../results/mean_of_means_baseline_adapted.csv")
LSM_baseline_adapted = baseline_compare_adapted.merge(baseline_lsm_means_adapted, how='left', on='Feature')
LSM_baseline_adapted_compare = LSM_baseline_adapted.merge(baseline_pseudo_mean_adapted, how='left', on='Feature')
LSM_baseline_adapted_compare.to_csv("../results/LSM_baseline_adapted_compare.csv", index=False)

In [162]:
# Unimodal (adapted LSM): reload the comparison counts and the pseudo
# mean-of-means, then left-join both onto 'Feature' together with
# unimodal_lsm_means_adapted (presumably the real-data means, defined in
# another cell).
unimodal_compare_adapted = pd.read_csv("../results/unimodal_lsm_adapted_compare.csv")
unimodal_pseudo_mean_adapted = pd.read_csv("../results/mean_of_means_unimodal_adapted.csv")
LSM_unimodal_adapted = unimodal_compare_adapted.merge(unimodal_lsm_means_adapted, how='left', on='Feature')
LSM_unimodal_adapted_compare = LSM_unimodal_adapted.merge(unimodal_pseudo_mean_adapted, how='left', on='Feature')
LSM_unimodal_adapted_compare.to_csv("../results/LSM_unimodal_adapted_compare.csv", index=False)


In [163]:
# Unfamiliar (adapted LSM): reload the comparison counts and the pseudo
# mean-of-means, then left-join both onto 'Feature' together with
# unfamiliar_lsm_means_adapted (presumably the real-data means, defined in
# another cell).
unfamiliar_compare_adapted = pd.read_csv("../results/unfamiliar_lsm_adapted_compare.csv")
unfamiliar_pseudo_mean_adapted = pd.read_csv("../results/mean_of_means_unfamiliar_adapted.csv")
LSM_unfamiliar_adapted = unfamiliar_compare_adapted.merge(unfamiliar_lsm_means_adapted, how='left', on='Feature')
LSM_unfamiliar_adapted_compare = LSM_unfamiliar_adapted.merge(unfamiliar_pseudo_mean_adapted, how='left', on='Feature')
LSM_unfamiliar_adapted_compare.to_csv("../results/LSM_unfamiliar_adapted_compare.csv", index=False)

In [164]:
# First third (adapted LSM): reload the comparison counts and the pseudo
# mean-of-means, then left-join both onto 'Feature' together with
# first_third_lsm_means_adapted (presumably the real-data means, defined in
# another cell).
# NOTE: the "apated" spelling in the input file name matches the name under
# which the export cell writes this file.
first_third_compare_adapted = pd.read_csv("../results/first_third_lsm_apated_compare.csv")
first_third_pseudo_mean_adapted = pd.read_csv("../results/mean_of_means_first_third_adapted.csv")
LSM_first_third_adapted = first_third_compare_adapted.merge(first_third_lsm_means_adapted, how='left', on='Feature')
LSM_first_third_adapted_compare = LSM_first_third_adapted.merge(first_third_pseudo_mean_adapted, how='left', on='Feature')
LSM_first_third_adapted_compare.to_csv("../results/LSM_first_third_adapted_compare.csv", index=False)


In [165]:
# Second third (adapted LSM): reload the comparison counts and the pseudo
# mean-of-means, then left-join both onto 'Feature' together with
# second_third_lsm_means_adapted (presumably the real-data means, defined in
# another cell).
second_third_compare_adapted = pd.read_csv("../results/second_third_lsm_adapted_compare.csv")
second_third_pseudo_mean_adapted = pd.read_csv("../results/mean_of_means_second_third_adapted.csv")
LSM_second_third_adapted = second_third_compare_adapted.merge(second_third_lsm_means_adapted, how='left', on='Feature')
LSM_second_third_adapted_compare = LSM_second_third_adapted.merge(second_third_pseudo_mean_adapted, how='left', on='Feature')
LSM_second_third_adapted_compare.to_csv("../results/LSM_second_third_adapted_compare.csv", index=False)

In [166]:
# Third third (adapted LSM): reload the comparison counts and the pseudo
# mean-of-means, then left-join both onto 'Feature' together with
# third_third_lsm_means_adapted (presumably the real-data means, defined in
# another cell).
third_third_compare_adapted = pd.read_csv("../results/third_third_lsm_adapted_compare.csv")
third_third_pseudo_mean_adapted = pd.read_csv("../results/mean_of_means_third_third_adapted.csv")
LSM_third_third_adapted = third_third_compare_adapted.merge(third_third_lsm_means_adapted, how='left', on='Feature')
LSM_third_third_adapted_compare = LSM_third_third_adapted.merge(third_third_pseudo_mean_adapted, how='left', on='Feature')
LSM_third_third_adapted_compare.to_csv("../results/LSM_third_third_adapted_compare.csv", index=False)
