In [7]:
import pickle
import pandas as pd
from scipy.stats import pearsonr
from scipy.stats import spearmanr
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
import scipy.stats as stats

In [16]:
whole_data_df = pd.read_csv('../results/linguistic_analysis_condition_mordor_gondor_124.csv')
baseline_df = pd.read_csv('../results/baseline_linguistic_analysis_124.csv')
unimodal_df = pd.read_csv('../results/unimodal_linguistic_analysis_124.csv')
unfamiliar_df = pd.read_csv('../results/unfamiliar_linguistic_analysis_124.csv')

pseudo_whole_data_df = pd.read_csv('../results/pseudo_pairs_whole_corpus_linguistic_analysis.csv')
pseudo_baseline_df = pd.read_csv('../results/pseudo_pairs_baseline_linguistic_analysis.csv')
pseudo_unimodal_df = pd.read_csv('../results/pseudo_pairs_unimodal_linguistic_analysis.csv')
pseudo_unfamiliar_df = pd.read_csv('../results/pseudo_pairs_unfamiliar_linguistic_analysis.csv')

first_third_df = pd.read_csv('../results/linguistic_analysis_condition_mordor_gondor_first_third_124.csv')
second_third_df = pd.read_csv('../results/linguistic_analysis_condition_mordor_gondor_second_third_124.csv')
third_third_df = pd.read_csv('../results/linguistic_analysis_condition_mordor_gondor_third_third_124.csv')

In [17]:
pseudo_first_third_df = pd.read_csv('../results/pseudo_pairs_first_third_linguistic_analysis.csv')
pseudo_second_third_df = pd.read_csv('../results/pseudo_pairs_second_third_linguistic_analysis.csv')
pseudo_third_third_df = pd.read_csv('../results/pseudo_pairs_third_third_linguistic_analysis.csv')

## Calculate correlation

In [26]:
def Correlations(df_name):
    columns = [
        "lexical_diversity", "speech_ratio", "word_avg", "hes", "laugh", "question", "hum",
        "s_first_person", "s_second_person", "s_third_person", "p_first_person", "p_second_person",
        "p_third_person", "present", "past", "future", "cond", "verb", "noun", "adj", "adv", "num",
        "article", "preverb", "postverb", "pronoun", "conjunction", "interjection", "det", "funct_words",
        "negation", "happiness", "sadness", "anger", "fear", "disgust", "no_emotion", "positive",
        "negative", "neutral"]
    results = []
    for col in columns:
        corr_pearson, p_value_pearson = pearsonr(df_name[f"{col}_mordor"], df_name[f"{col}_gondor"])
        corr_spearman, p_value_spearman = spearmanr(df_name[f"{col}_mordor"], df_name[f"{col}_gondor"])
        results.append({
            "Feature": f"{col}",
            #"Correlation_spearman": corr_spearman,
            "Correlation_pearson": round(corr_pearson, 2),
            "P-Value": round(p_value_pearson, 2),
            "P-Value_<_0.005": p_value_pearson < 0.005
        })

    df_correlations = pd.DataFrame(results)
    # df_correlations_filtered = df_correlations[df_correlations["P-Value < 0.005"] == True]
    return df_correlations

In [27]:
whole_data_correlations = Correlations(whole_data_df)
baseline_correlations = Correlations(baseline_df)
unimodal_correlations = Correlations(unimodal_df)
unfamiliar_correlations = Correlations(unfamiliar_df)
pseudo_whole_data_correlations = Correlations(pseudo_whole_data_df)
pseudo_baseline_correlations = Correlations(pseudo_baseline_df)
pseudo_unimodal_correlations = Correlations(pseudo_unimodal_df)
pseudo_unfamiliar_correlations = Correlations(pseudo_unfamiliar_df)


In [28]:
first_third_correlations = Correlations(first_third_df)
second_third_correlations = Correlations(second_third_df)
third_third_correlations = Correlations(third_third_df)

In [29]:
pseudo_first_third_correlations = Correlations(pseudo_first_third_df)
pseudo_second_third_correlations = Correlations(pseudo_second_third_df)
pseudo_third_third_correlations = Correlations(pseudo_third_third_df)

In [31]:
whole_data_correlations.to_csv("../results/correlations_whole_corpus.csv", index=False)
baseline_correlations.to_csv("../results/correlations_baseline.csv", index = False)
unimodal_correlations.to_csv("../results/correlations_unimodal.csv", index = False)
unfamiliar_correlations.to_csv("../results/correlations_unfamiliar.csv", index = False)
pseudo_whole_data_correlations.to_csv("../results/correlations_pseudo_whole_corpus.csv", index = False)
pseudo_baseline_correlations.to_csv("../results/correlations_pseudo_baseline.csv", index = False)
pseudo_unimodal_correlations.to_csv("../results/correlations_pseudo_unimodal.csv", index = False)
pseudo_unfamiliar_correlations.to_csv("../results/correlations_pseudo_unfamiliar.csv", index = False)

In [32]:
first_third_correlations.to_csv("../results/correlations_first_third.csv", index = False)
second_third_correlations.to_csv("../results/correlations_second_third.csv", index = False)
third_third_correlations.to_csv("../results/correlations_third_third.csv", index = False)

In [33]:
pseudo_first_third_correlations.to_csv("../results/correlations_pseudo_first_third.csv", index = False)
pseudo_second_third_correlations.to_csv("../results/correlations_pseudo_second_third.csv", index = False)
pseudo_third_third_correlations.to_csv("../results/correlations_pseudo_third_third.csv", index = False)

In [34]:
new_dataframe = pd.merge(whole_data_correlations, baseline_correlations, on="Feature", suffixes=('_whole', '_baseline'))
new_dataframe = pd.merge(new_dataframe, unimodal_correlations, on="Feature", suffixes=('', '_unimod'))
new_dataframe = pd.merge(new_dataframe, unfamiliar_correlations, on="Feature", suffixes=('', '_unfamiliar'))
new_dataframe = pd.merge(new_dataframe, first_third_correlations, on="Feature", suffixes=('', '_first_third'))
new_dataframe = pd.merge(new_dataframe, second_third_correlations, on="Feature", suffixes=('', '_second_third'))
new_dataframe = pd.merge(new_dataframe, third_third_correlations, on="Feature", suffixes=('', '_third_third'))

In [35]:
new_dataframe.to_csv("../results/correlation_summary.csv", index=False)

In [47]:
correlation_summary = new_dataframe.copy()
filtered_columns = [col for col in correlation_summary.columns if "P-Value_<_0.005" in col]
filtered_columns.insert(0, "Feature")  # Keep the Feature column
correlation_summary_filtered = correlation_summary[filtered_columns]


In [54]:
correlation_summary_filtered.rename(columns={'P-Value_<_0.005_whole': 'whole_data', 'P-Value_<_0.005_baseline': 'baseline', 'P-Value_<_0.005': 'unimodal', 'P-Value_<_0.005_unfamiliar': 'unfamiliar', 'P-Value_<_0.005_first_third': 'first_third', 'P-Value_<_0.005_second_third': 'second_third', 'P-Value_<_0.005_third_third': 'third_third'}, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  correlation_summary_filtered.rename(columns={'P-Value_<_0.005_whole': 'whole_data', 'P-Value_<_0.005_baseline': 'baseline', 'P-Value_<_0.005': 'unimodal', 'P-Value_<_0.005_unfamiliar': 'unfamiliar', 'P-Value_<_0.005_first_third': 'first_third', 'P-Value_<_0.005_second_third': 'second_third', 'P-Value_<_0.005_third_third': 'third_third'}, inplace=True)


In [56]:
correlation_summary_filtered.to_csv("../results/correlation_summary_filtered.csv", index=False)

## Calculate linear regression

## Linear regression

In [6]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
from scipy import stats

def calculate_linear_regression(df_name):
    features = [
        "lexical_diversity", "speech_ratio", "word_avg", "hes", "laugh", "question", "hum",
        "s_first_person", "s_second_person", "s_third_person", "p_first_person", "p_second_person", 
        "p_third_person", "present", "past", "future", "cond", "verb", "noun", "adj", "adv", "num",
        "article", "preverb", "postverb", "pronoun", "conjunction", "interjection", "det", 
        "funct_words", "negation", "happiness", "sadness", "anger", "fear", "disgust", 
        "no_emotion", "positive", "negative", "neutral"
    ]
    
    results = []
    for feature in features:
        # Prepare data
        X = df_name[f"{feature}_mordor"].values.reshape(-1, 1)
        y = df_name[f"{feature}_gondor"].values
        n = len(y)
        
        # Fit linear regression
        model = LinearRegression()
        model.fit(X, y)
        
        # Calculate predictions and R-squared
        y_pred = model.predict(X)
        r2 = r2_score(y, y_pred)
        
        # Calculate standard error and t-statistic
        residuals = y - y_pred.ravel()
        mse = np.sum(residuals**2) / (n - 2)
        std_err = np.sqrt(mse / np.sum((X - np.mean(X))**2))
        t_stat = model.coef_[0] / std_err
        
        # Calculate p-value
        p_value = 2 * (1 - stats.t.cdf(abs(t_stat), n - 2))
        
        results.append({
            'Feature': feature,
            'Coefficient': model.coef_[0],
            'Intercept': model.intercept_,
            'R_squared': r2,
            'Std_error': std_err,
            'T_statistic': t_stat,
            'P_value': p_value,
            'Significant': p_value < 0.003
        })
    
    return pd.DataFrame(results)

In [7]:
regression_whole_data = calculate_linear_regression(whole_data_df)
regression_pseudo_whole_data = calculate_linear_regression(pseudo_whole_data_df)
regression_pseudo_baseline = calculate_linear_regression(pseudo_baseline_df)
regression_baseline = calculate_linear_regression(baseline_df)
regression_pseudo_unimodal = calculate_linear_regression(pseudo_unimodal_df)
regression_unimodal = calculate_linear_regression(unimodal_df)
regression_pseudo_unfamiliar = calculate_linear_regression(pseudo_unfamiliar_df)
regression_unfamiliar = calculate_linear_regression(unfamiliar_df)
regression_first_third = calculate_linear_regression(first_third_df)
regression_second_third = calculate_linear_regression(second_third_df)
regression_third_third = calculate_linear_regression(third_third_df)
regression_pseudo_first_third = calculate_linear_regression(pseudo_first_third_df)
regression_pseudo_second_third = calculate_linear_regression(pseudo_second_third_df)
regression_pseudo_third_third = calculate_linear_regression(pseudo_third_third_df)

In [8]:
regression_whole_data.to_csv("../results/whole_data_regression.csv", index=False)
regression_pseudo_whole_data.to_csv("../results/pseudo_whole_data_regression.csv", index=False)
regression_baseline.to_csv("../results/baseline_regression.csv", index=False)
regression_pseudo_baseline.to_csv("../results/pseudo_baseline_regression.csv", index=False)
regression_unimodal.to_csv("../results/unimodal_regression.csv", index=False)
regression_pseudo_unimodal.to_csv("../results/pseudo_unimodal_regression.csv", index=False)
regression_unfamiliar.to_csv("../results/unfamiliar_regression.csv", index=False)
regression_pseudo_unfamiliar.to_csv("../results/pseudo_unfamiliar_regression.csv", index=False)
regression_first_third.to_csv("../results/first_third_regression.csv", index=False)
regression_second_third.to_csv("../results/second_third_regression.csv", index=False)
regression_third_third.to_csv("../results/third_third_regression.csv", index=False)
regression_pseudo_first_third.to_csv("../results/pseudo_first_third_regression.csv", index=False)
regression_pseudo_second_third.to_csv("../results/pseudo_second_third_regression.csv", index=False)
regression_pseudo_third_third.to_csv("../results/pseudo_third_third_regression.csv", index=False)



## Calculate LSM

In [17]:
def lsm_count(df_name):
    lsm_results = []
    for index, row in df_name.iterrows():
        lsm_adverb = 1 - ((abs(row['adv_gondor'] - row['adv_mordor'])) / (row['adv_gondor'] + row['adv_mordor'] + 0.00001))
        lsm_article = 1 - ((abs(row['article_gondor'] - row['article_mordor'])) / (row['article_gondor'] + row['article_mordor'] + 0.00001))
        lsm_preverb = 1 - ((abs(row['preverb_gondor'] - row['preverb_mordor'])) / (row['preverb_gondor'] + row['preverb_mordor'] + 0.00001))
        lsm_postverb = 1 - ((abs(row['postverb_gondor'] - row['postverb_mordor'])) / (row['postverb_gondor'] + row['postverb_mordor'] + 0.00001))
        lsm_pronoun = 1 - ((abs(row['pronoun_gondor'] - row['pronoun_mordor'])) / (row['pronoun_gondor'] + row['pronoun_mordor'] + 0.00001))
        lsm_conjunction = 1 - ((abs(row['conjunction_gondor'] - row['conjunction_mordor'])) / (row['conjunction_gondor'] + row['conjunction_mordor'] + 0.00001))
        lsm_interjection = 1 - ((abs(row['interjection_gondor'] - row['interjection_mordor'])) / (row['interjection_gondor'] + row['interjection_mordor'] + 0.00001))
        lsm_det = 1 - ((abs(row['det_gondor'] - row['det_mordor'])) / (row['det_gondor'] + row['det_mordor'] + 0.00001))
        lsm_avg = (lsm_adverb + lsm_article + lsm_interjection + lsm_conjunction + lsm_pronoun + lsm_postverb + lsm_preverb + lsm_det)  / 8
        pair_no = int(row['PairNo'])
        lsm_results.append({"pair_no": pair_no, "lsm_article": lsm_article, "lsm_preverb": lsm_preverb, "lsm_postverb": lsm_postverb, "lsm_pronoun": lsm_pronoun, "lsm_conjunction": lsm_conjunction, "lsm_interjection": lsm_interjection, "lsm_adverb": lsm_adverb, "lsm_avg": lsm_avg})
        lsm_results_df = pd.DataFrame(lsm_results)
    return lsm_results_df

In [18]:
whole_data_lsm = lsm_count(whole_data_df)
baseline_lsm = lsm_count(baseline_df)
unimodal_lsm = lsm_count(unimodal_df)
unfamiliar_lsm = lsm_count(unfamiliar_df)
first_third_lsm = lsm_count(first_third_df)
second_third_lsm = lsm_count(second_third_df)
third_third_lsm = lsm_count(third_third_df)

In [19]:
whole_data_lsm.to_csv("../results/whole_data_lsm.csv", index=False)
baseline_lsm.to_csv("../results/baseline_lsm.csv", index=False)
unimodal_lsm.to_csv("../results/unimodal_lsm.csv", index=False)
unfamiliar_lsm.to_csv("../results/unfamiliar_lsm.csv", index=False)
first_third_lsm.to_csv("../results/first_third_lsm.csv", index=False)
second_third_lsm.to_csv("../results/second_third_lsm.csv", index=False)
third_third_lsm.to_csv("../results/third_third_lsm.csv", index=False)

## Calculate the adapted LSM

In [20]:
def lsm_count_adapted(df_name):
    lsm_results_adapted = []
    for index, row in df_name.iterrows():
        lsm_adverb = 1 - ((abs(row['adv_gondor'] - row['adv_mordor'])) / (row['adv_gondor'] + row['adv_mordor'] + 0.00001))
        lsm_article = 1 - ((abs(row['article_gondor'] - row['article_mordor'])) / (row['article_gondor'] + row['article_mordor'] + 0.00001))
        lsm_preverb = 1 - ((abs(row['preverb_gondor'] - row['preverb_mordor'])) / (row['preverb_gondor'] + row['preverb_mordor'] + 0.00001))
        lsm_postverb = 1 - ((abs(row['postverb_gondor'] - row['postverb_mordor'])) / (row['postverb_gondor'] + row['postverb_mordor'] + 0.00001))
        lsm_pronoun = 1 - ((abs(row['pronoun_gondor'] - row['pronoun_mordor'])) / (row['pronoun_gondor'] + row['pronoun_mordor'] + 0.00001))
        lsm_conjunction = 1 - ((abs(row['conjunction_gondor'] - row['conjunction_mordor'])) / (row['conjunction_gondor'] + row['conjunction_mordor'] + 0.00001))
        lsm_interjection = 1 - ((abs(row['interjection_gondor'] - row['interjection_mordor'])) / (row['interjection_gondor'] + row['interjection_mordor'] + 0.00001))
        lsm_det = 1 - ((abs(row['det_gondor'] - row['det_mordor'])) / (row['det_gondor'] + row['det_mordor'] + 0.00001))
        lsm_s_first_person = 1 - ((abs(row['s_first_person_gondor'] - row['s_first_person_mordor'])) / (row['s_first_person_gondor'] + row['s_first_person_mordor'] + 0.00001))
        lsm_s_second_person = 1 - ((abs(row['s_second_person_gondor'] - row['s_second_person_mordor'])) / (row['s_second_person_gondor'] + row['s_second_person_mordor'] + 0.00001))
        lsm_s_third_person = 1 - ((abs(row['s_third_person_gondor'] - row['s_third_person_mordor'])) / (row['s_third_person_gondor'] + row['s_third_person_mordor'] + 0.00001))
        lsm_p_first_person = 1 - ((abs(row['p_first_person_gondor'] - row['p_first_person_mordor'])) / (row['p_first_person_gondor'] + row['p_first_person_mordor'] + 0.00001))
        lsm_p_second_person = 1 - ((abs(row['p_second_person_gondor'] - row['p_second_person_mordor'])) / (row['p_second_person_gondor'] + row['p_second_person_mordor'] + 0.00001))
        lsm_p_third_person = 1 - ((abs(row['p_third_person_gondor'] - row['p_third_person_mordor'])) / (row['p_third_person_gondor'] + row['p_third_person_mordor'] + 0.00001))
        lsm_avg = (lsm_adverb + lsm_article + lsm_interjection + lsm_conjunction + lsm_pronoun + lsm_postverb + lsm_preverb + lsm_det + lsm_p_first_person + lsm_p_second_person + lsm_p_third_person + lsm_s_first_person + lsm_s_second_person + lsm_s_third_person) / 14
        pair_no = int(row['PairNo'])
        lsm_results_adapted.append({"pair_no": pair_no, "lsm_article": lsm_article, "lsm_preverb": lsm_preverb, "lsm_postverb": lsm_postverb, "lsm_pronoun": lsm_pronoun, "lsm_conjunction": lsm_conjunction, "lsm_interjection": lsm_interjection, "lsm_adverb": lsm_adverb, "lsm_avg": lsm_avg, "lsm_s_first_person": lsm_s_first_person, "lsm_s_second_person": lsm_s_second_person, "lsm_s_third_person": lsm_s_third_person, "lsm_p_first_person": lsm_p_first_person, "lsm_p_second_person": lsm_p_second_person, "lsm_p_third_person": lsm_p_third_person})
        lsm_results_df_adapted = pd.DataFrame(lsm_results_adapted)
    return lsm_results_df_adapted

In [21]:
whole_data_lsm_adapted = lsm_count_adapted(whole_data_df)
baseline_lsm_adapted = lsm_count_adapted(baseline_df)
unimodal_lsm_adapted = lsm_count_adapted(unimodal_df)
unfamiliar_lsm_adapted = lsm_count_adapted(unfamiliar_df)
first_third_lsm_adapted = lsm_count_adapted(first_third_df)
second_third_lsm_adapted = lsm_count_adapted(second_third_df)
third_third_lsm_adapted = lsm_count_adapted(third_third_df)

In [22]:
whole_data_lsm_adapted.to_csv("../results/whole_data_lsm_adapted.csv", index=False)
baseline_lsm_adapted.to_csv("../results/baseline_lsm_adapted.csv", index=False)
unimodal_lsm_adapted.to_csv("../results/unimodal_lsm_adapted.csv", index=False)
unfamiliar_lsm_adapted.to_csv("../results/unfamiliar_lsm_adapted.csv", index=False)
first_third_lsm_adapted.to_csv("../results/first_third_lsm_adapted.csv", index=False)
second_third_lsm_adapted.to_csv("../results/second_third_lsm_adapted.csv", index=False)
third_third_lsm_adapted.to_csv("../results/third_third_lsm_adapted.csv", index=False)

In [23]:
# calculate mean of lsm_avg
whole_data_mean_lsm = round(whole_data_lsm["lsm_avg"].mean(), 2)
baseline_mean_lsm = round(baseline_lsm["lsm_avg"].mean(), 2)
unimodal_mean_lsm = round(unimodal_lsm["lsm_avg"].mean(), 2)
unfamiliar_mean_lsm = round(unfamiliar_lsm["lsm_avg"].mean(), 2)
first_third_mean_lsm = round(first_third_lsm["lsm_avg"].mean(), 2)
second_third_mean_lsm = round(second_third_lsm["lsm_avg"].mean(), 2)
third_third_mean_lsm = round(third_third_lsm["lsm_avg"].mean(), 2)

In [24]:
mean_lsm_df = pd.DataFrame({
    "condition": ["whole_data", "baseline", "unimodal", "unfamiliar", "first_third", "second_third", "third_third"],
    "mean_lsm": [whole_data_mean_lsm, baseline_mean_lsm, unimodal_mean_lsm, unfamiliar_mean_lsm, first_third_df, second_third_mean_lsm, third_third_mean_lsm]
})


In [25]:
mean_lsm_df.to_csv("../results/mean_lsm.csv", index=False)

In [26]:
whole_data_mean_lsm_adapted = round(whole_data_lsm_adapted["lsm_avg"].mean(), 2)
baseline_mean_lsm_adapted = round(baseline_lsm_adapted["lsm_avg"].mean(), 2)
unimodal_mean_lsm_adapted = round(unimodal_lsm_adapted["lsm_avg"].mean(), 2)
unfamiliar_mean_lsm_adapted = round(unfamiliar_lsm_adapted["lsm_avg"].mean(), 2)
first_third_mean_lsm_adapted = round(first_third_lsm_adapted["lsm_avg"].mean(), 2)
second_third_mean_lsm_adapted = round(second_third_lsm_adapted["lsm_avg"].mean(), 2)
third_third_mean_lsm_adapted = round(third_third_lsm_adapted["lsm_avg"].mean(), 2)

In [27]:
mean_lsm_adapted_df = pd.DataFrame({
    "condition": ["whole_data", "baseline", "unimodal", "unfamiliar", "first_third", "second_third", "third_third"],
    "mean_lsm_adapted": [whole_data_mean_lsm_adapted, baseline_mean_lsm_adapted, unimodal_mean_lsm_adapted, unfamiliar_mean_lsm_adapted, first_third_mean_lsm_adapted, second_third_mean_lsm_adapted, third_third_mean_lsm_adapted]
})

In [28]:
mean_lsm_adapted_df.to_csv("../results/mean_lsm_adapted.csv", index=False)