In [None]:
import pandas as pd
import numpy as np
import ast
from sklearn.metrics.pairwise import cosine_similarity
from scipy.stats import spearmanr

In [None]:
# Load the per-image table into `consistency_df`. The path is relative to the
# current working directory (presumably a mounted Google Drive in Colab --
# TODO confirm).
consistency_df = pd.read_csv('drive/MyDrive/Colab Notebooks/consistency_df_new.csv')

In [None]:
# Bare expression: the notebook renders the frame as this cell's output.
consistency_df

Unnamed: 0.1,Unnamed: 0,imageID,memorability_scores,comments,labels
0,0,1019o64,0.901658,"['tower', 'spire', 'security', 'bitch']","['cloud', 'sky', 'skyscraper', 'building', 'da..."
1,1,103fe1z,0.832711,"['kid', 'leg', 'lava', 'hour', 'week', 'bump',...","['water', 'sky', 'azure', 'beach', 'body', 'wa..."
2,2,1047lma,0.715258,"['library', 'art', 'videogame', 'library', 'ca...","['shoe', 'furniture', 'chair', 'shelf', 'autom..."
3,3,10an4jy,0.757373,"['record', 'player', 'sight', 'friend', 'town'...","['bookcase', 'shelf', 'publication', 'book', '..."
4,4,10e1w4i,0.600920,"['view', 'today', 'flight', 'home', 'reason', ...","['cloud', 'skyscraper', 'building', 'atmospher..."
...,...,...,...,...,...
561,561,z0hsea,0.903142,"['place', 'pickup', 'truck', 'guess', 'chevy',...","['wheel', 'tire', 'car', 'vehicle', 'property'..."
562,562,z9n3qv,0.697764,"['information', 'medium']","['people', 'nature', 'art', 'road', 'surface',..."
563,563,zc90qj,0.538179,"['rock', 'capital', 'head']","['skyscraper', 'building', 'sky', 'nature', 't..."
564,564,zg9dc6,0.829435,"['spacetime', 'movement', 'planet', 'planet', ...","['automotive', 'design', 'gas', 'circle', 'cam..."


In [None]:
def load_glove_embeddings(glove_file):
    """Load word vectors from a whitespace-separated GloVe text file.

    Every non-blank line is expected to hold a token followed by its vector
    components. Blank lines (for example a trailing empty line at the end of
    the file) are skipped instead of raising ``IndexError``.

    Parameters
    ----------
    glove_file : str or path-like
        Path of the embeddings file; it is read as UTF-8 text.

    Returns
    -------
    dict
        Maps each token (``str``) to a 1-D ``float32`` NumPy array. If a token
        occurs more than once, the last occurrence wins.

    Raises
    ------
    ValueError
        If a vector component cannot be parsed as a float.
    """
    embeddings_index = {}
    with open(glove_file, 'r', encoding='utf-8') as f:
        for line in f:
            values = line.split()
            if not values:
                # Blank line: there is no token to index.
                continue
            word, *components = values
            embeddings_index[word] = np.asarray(components, dtype='float32')
    return embeddings_index

def convert_string_to_list(string_list):
    """Turn the string representation of a list into an actual list.

    Values that are already lists are returned unchanged. Without this,
    re-running the conversion on an already-converted column would make
    ``ast.literal_eval`` fail on every row and replace the data with ``[]``.

    Parameters
    ----------
    string_list : str or list
        Text such as ``"['tower', 'spire']"``, or an already-parsed list.

    Returns
    -------
    The parsed literal (normally a list). If parsing fails, the error is
    printed and an empty list is returned.
    """
    if isinstance(string_list, list):
        # Already converted; parsing it again would only raise.
        return string_list
    try:
        return ast.literal_eval(string_list)
    except Exception as e:
        # Best effort: report the problem and fall back to an empty list.
        print(f"Error converting string to list: {e}")
        return []

def get_word_vector(word, embeddings_index, dim=None):
    """Look up the embedding of ``word``, or return a zero vector if unknown.

    Parameters
    ----------
    word : str
        Token to look up.
    embeddings_index : dict
        Maps tokens to 1-D NumPy vectors.
    dim : int, optional
        Length of the zero vector returned for an unknown word. When omitted,
        the length is taken from a vector already stored in
        ``embeddings_index`` so the fallback matches the loaded embeddings;
        if the index is empty, 100 is used.

    Returns
    -------
    numpy.ndarray
        The stored vector, or ``np.zeros(dim)`` for an unknown word.
    """
    vector = embeddings_index.get(word)
    if vector is not None:
        return vector
    if dim is None:
        # Match the dimensionality of the loaded embeddings rather than
        # assuming a fixed size.
        sample = next(iter(embeddings_index.values()), None)
        dim = 100 if sample is None else len(sample)
    return np.zeros(dim)

def max_similarity(word, words, embeddings_index):
    """Return the highest cosine similarity between ``word`` and any of ``words``.

    ``get_word_vector`` represents a word without an embedding as an all-zero
    vector. Such vectors are removed from ``words`` before scoring, so an
    unknown label cannot contribute a similarity of 0 that would outrank
    genuinely negative similarities of the known labels.

    Parameters
    ----------
    word : str
        The query word.
    words : iterable of str
        Candidate words to compare against.
    embeddings_index : dict
        Maps tokens to embedding vectors.

    Returns
    -------
    The maximum cosine similarity over the candidates that have an embedding,
    or 0 when ``word`` has no embedding or none of ``words`` has one.
    """
    word_vec = get_word_vector(word, embeddings_index)
    if not np.any(word_vec):
        # The query word is unknown: nothing meaningful to compare.
        return 0
    known_vecs = [
        vec
        for vec in (get_word_vector(w, embeddings_index) for w in words)
        if np.any(vec)
    ]
    if not known_vecs:
        return 0
    return max(cosine_similarity([word_vec], known_vecs)[0])

def average_max_similarity(comments, labels, embeddings_index):
    """Average, over the comment words, of each word's best similarity to the labels.

    For every word in ``comments`` the maximum similarity against ``labels``
    is obtained from ``max_similarity``; the mean of those maxima is returned.
    An empty ``comments`` yields 0.
    """
    if comments:
        best_per_word = []
        for word in comments:
            best_per_word.append(max_similarity(word, labels, embeddings_index))
        return np.mean(best_per_word)
    return 0



In [None]:
# The CSV stores both list-valued columns as strings; parse each one in turn.
for column in ('comments', 'labels'):
    consistency_df[column] = consistency_df[column].apply(convert_string_to_list)


In [None]:
# Spot-check the first rows of each converted column.
for column in ('comments', 'labels'):
    print(consistency_df[column].head())


0                      [tower, spire, security, bitch]
1    [kid, leg, lava, hour, week, bump, leg, itch, ...
2    [library, art, videogame, library, card, libra...
3    [record, player, sight, friend, town, time, di...
4       [view, today, flight, home, reason, landscape]
Name: comments, dtype: object
0    [cloud, sky, skyscraper, building, daytime, to...
1    [water, sky, azure, beach, body, water, aqua, ...
2    [shoe, furniture, chair, shelf, automotive, de...
3    [bookcase, shelf, publication, book, wood, she...
4    [cloud, skyscraper, building, atmosphere, sky,...
Name: labels, dtype: object


In [None]:
# Path to the GloVe vectors file (the file name indicates the 6B-token,
# 100-dimensional release).
glove_file = 'drive/MyDrive/Colab Notebooks/glove.6B.100d.txt'
# Read the whole file into a token -> vector dictionary.
embeddings_index = load_glove_embeddings(glove_file)

In [None]:
def _row_similarity(row):
    """Score one row: mean best-match similarity of its comments to its labels."""
    return average_max_similarity(row['comments'], row['labels'], embeddings_index)


# One similarity score per row, stored as a new column.
consistency_df['similarity'] = consistency_df.apply(_row_similarity, axis=1)

consistency_df[['comments', 'labels', 'similarity']]


Unnamed: 0,comments,labels,similarity
0,"[tower, spire, security, bitch]","[cloud, sky, skyscraper, building, daytime, to...",0.589637
1,"[kid, leg, lava, hour, week, bump, leg, itch, ...","[water, sky, azure, beach, body, water, aqua, ...",0.445397
2,"[library, art, videogame, library, card, libra...","[shoe, furniture, chair, shelf, automotive, de...",0.442866
3,"[record, player, sight, friend, town, time, di...","[bookcase, shelf, publication, book, wood, she...",0.474175
4,"[view, today, flight, home, reason, landscape]","[cloud, skyscraper, building, atmosphere, sky,...",0.568526
...,...,...,...
561,"[place, pickup, truck, guess, chevy, window, p...","[wheel, tire, car, vehicle, property, sky, aut...",0.599750
562,"[information, medium]","[people, nature, art, road, surface, wood, tru...",0.480083
563,"[rock, capital, head]","[skyscraper, building, sky, nature, tower, tow...",0.448961
564,"[spacetime, movement, planet, planet, surface,...","[automotive, design, gas, circle, camera, lens...",0.457504


In [None]:
# Rank correlation between the memorability score and the comment/label similarity.
memorability_col = consistency_df['memorability_scores']
similarity_col = consistency_df['similarity']
correlation, p_value = spearmanr(memorability_col, similarity_col)
print(f'Spearman correlation between consistency and memorability: {correlation}, p-value: {p_value}')

Spearman correlation between consistency and memorability: -0.11604170620981664, p-value: 0.005710464745265798


In [None]:
# Save the augmented table. index=False keeps the row index out of the file:
# a written index comes back as an extra "Unnamed: N" column on the next
# read_csv, and the input table already carries two such columns.
consistency_df.to_csv('consistency_analysis.csv', index=False)

In [None]:
def compute_similarities(comments, labels, embeddings_index):
    """Build a nested mapping of pairwise cosine similarities.

    The result has one entry per word in ``comments``; each entry maps every
    label in ``labels`` to the cosine similarity between the word's vector and
    the label's vector, both obtained through ``get_word_vector``.
    """
    similarities = {}
    for word in comments:
        similarities[word] = {
            label: cosine_similarity(
                [get_word_vector(word, embeddings_index)],
                [get_word_vector(label, embeddings_index)],
            )[0][0]
            for label in labels
        }
    return similarities

In [None]:
# Example inputs: a few comment-style words and a set of image labels.
list1 = ["object", "sky", "vacation"]
list2 = ['cloud', 'water', 'sky', 'atmosphere', 'daytime', 'afterglow', 'light', 'amber', 'fluid', 'orange']

# Pairwise similarity of every word in list1 against every label in list2.
similarity_scores = compute_similarities(list1, list2, embeddings_index)

# Report the scores grouped by word.
for word, per_label in similarity_scores.items():
    print(f"Similarities for {word}:")
    for label, score in per_label.items():
        print(f"  {label}: {score}")


Similarities for object:
  cloud: 0.4817318022251129
  water: 0.2838169038295746
  sky: 0.36714741587638855
  atmosphere: 0.4385511875152588
  daytime: 0.1006302610039711
  afterglow: 0.06649301201105118
  light: 0.44459110498428345
  amber: 0.1607474982738495
  fluid: 0.32798853516578674
  orange: 0.1570952981710434
Similarities for sky:
  cloud: 0.6198914647102356
  water: 0.4726109802722931
  sky: 0.9999998807907104
  atmosphere: 0.44560185074806213
  daytime: 0.4167661666870117
  afterglow: 0.21294555068016052
  light: 0.6485516428947449
  amber: 0.3992052376270294
  fluid: 0.12379897385835648
  orange: 0.4674420654773712
Similarities for vacation:
  cloud: 0.14930683374404907
  water: 0.2242138385772705
  sky: 0.2527848482131958
  atmosphere: 0.26200273633003235
  daytime: 0.27166759967803955
  afterglow: -0.015036409720778465
  light: 0.14097994565963745
  amber: 0.05887573957443237
  fluid: -0.036901406943798065
  orange: 0.11777613312005997


In [None]:
# Length of each row's comment-word list.
consistency_df['number_comments'] = consistency_df['comments'].map(len)
consistency_df

Unnamed: 0.1,Unnamed: 0,imageID,memorability_scores,comments,labels,similarity,number_comments
0,0,1019o64,0.901658,"[tower, spire, security, bitch]","[cloud, sky, skyscraper, building, daytime, to...",0.589637,4
1,1,103fe1z,0.832711,"[kid, leg, lava, hour, week, bump, leg, itch, ...","[water, sky, azure, beach, body, water, aqua, ...",0.445397,38
2,2,1047lma,0.715258,"[library, art, videogame, library, card, libra...","[shoe, furniture, chair, shelf, automotive, de...",0.442866,13
3,3,10an4jy,0.757373,"[record, player, sight, friend, town, time, di...","[bookcase, shelf, publication, book, wood, she...",0.474175,11
4,4,10e1w4i,0.600920,"[view, today, flight, home, reason, landscape]","[cloud, skyscraper, building, atmosphere, sky,...",0.568526,6
...,...,...,...,...,...,...,...
561,561,z0hsea,0.903142,"[place, pickup, truck, guess, chevy, window, p...","[wheel, tire, car, vehicle, property, sky, aut...",0.599750,19
562,562,z9n3qv,0.697764,"[information, medium]","[people, nature, art, road, surface, wood, tru...",0.480083,2
563,563,zc90qj,0.538179,"[rock, capital, head]","[skyscraper, building, sky, nature, tower, tow...",0.448961,3
564,564,zg9dc6,0.829435,"[spacetime, movement, planet, planet, surface,...","[automotive, design, gas, circle, camera, lens...",0.457504,12


In [None]:
# Rank correlation between the comment-word count and the memorability score.
comment_counts = consistency_df['number_comments']
memorability_col = consistency_df['memorability_scores']
rho, p = spearmanr(comment_counts, memorability_col)
print(f'Spearman correlation between number of comments and memorability: {rho}, p-value: {p}')

Spearman correlation between number of comments and memorability: 0.11721213520068129, p-value: 0.005237846478640651


In [None]:
pip install pingouin

Collecting pingouin
  Downloading pingouin-0.5.5-py3-none-any.whl.metadata (19 kB)
Collecting pandas-flavor (from pingouin)
  Downloading pandas_flavor-0.6.0-py3-none-any.whl.metadata (6.3 kB)
Downloading pingouin-0.5.5-py3-none-any.whl (204 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m204.4/204.4 kB[0m [31m6.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pandas_flavor-0.6.0-py3-none-any.whl (7.2 kB)
Installing collected packages: pandas-flavor, pingouin
Successfully installed pandas-flavor-0.6.0 pingouin-0.5.5


In [None]:
import pandas as pd
import pingouin as pg

# Spearman partial correlation between memorability and similarity, with the
# number of comment words passed as the covariate.
partial_corr = pg.partial_corr(data=consistency_df,
                               x='memorability_scores',
                               y='similarity',
                               covar='number_comments',
                               method='spearman')

# Plain-text rendering of the result table.
print(partial_corr)


            n         r           CI95%    p-val
spearman  566 -0.127109  [-0.21, -0.05]  0.00247
