In [10]:
# Imports
from scripts.processing import get_embeddings
from scripts.comparison import find_similar_names
from scripts.scraping import extract_texts
import json
import numpy as np
import pandas as pd
import networkx as nx
import plotly.graph_objects as go

In [None]:
# Extract the article texts for the entries listed in data/friends.txt
articles = extract_texts('data/friends.txt')

# Serialize the extracted articles to JSON so later cells can reload them
with open('friend_data/friends_articles.json', 'w') as articles_file:
    articles_file.write(json.dumps(articles))

Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


Processed Arya
Processed Sully
Error processing https://www.familyeducation.com/baby-names/list-ideas/the-ultimate-list-of-victorian-girl-names: Article `download()` failed with 403 Client Error: Forbidden for url: https://www.familyeducation.com/baby-names/list-ideas/the-ultimate-list-of-victorian-girl-names on URL https://www.familyeducation.com/baby-names/list-ideas/the-ultimate-list-of-victorian-girl-names
Processed Victoria
Error processing https://www.newsweek.com/tropical-storm-erin-tracker-live-updates-hurricane-2111872: Article `download()` failed with 403 Client Error: Forbidden for url: https://www.newsweek.com/tropical-storm-erin-tracker-live-updates-hurricane-2111872 on URL https://www.newsweek.com/tropical-storm-erin-tracker-live-updates-hurricane-2111872
Processed Erin
Processed Hailey
Processed Mike
Processed Blanton
Error processing https://www.names.org/n/hanjing/about: Article `download()` failed with 403 Client Error: Forbidden for url: https://www.names.org/n/hanji

In [None]:
# Reload the stored articles from disk
with open('friend_data/friends_articles.json', 'r') as articles_file:
    articles = json.loads(articles_file.read())

# Compute one embedding entry per name from its articles
embeddings = get_embeddings(articles)

Processed Arya with 4 texts
Processed Sully with 4 texts
Processed Victoria with 4 texts
Processed Erin with 4 texts
Processed Hailey with 3 texts
Processed Mike with 5 texts
Processed Blanton with 5 texts
Processed Hanjing with 3 texts
Processed Francis with 5 texts
Processed Benjamin with 5 texts
Processed Aihan with 3 texts


In [None]:
# JSON cannot store array objects, so turn each name's semantic and sentiment
# vectors into plain nested lists (the dict is updated in place).
embeddings.update({
    name: {
        'semantic': np.array(vectors['semantic']).tolist(),
        'sentiment': np.array(vectors['sentiment']).tolist()
    }
    for name, vectors in embeddings.items()
})

# Store the list-based embeddings as indented JSON
with open('friend_data/friends_embeddings.json', 'w') as embeddings_file:
    embeddings_file.write(json.dumps(embeddings, indent=4))

In [17]:
# Load the stored embeddings back from disk
with open('friend_data/friends_embeddings.json', 'r') as embeddings_file:
    embeddings = json.loads(embeddings_file.read())

# Ask for the five names ranked closest to the target (w=0.5 is passed through
# to find_similar_names unchanged)
target_name = "Sully"
similar_names = find_similar_names(target_name, embeddings, top_k=5, w=0.5)

# Report each returned (name, score) pair with four decimals
print(f"Similar names to {target_name}:")
for name, score in similar_names:
    print(f"{name}: {score:.4f}")

Similar names to Sully:
Blanton: 0.7050
Aihan: 0.6647
Hanjing: 0.6562
Victoria: 0.6285
Francis: 0.5135


In [18]:
# Pairwise similarity matrix built from the semantic embeddings only.
# NOTE(review): the diagonal is fixed at 1, which matches the dot product only
# if the semantic vectors are unit length - confirm against get_embeddings.
name_list = list(embeddings.keys())
semantic_vectors = {
    name: np.asarray(embeddings[name]['semantic'], dtype=float)
    for name in name_list
}

# Start from a float frame filled with 1.0: the diagonal is then already set,
# and the frame keeps a numeric dtype (an empty DataFrame built from only
# index/columns is object-dtype, which breaks later numeric operations).
similarity_matrix = pd.DataFrame(1.0, index=name_list, columns=name_list, dtype=float)

# The dot product is symmetric, so compute each unordered pair once and mirror it.
for position, name1 in enumerate(name_list):
    for name2 in name_list[position + 1:]:
        score = float(np.dot(semantic_vectors[name1], semantic_vectors[name2]))
        similarity_matrix.loc[name1, name2] = score
        similarity_matrix.loc[name2, name1] = score

display(similarity_matrix)



Unnamed: 0,Arya,Sully,Victoria,Erin,Hailey,Mike,Blanton,Hanjing,Francis,Benjamin,Aihan
Arya,1.0,0.271086,0.304895,0.050393,-0.144093,-0.023669,0.315711,0.332736,0.184049,0.345832,0.51089
Sully,0.271086,1.0,0.284549,0.052015,-0.000515,0.017736,0.413166,0.313282,0.380739,0.345902,0.32932
Victoria,0.304895,0.284549,1.0,0.036688,0.106791,-0.025678,0.283995,0.334846,0.39563,0.309696,0.261304
Erin,0.050393,0.052015,0.036688,1.0,0.125935,0.143076,0.066186,0.053916,-0.020997,0.026255,0.046527
Hailey,-0.144093,-0.000515,0.106791,0.125935,1.0,0.107812,0.064383,0.077629,0.007813,0.092793,-0.025379
Mike,-0.023669,0.017736,-0.025678,0.143076,0.107812,1.0,0.032791,-0.076492,0.05371,0.13782,0.014916
Blanton,0.315711,0.413166,0.283995,0.066186,0.064383,0.032791,1.0,0.249334,0.330817,0.437996,0.312084
Hanjing,0.332736,0.313282,0.334846,0.053916,0.077629,-0.076492,0.249334,1.0,0.261035,0.319649,0.409574
Francis,0.184049,0.380739,0.39563,-0.020997,0.007813,0.05371,0.330817,0.261035,1.0,0.424551,0.210739
Benjamin,0.345832,0.345902,0.309696,0.026255,0.092793,0.13782,0.437996,0.319649,0.424551,1.0,0.509333


In [None]:
import networkx as nx
import plotly.graph_objects as go
import numpy as np

def plot_similarity_graph(names, similarity_matrix, threshold=0.5):
    """Draw an interactive graph linking names whose similarity meets a threshold.

    Every name becomes a node. An edge is added between two names when their
    similarity is >= ``threshold``; edges are coloured by that similarity on
    the Viridis colour scale, with a colour bar titled "Weight".

    Args:
        names: Sequence of node labels; position ``i`` corresponds to row and
            column ``i`` of ``similarity_matrix``.
        similarity_matrix: Square 2-D array-like of pairwise similarities.
            A NumPy array, nested lists, or a pandas DataFrame are accepted
            (a DataFrame is read positionally, not by label).
        threshold: Minimum similarity for an edge to be drawn.

    Returns:
        None. The figure is rendered with ``fig.show()``.
    """
    # Imported locally so the function does not depend on the notebook's
    # import cell being extended.
    from plotly.colors import sample_colorscale

    names = list(names)
    # Positional [i, j] indexing is not valid on a DataFrame, so normalise the
    # input to a float ndarray first.
    sim_values = np.asarray(similarity_matrix, dtype=float)

    G = nx.Graph()
    G.add_nodes_from(names)

    # Only the upper triangle is visited: the graph is undirected.
    for i in range(len(names)):
        for j in range(i + 1, len(names)):
            sim = float(sim_values[i, j])
            if sim >= threshold:
                G.add_edge(names[i], names[j], weight=sim)

    # Fixed seed keeps the layout stable between runs.
    pos = nx.spring_layout(G, seed=42)

    traces = []
    edge_weights = [weight for _, _, weight in G.edges(data='weight')]

    # With no edges above the threshold there is nothing to colour, and
    # min()/max() of an empty list would raise, so the edge layer is skipped.
    if edge_weights:
        lowest, highest = min(edge_weights), max(edge_weights)
        span = highest - lowest

        # A scatter line takes a single colour only (no colorscale/colorbar),
        # so each edge gets its own trace with a colour sampled from Viridis.
        for u, v, weight in G.edges(data='weight'):
            fraction = (weight - lowest) / span if span > 0 else 1.0
            x0, y0 = pos[u]
            x1, y1 = pos[v]
            traces.append(go.Scatter(
                x=[x0, x1],
                y=[y0, y1],
                mode='lines',
                line=dict(width=2, color=sample_colorscale('Viridis', [fraction])[0]),
                hoverinfo='none'
            ))

        # Invisible marker trace whose only job is to display the colour bar
        # matching the edge colours.
        traces.append(go.Scatter(
            x=[None, None],
            y=[None, None],
            mode='markers',
            marker=dict(
                color=[lowest, highest],
                colorscale='Viridis',
                cmin=lowest,
                cmax=highest,
                showscale=True,
                colorbar=dict(title='Weight')
            ),
            hoverinfo='none'
        ))

    # Node positions, in the graph's node order so labels line up with points.
    node_labels = list(G.nodes())
    node_x = [pos[node][0] for node in node_labels]
    node_y = [pos[node][1] for node in node_labels]

    traces.append(go.Scatter(
        x=node_x,
        y=node_y,
        mode='markers+text',
        text=node_labels,
        textposition="top center",
        hoverinfo='text',
        marker=dict(size=12, color='lightblue', line_width=2)
    ))

    fig = go.Figure(data=traces,
                    layout=go.Layout(
                        title='Name Similarity Graph (Edges Colored by Weight)',
                        showlegend=False,
                        hovermode='closest',
                        margin=dict(b=0, l=0, r=0, t=40),
                        xaxis=dict(showgrid=False, zeroline=False, showticklabels=False),
                        yaxis=dict(showgrid=False, zeroline=False, showticklabels=False)
                    ))
    fig.show()


# Example usage on a small hand-written matrix. The demo data uses its own
# variable names so it does not overwrite the `similarity_matrix` computed
# from the real embeddings earlier in the notebook.
demo_names = ["Alice", "Bob", "Charlie", "David"]
demo_similarity = np.array([
    [1.0, 0.8, 0.3, 0.6],
    [0.8, 1.0, 0.4, 0.5],
    [0.3, 0.4, 1.0, 0.7],
    [0.6, 0.5, 0.7, 1.0]
])

plot_similarity_graph(demo_names, demo_similarity, threshold=0.5)


In [None]:
# Build the sentiment and semantic arrays plus a column vector of names,
# all in the key order of `embeddings`
ordered_keys = list(embeddings.keys())

sentiments = np.array([embeddings[key]['sentiment'] for key in ordered_keys])
semantics = np.array([embeddings[key]['semantic'] for key in ordered_keys])
names = np.array(ordered_keys)[:, np.newaxis]

# Show the shape of the name column vector
display(names.shape)

# Place each name's sentiment values first and its semantic values after them,
# one row per name, indexed by the name
combined_vectors = np.concatenate((sentiments, semantics), axis=1)
embeddings_df = pd.DataFrame(combined_vectors, index=names.ravel())

# Save as CSV without a header row; the name index is written as the first column
embeddings_df.to_csv('data/embeddings.csv', header=False)

In [None]:
# Nearest-neighbour search over the combined sentiment + semantic vectors
from sklearn.neighbors import NearestNeighbors

knn = NearestNeighbors(n_neighbors=6)
knn.fit(embeddings_df)

# Query with the target's own row, reshaped to a single-sample 2-D array
query_vector = embeddings_df.loc[target_name].values.reshape(1, -1)
distances, indices = knn.kneighbors(query_vector)

# List the neighbours, leaving out the target name itself
print(f"\nNearest neighbors for {target_name}:")
for i, (index, distance) in enumerate(zip(indices[0], distances[0])):
    neighbor_name = embeddings_df.index[index]
    if neighbor_name != target_name:
        print(f"{i}. {neighbor_name} (distance: {distance:.4f})")

    