# Methods and such 
Run this initialization cell first: it imports the libraries and defines all the helper functions used by the rest of the notebook.

In [19]:
# load-up the kde engine
import pandas as pd
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
import numpy as np
import plotly.graph_objects as go
from scipy.spatial.distance import cdist
from scipy.optimize import minimize
import difflib

# load the data
def load_data(path):
    """Load the vectorised word data set and unpack it for searching and plotting.

    The CSV must provide three columns:

    * ``PCA`` - a multi-line string with one component per line; the value of
      a component is the second whitespace-separated token on its line (the
      first token is ignored). Only the first three lines are read.
    * ``density_score`` - one score per row.
    * ``Word`` - the word belonging to the row.

    :param path: Anything ``pandas.read_csv`` accepts (file path, URL, buffer).
    :return: Tuple ``(points, colors, density_scores, word_list, x, y, z)``.
        ``points`` is a NumPy array built from the three parsed components of
        every row, ``colors`` is the ``density_score`` column as a Series,
        ``density_scores`` is the same column as a NumPy array, ``word_list``
        holds every entry of ``Word`` converted with ``str`` and ``x``/``y``/
        ``z`` are plain lists with the first/second/third component per row.
    """
    df = pd.read_csv(path)

    # Parse each PCA string exactly once; both the point array and the
    # per-axis lists are derived from this single pass.
    pca_vectors = []
    for pca_string in df['PCA']:
        component_lines = pca_string.split('\n')
        pca_vectors.append([float(component_lines[axis].split()[1]) for axis in range(3)])

    points = np.array(pca_vectors)

    x = [vector[0] for vector in pca_vectors]
    y = [vector[1] for vector in pca_vectors]
    z = [vector[2] for vector in pca_vectors]

    colors = df['density_score']
    density_scores = colors.values

    # str() guards against cells that pandas did not parse as strings
    # (presumably words such as "nan" or purely numeric tokens).
    new_word_list = [str(word) for word in df['Word'].tolist()]

    return points, colors, density_scores, new_word_list, x, y, z

# Joke stuff generator
def generate_initial_direction():
    """Draw a random unit vector used to seed the direction search.

    Three values are taken from ``np.random.rand`` (the global NumPy random
    state), shifted by -0.5 so they are centred on zero, and the resulting
    vector is scaled to unit Euclidean length.

    :return: NumPy array with three components and norm 1.
    """
    centred = np.random.rand(3) - 0.5
    length = np.linalg.norm(centred)
    return centred / length

def score_direction(points, starting_point, direction, density_scores, tolerance=0.01):
    """Score the line through ``starting_point`` along ``direction``.

    Every point whose perpendicular distance to that line is below
    ``tolerance`` counts as "close"; the score is the sum of the density
    scores of the close points. The line extends both ways from
    ``starting_point`` because the sign of the projection is not restricted.

    ``direction`` does not have to be a unit vector: it is normalised here so
    that the projection and the reconstructed foot point use the same scale.

    :param points: Array-like of shape (n, d) with the candidate points.
    :param starting_point: Array-like of shape (d,), a point on the line.
    :param direction: Array-like of shape (d,), non-zero direction of the line.
    :param density_scores: Array-like of length n with one score per point.
    :param tolerance: Maximum perpendicular distance for a point to be close.
    :return: ``(score, close_points)`` where ``close_points`` is a boolean
        array of length n marking the close points.
    :raises ValueError: If ``direction`` has zero length.
    """
    direction = np.asarray(direction, dtype=float)
    length = np.linalg.norm(direction)
    if length == 0:
        raise ValueError("direction must be a non-zero vector")
    unit_direction = direction / length

    offsets = np.asarray(points) - np.asarray(starting_point)
    # Signed length of each offset along the line.
    along_line = np.dot(offsets, unit_direction)
    # Remove the along-line part; what is left is perpendicular to the line.
    perpendicular = offsets - np.outer(along_line, unit_direction)
    distances = np.linalg.norm(perpendicular, axis=1)

    close_points = distances < tolerance
    score = np.sum(np.asarray(density_scores)[close_points])
    return score, close_points

def find_optimal_direction(points, starting_point, density_scores, iterations=100, tolerance=0.1,
                           mutation_size=0.1, mutation_decay=1.0):
    """Random hill-climb for the direction with the highest density score.

    Starts from a random unit direction and, ``iterations`` times, perturbs the
    best direction found so far with uniform noise, re-normalises it and keeps
    the candidate only if its score is strictly higher. Because only
    improvements are accepted, the search can settle in a local maximum.

    :param points: Array of candidate points, passed on to ``score_direction``.
    :param starting_point: Point the scored line passes through.
    :param density_scores: One score per point, passed on to ``score_direction``.
    :param iterations: Number of mutation attempts.
    :param tolerance: Distance threshold handed to ``score_direction``.
    :param mutation_size: Scale of the uniform noise added to the current best
        direction (each component of the noise lies in
        ``[-mutation_size / 2, mutation_size / 2)``).
    :param mutation_decay: Factor the noise scale is multiplied by after every
        iteration. The default ``1.0`` keeps the scale constant; a value
        below 1 shrinks it gradually to allow finer adjustments late in the
        search.
    :return: ``(best_direction, best_indices)`` - the best unit direction found
        and the boolean mask of points close to the corresponding line.
    """
    best_direction = generate_initial_direction()
    best_score, best_indices = score_direction(points, starting_point, best_direction, density_scores, tolerance)

    for step in range(iterations):
        # Noise scale for this attempt; constant unless mutation_decay != 1.
        step_size = mutation_size * mutation_decay ** step
        candidate = best_direction + (np.random.rand(3) - 0.5) * step_size
        candidate = candidate / np.linalg.norm(candidate)  # back to unit length

        candidate_score, candidate_indices = score_direction(
            points, starting_point, candidate, density_scores, tolerance
        )

        # Strict improvement only: ties keep the current best direction.
        if candidate_score > best_score:
            best_score = candidate_score
            best_direction = candidate
            best_indices = candidate_indices

    return best_direction, best_indices

def search_for_word(word_index, points, density_scores, iterations=100, tolerance=0.1):
    """Run the direction search from the point that belongs to one word.

    :param word_index: Index into ``points`` of the word to start from.
    :param points: Array of all points.
    :param density_scores: One score per point.
    :param iterations: Number of search iterations, forwarded unchanged.
    :param tolerance: Distance threshold, forwarded unchanged.
    :return: The ``(direction, close_points)`` pair produced by
        ``find_optimal_direction`` for that starting point.
    """
    origin = points[word_index]
    return find_optimal_direction(
        points,
        origin,
        density_scores,
        iterations=iterations,
        tolerance=tolerance,
    )

def find_closest_word_index(word_list, input_word):
    """Locate the entry of ``word_list`` that best matches ``input_word``.

    Matching is delegated to ``difflib.get_close_matches`` with ``n=1`` and
    ``cutoff=0``, i.e. the single best-scoring candidate is requested without
    a minimum similarity.

    :param word_list: List of candidate words.
    :param input_word: Word to look up.
    :return: Index of the first occurrence of the best match in ``word_list``,
        or the string ``"No close match found."`` when difflib returns no
        candidate.
    """
    best = difflib.get_close_matches(input_word, word_list, n=1, cutoff=0)

    # Guard clause: nothing came back from difflib.
    if not best:
        return "No close match found."

    # Map the matched word back to its position in the original list.
    return word_list.index(best[0])
    
def draw_eight_ball(x, y, z, colors, starting_point, result):
    """Show the point cloud, the start point and the found direction in 3D.

    Three Plotly traces are drawn: the full point cloud (small, mostly
    transparent markers coloured on the Viridis scale), a red segment from
    ``starting_point`` to ``starting_point + 2 * result`` and a large black
    marker at ``starting_point``.

    :param x: X coordinates of the cloud.
    :param y: Y coordinates of the cloud.
    :param z: Z coordinates of the cloud.
    :param colors: Per-point values used for the marker colour; the colour bar
        is titled 'Density Score'.
    :param starting_point: Three-component point the segment starts from; it
        must support ``+`` with ``result * 2`` (e.g. a NumPy array).
    :param result: Three-component direction vector to draw.
    :return: None. The figure is rendered with ``fig.show()``.
    """
    # Point cloud
    cloud_marker = dict(
        size=5,
        color=colors,
        colorscale='Viridis',
        colorbar=dict(title='Density Score'),
        opacity=0.1,
    )
    cloud_trace = go.Scatter3d(x=x, y=y, z=z, mode='markers', marker=cloud_marker)

    # Direction segment; the factor 2 only controls how long the line is drawn
    tip = starting_point + result * 2
    direction_trace = go.Scatter3d(
        x=[starting_point[0], tip[0]],
        y=[starting_point[1], tip[1]],
        z=[starting_point[2], tip[2]],
        mode='lines',
        line=dict(color='red', width=5),
    )

    # Marker for the start point itself
    origin_trace = go.Scatter3d(
        x=[starting_point[0]],
        y=[starting_point[1]],
        z=[starting_point[2]],
        mode='markers',
        marker=dict(size=10, color='black'),
    )

    axis_titles = dict(
        xaxis_title='PCA 1',
        yaxis_title='PCA 2',
        zaxis_title='PCA 3',
    )
    layout = go.Layout(
        title='PCA Points and Initial Direction',
        scene=axis_titles,
        margin=dict(l=0, r=0, b=0, t=0),
    )

    figure = go.Figure(data=[cloud_trace, direction_trace, origin_trace], layout=layout)
    figure.show()

def display_words_in_columns(words_list, num_columns=4):
    """
    Displays a list of words in a DataFrame formatted with the specified number of columns.
    Uses 'display' in Jupyter environments and falls back to 'print' elsewhere.

    Words fill the table row by row (left to right, then top to bottom); cells
    left over in the last row are shown as empty strings.

    :param words_list: List of words to be displayed.
    :param num_columns: Number of columns in the display DataFrame
        (values below 1 are treated as 1).
    """
    # Ensure there's at least one column
    num_columns = max(1, num_columns)
    column_names = [f"Column {i+1}" for i in range(num_columns)]

    # Cut the words into rows of num_columns and pad the last row, so the
    # DataFrame is built once instead of being grown cell by cell.
    words = list(words_list)
    rows = []
    for start in range(0, len(words), num_columns):
        row = words[start:start + num_columns]
        row.extend([""] * (num_columns - len(row)))
        rows.append(row)

    table = pd.DataFrame(rows, columns=column_names).fillna("")

    # 'display' only exists as a global when running under IPython/Jupyter;
    # anywhere else the lookup raises NameError and plain print is used.
    try:
        show = display
    except NameError:
        show = print
    show(table)



def shake_eight_ball(word_list, points, density_scores, x, y, z, iterations=100, tolerance=0.1, input_word=None):
    """Look up a word, search for its best direction and show the outcome.

    The word is matched against ``word_list`` with ``find_closest_word_index``;
    the direction search then starts from that word's point. The matched word,
    the direction and the mask of close points are printed, the 3D figure is
    drawn and the words close to the found line are displayed as a table.

    :param word_list: List of words; position i belongs to ``points[i]``.
    :param points: Array of all points.
    :param density_scores: One score per point.
    :param x: X coordinates used for plotting.
    :param y: Y coordinates used for plotting.
    :param z: Z coordinates used for plotting.
    :param iterations: Number of search iterations.
    :param tolerance: Distance threshold used by the search.
    :param input_word: Word to look up. When ``None`` (the default) the word
        is requested interactively with ``input``; passing it explicitly
        lets the cell run without user interaction.
    :return: None.
    """
    if input_word is None:
        input_word = input("Enter a word: ")

    # find_closest_word_index returns either an index or this sentinel string.
    word_index = find_closest_word_index(word_list, input_word)
    if word_index == "No close match found.":
        print("No close match found.")
        return

    result, close_points = search_for_word(
        word_index, points, density_scores, iterations=iterations, tolerance=tolerance
    )
    print("The closest word is: ", word_list[word_index])
    print("The direction is: ", result)
    print("The close points are: ", close_points)

    # Keep only the words whose points were marked as close to the line.
    close_words = [word_list[i] for i in np.flatnonzero(close_points)]

    draw_eight_ball(x, y, z, density_scores, points[word_index], result)
    display_words_in_columns(close_words)

# Eight Ball 

First, load the data: this downloads the vectorised word dataset from GitHub and unpacks it into the points, density scores, word list, and plotting coordinates used below.

In [20]:
# Location of the CSV that load_data reads.
DATA_URL = "https://raw.githubusercontent.com/range-et/JokeMachine/main/Data_Vectorised_KDE.csv"
points, colors, density_scores, word_list, x, y, z = load_data(DATA_URL)

Then run the eight ball. The way to interact with it is the following:
* First it asks for a word; normally you want to enter a common noun or a concept.
* It then finds the closest matching word in its dictionary; for obvious reasons the whole dictionary of words is not available.
* Then it runs the Eight-Ball machine, which searches for the best direction starting from that word.
* It then draws the result: a 3D plot of the points with the chosen direction, followed by a table of the words that lie close to it.

Rerun this as many times as necessary to try out different words

In [22]:
# Prompts for a word, then prints, plots and tabulates the result.
shake_eight_ball(
    word_list,
    points,
    density_scores,
    x,
    y,
    z,
    iterations=100,
    tolerance=0.1,
)

The closest word is:  anarchism
The direction is:  [-0.97528551  0.16038975  0.15196478]
The close points are:  [ True False False ... False False False]


Unnamed: 0,Column 1,Column 2,Column 3,Column 4
0,anarchism,wing,modern,birth
1,thought,easy,alongside,conscience
2,live,sects,tendencies,antiauthoritarian
3,saw,federalist,sentiments,wave
4,bitter,feature,youth,individualist
...,...,...,...,...
586,hukou,motoneurons,dogmatism,aghast
587,decamp,browsable,paleoconservatives,trendsetter
588,electorally,eyetracking,spiritualize,weeding
589,shortgrass,camouflages,reservable,millstone
