In [7]:
import pandas as pd
from sklearn.cluster import KMeans
import numpy as np

import ast

In [85]:

# Load the embeddings from a CSV file
df = pd.read_csv('./../v1/BookProcessed/how-to-win-friends-and-influence-people/embeddings.csv')

df.head()

Unnamed: 0,Chapter Name,Chunk Number,Content,Content Length,Token Length,Embedding Vector
0,"If You Want To Gather Honey, Don't Kick Over T...",0,One hundred and fifty policemen and detectives...,1163,252,"[-0.030468331649899483, 0.014134373515844345, ..."
1,"If You Want To Gather Honey, Don't Kick Over T...",1,"""But a kind one - one that would do nobody any...",1355,305,"[-0.0034287304151803255, -0.004505736753344536..."
2,"If You Want To Gather Honey, Don't Kick Over T...",2,"""lighter pleasures, helping them have a good t...",311,69,"[-0.04099495708942413, -0.02334175445139408, 0..."
3,"If You Want To Gather Honey, Don't Kick Over T...",3,"said, ""I learned thirty years ago that it is f...",517,109,"[-0.016202298924326897, -0.013793670572340488,..."
4,"If You Want To Gather Honey, Don't Kick Over T...",4,"Confessed, ""I learned thirty years ago that it...",1212,242,"[-0.0013425502693280578, 0.007051051128655672,..."


In [86]:
# Convert the 'Embedding Vector' column from string to list of floats
df['Embedding Vector'] = df['Embedding Vector'].apply(ast.literal_eval)

# Create a numpy array from the lists of embeddings
embeddings = np.array(df['Embedding Vector'].tolist())

In [87]:


# Apply K-means clustering
kmeans = KMeans(n_clusters=10, random_state=0).fit(embeddings)

# Print the cluster centers
print("Cluster Centers:")
print(kmeans.cluster_centers_)

# Add the cluster labels to the original DataFrame
df['Cluster'] = kmeans.labels_

# Print the DataFrame with clusters
print(df[['Chapter Name', 'Chunk Number', 'Content', 'Cluster']])

  super()._check_params_vs_input(X, default_n_init=10)


Cluster Centers:
[[-0.00882733  0.00032976  0.02256579 ... -0.00644294 -0.00813609
  -0.02372229]
 [-0.00697661 -0.0014211   0.01167498 ... -0.01203377 -0.00408913
  -0.0218429 ]
 [-0.00192814 -0.00133758  0.01230749 ... -0.00255876 -0.00086643
  -0.0207932 ]
 ...
 [-0.0136958   0.00099934  0.02006028 ... -0.00572147  0.00356988
  -0.02607596]
 [-0.01814429 -0.00584009  0.02261521 ... -0.00095024 -0.00434578
  -0.01644942]
 [-0.01299729 -0.00298684  0.01373992 ... -0.0022837   0.00111715
  -0.01942461]]
                                          Chapter Name  Chunk Number  \
0    If You Want To Gather Honey, Don't Kick Over T...             0   
1    If You Want To Gather Honey, Don't Kick Over T...             1   
2    If You Want To Gather Honey, Don't Kick Over T...             2   
3    If You Want To Gather Honey, Don't Kick Over T...             3   
4    If You Want To Gather Honey, Don't Kick Over T...             4   
..                                                 ...     

In [88]:
# sort the dataframe by cluster number
df.sort_values(by=['Cluster'])
df.head()

Unnamed: 0,Chapter Name,Chunk Number,Content,Content Length,Token Length,Embedding Vector,Cluster
0,"If You Want To Gather Honey, Don't Kick Over T...",0,One hundred and fifty policemen and detectives...,1163,252,"[-0.030468331649899483, 0.014134373515844345, ...",4
1,"If You Want To Gather Honey, Don't Kick Over T...",1,"""But a kind one - one that would do nobody any...",1355,305,"[-0.0034287304151803255, -0.004505736753344536...",4
2,"If You Want To Gather Honey, Don't Kick Over T...",2,"""lighter pleasures, helping them have a good t...",311,69,"[-0.04099495708942413, -0.02334175445139408, 0...",4
3,"If You Want To Gather Honey, Don't Kick Over T...",3,"said, ""I learned thirty years ago that it is f...",517,109,"[-0.016202298924326897, -0.013793670572340488,...",3
4,"If You Want To Gather Honey, Don't Kick Over T...",4,"Confessed, ""I learned thirty years ago that it...",1212,242,"[-0.0013425502693280578, 0.007051051128655672,...",3


In [89]:
# going through each cluster and attempting to summarise it using gpt
import os
import openai
import json
openai.api_key ="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"

def summarize(text):
    response = openai.ChatCompletion.create(
        model="gpt-3.5-turbo-16k-0613",
        messages=[{
            "role":"system",    
            "content":"You are the cofounder and voice actor for GPTBookClub, an app that helps people consume books faster by providing summaries, the goal is to make it as seamless as possible, so maintain a conversational tone, and refer the reader in the second person when needed "
        },{
            "role":"user",
            "content":" THe following are paragraphs that are clustered to be around the same key idea, your goal is to extract that key idea and summarise it in 1000 words,it will be spoken to the user so write it in a conversational tone and clearly give it a heading aka the key idea identified, your output should be in the format ## <key idea identified> # <text> the paragraphs are" + text,
        },]
     
    )
    return response['choices'][0]['message']



In [91]:
key_ideas=[]
for i in range(0,10):
    print("Cluster ", i)
    cluster_content = df[df['Cluster']==i]['Content'].values
    key_idea = summarize(cluster_content[i])
    key_ideas.append(key_idea.content)
    print("Generated key idea no. ", i)

Cluster  0
Generated key idea no.  0
Cluster  1
Generated key idea no.  1
Cluster  2
Generated key idea no.  2
Cluster  3
Generated key idea no.  3
Cluster  4
Generated key idea no.  4
Cluster  5
Generated key idea no.  5
Cluster  6
Generated key idea no.  6
Cluster  7
Generated key idea no.  7
Cluster  8
Generated key idea no.  8
Cluster  9
Generated key idea no.  9


In [93]:
# output key ideas to indiviual files
for i in range(0,10):
    print("Cluster ", i)
    # make the directory if it doesn't exist
    if not os.path.exists("./how-to-win-friends-and-influence-people"):
        os.makedirs("./how-to-win-friends-and-influence-people")
    f = open("./how-to-win-friends-and-influence-people/cluster"+str(i)+".txt", "w")
    f.write(key_ideas[i])
    # compile all key ideas into one file
    generated_key_ideas += key_ideas[i]
    f.close()
    print("")


Cluster  0

Cluster  1

Cluster  2

Cluster  3

Cluster  4

Cluster  5

Cluster  6

Cluster  7

Cluster  8

Cluster  9



In [2]:
generated_key_ideas = ""

for i in range(0,10):
    print("Cluster ", i)
    # open txt and add it to generated_key_ideas
    f = open("./how-to-win-friends-and-influence-people/cluster"+str(i)+".txt", "r")
    generated_key_ideas += f.read()
    f.close()
    print("")

Cluster  0

Cluster  1

Cluster  2

Cluster  3

Cluster  4

Cluster  5

Cluster  6

Cluster  7

Cluster  8

Cluster  9



In [6]:
# using the key ideas to generate a logical flow of the book
import openai
openai.api_key ="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"
def generate_logical_flow(text):
    response = openai.ChatCompletion.create(
        model="gpt-3.5-turbo-16k-0613",
        messages=[{
            "role":"system",    
            "content":"You are the cofounder and voice actor for GPTBookClub, an app that helps people consume books faster by providing summaries, the goal is to make it as seamless as possible, so maintain a conversational tone, and refer the reader in the second person when needed "
        },{
            "role":"user",
            "content":"You have idenfitied 10 key ideas in the book HOw to win friends and influence people by Dale Carnegie, but there's no story there no logical flow. Generate the final output which are 10 key ideas in logical flow, with heading then text below, the key ideas given are as following" + text,
        },]
     
    )
    return response['choices'][0]['message']





In [7]:
print(generated_key_ideas)

## Key Idea: Understanding and Sympathy Breed Kindness

### Paragraph 1
Title: Seeing the Vulnerability in Others

Summary: The author reflects on their initial perception of the reader as a man, but upon seeing the reader in a tired and exhausted state, they realize that the reader is still a baby. The author acknowledges that they have asked too much from the reader and expresses empathy for the reader's struggles.

### Paragraph 2
Title: Understanding Others Instead of Condemning Them

Summary: The author suggests that instead of criticizing people, it is more beneficial and interesting to try to understand why they behave the way they do. This understanding leads to sympathy, tolerance, and kindness. The author references a quote that says knowing everything about someone leads to forgiveness, and points out that even God does not judge humanity until their final days.

### Principle 1: Don't criticize, condemn, or complain.

Summary: The author introduces the first principle, whic

In [8]:
print("Generating final output")
final_output = generate_logical_flow(generated_key_ideas)
print(final_output.content)
# output final output to file
f = open("./../v1/BookProcessed/the-4-hour-workweek/final_output.txt", "w")
f.write(final_output.content)

Generating final output
## Key Idea: Understanding and Sympathy Breed Kindness

### Seeing the Vulnerability in Others

The author reflects on their initial perception of the reader as a man, but upon seeing the reader in a tired and exhausted state, they realize that the reader is still a baby. The author acknowledges that they have asked too much from the reader and expresses empathy for the reader's struggles.

### Understanding Others Instead of Condemning Them

The author suggests that instead of criticizing people, it is more beneficial and interesting to try to understand why they behave the way they do. This understanding leads to sympathy, tolerance, and kindness. The author references a quote that says knowing everything about someone leads to forgiveness, and points out that even God does not judge humanity until their final days.

### Don't criticize, condemn, or complain.

The author introduces the first principle, which is to avoid criticizing, condemning, or complaining 

20542