<a href="https://colab.research.google.com/github/priyanshunayak05/NLP/blob/main/GOT.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install gensim

Collecting gensim
  Downloading gensim-4.4.0-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl.metadata (8.4 kB)
Downloading gensim-4.4.0-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl (27.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m27.9/27.9 MB[0m [31m51.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: gensim
Successfully installed gensim-4.4.0


In [8]:
from zipfile import ZipFile

# Where the uploaded archive lives and the directory it is unpacked into.
zip_path = '/content/archive.zip'
extract_path = '/content/archive'

# Unpack every member of the archive; the context manager closes the zip
# file handle once extraction finishes (or fails).
with ZipFile(zip_path, mode='r') as archive:
    archive.extractall(path=extract_path)

print("✅ Unzipped successfully to:", extract_path)

✅ Unzipped successfully to: /content/archive


In [2]:
import os
import nltk
from nltk import sent_tokenize
from gensim.utils import simple_preprocess
from gensim.models import Word2Vec
from sklearn.decomposition import PCA
import plotly.express as px

In [9]:
# Plain-text files of the five books, in reading order, as extracted into
# /content/archive by the unzip cell.
# The paths use a single leading slash: a path starting with exactly two
# slashes is implementation-defined on POSIX, and a single slash also matches
# the extraction directory used above.
book_files = [
    "/content/archive/1 - A Game of Thrones.txt",
    "/content/archive/2 - A Clash of Kings.txt",
    "/content/archive/3 - A Storm of Swords.txt",
    "/content/archive/4 - A Feast for Crows.txt",
    "/content/archive/5 - A Dance with Dragons.txt"
]


In [10]:
# Fetch the Punkt sentence-tokenizer data (presumably what sent_tokenize
# needs); the call reports "already up-to-date" when the data is cached.
nltk.download('punkt_tab')

# Registry filled by the training loop: book title -> trained Word2Vec model.
models = dict()

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


In [11]:
# Train one Word2Vec model per book and print a few sanity-check queries.
#
# Reads `book_files` (list of .txt paths) and stores each trained model in
# `models` under the book's title (file name without the ".txt" suffix).
for book_path in book_files:
    assert os.path.exists(book_path), f"File not found: {book_path}"
    book_name = os.path.basename(book_path).replace(".txt", "")
    print(f"\nProcessing book: {book_name}")

    # Order matters here. 'latin-1' maps every possible byte to a character,
    # so it never raises UnicodeDecodeError and any encoding listed after it
    # can never be tried. 'cp1252' is stricter and decodes Windows "smart"
    # punctuation (bytes 0x80-0x9F) to the intended characters, so it goes
    # first; 'latin-1' stays as the always-succeeding last resort.
    encodings_to_try = ['utf-8', 'cp1252', 'latin-1']
    book_text = None
    for encoding in encodings_to_try:
        try:
            with open(book_path, "r", encoding=encoding) as f:
                book_text = f.read()
        except UnicodeDecodeError:
            print(f"Failed to read with encoding: {encoding}")
        else:
            print(f"Successfully read with encoding: {encoding}")
            break

    # Defensive: only reachable if the list above no longer ends with an
    # encoding that accepts every byte.
    if book_text is None:
        print(f"Could not read file {book_path} with any of the attempted encodings.")
        continue

    print(f"First 300 characters:\n{book_text[:300]}\n")
    print(f"Total characters in file: {len(book_text)}")

    # Split into sentences, then lowercase/tokenize each sentence.
    sentences_raw = sent_tokenize(book_text)
    tokenized_lines = [simple_preprocess(line) for line in sentences_raw]

    print(f"Total sentences detected: {len(tokenized_lines)}")
    print("Example tokens:", tokenized_lines[0][:15])

    # 100-dimensional embeddings, context window of 10 tokens, words seen
    # fewer than 2 times are dropped, 4 worker threads.
    got_vec = Word2Vec(
        vector_size=100,
        window=10,
        min_count=2,
        workers=4
    )

    # Two-step training: build the vocabulary first, then train using the
    # sentence count and epoch count recorded on the model itself.
    got_vec.build_vocab(tokenized_lines)
    got_vec.train(tokenized_lines, total_examples=got_vec.corpus_count, epochs=got_vec.epochs)

    models[book_name] = got_vec

    # NOTE: the queries below raise KeyError if a probe word did not make it
    # into this book's vocabulary.
    print(f"\nTop words similar to 'king' in {book_name}:")
    print(got_vec.wv.most_similar('king', topn=5))

    print(f"\nTop words similar to 'queen' in {book_name}:")
    print(got_vec.wv.most_similar('queen', topn=5))

    print(f"\nSimilarity between 'winter' and 'snow' in {book_name}: {got_vec.wv.similarity('winter', 'snow'):.3f}")

    print("-" * 100)


Processing book: 1 - A Game of Thrones
Successfully read with encoding: utf-8
First 300 characters:
A Game Of Thrones 
Book One of A Song of Ice and Fire 
By George R. R. Martin 
PROLOGUE 
"We should start back," Gared urged as the woods began to grow dark around them. "The wildlings are 
dead." 
"Do the dead frighten you?" Ser Waymar Royce asked with just the hint of a smile. 
Gared did not rise 

Total characters in file: 1607894
Total sentences detected: 27244
Example tokens: ['game', 'of', 'thrones', 'book', 'one', 'of', 'song', 'of', 'ice', 'and', 'fire', 'by', 'george', 'martin', 'prologue']

Top words similar to 'king' in 1 - A Game of Thrones:
[('protector', 0.8736321330070496), ('stark', 0.8620019555091858), ('warden', 0.860680878162384), ('place', 0.856772243976593), ('grace', 0.8535127639770508)]

Top words similar to 'queen' in 1 - A Game of Thrones:
[('name', 0.9891217947006226), ('daughter', 0.9870349764823914), ('visit', 0.9831856489181519), ('yes', 0.9823839664459229),

In [12]:
# Project one per-book model's embeddings to 3-D with PCA and plot them.
# NOTE(review): despite the "first_" prefix, index 2 selects the THIRD book
# stored in `models` — confirm which book is intended.
first_book = list(models)[2]
first_model = models[first_book]

# Unit-length vectors and the matching vocabulary, taken from the same
# KeyedVectors object so rows and words line up.
keyed_vectors = first_model.wv
vectors = keyed_vectors.get_normed_vectors()
words = keyed_vectors.index_to_key
print(f"\nEmbedding shape for {first_book}: {vectors.shape}")

# PCA is fitted on the full vocabulary; only a slice is plotted below.
pca_reducer = PCA(n_components=3)
reduced_vecs = pca_reducer.fit_transform(vectors)

# Plot only the first 500 vocabulary entries (presumably the most frequent
# words — confirm against gensim's vocabulary ordering).
top_points = reduced_vecs[:500]
fig = px.scatter_3d(
    x=top_points[:, 0],
    y=top_points[:, 1],
    z=top_points[:, 2],
    color=words[:500],
    title=f"3D PCA Visualization of Word Embeddings ({first_book})"
)
fig.show()


Embedding shape for 3 - A Storm of Swords: (9475, 100)


In [15]:
# Nearest neighbours of "jon" in the model trained on book 1 only.
book_one_vectors = models["1 - A Game of Thrones"].wv
book_one_vectors.most_similar("jon")



[('catelyn', 0.9684789180755615),
 ('he', 0.9654690623283386),
 ('sansa', 0.9633117318153381),
 ('robb', 0.9582601189613342),
 ('ned', 0.9581423401832581),
 ('bran', 0.9561675786972046),
 ('arya', 0.9553037881851196),
 ('dany', 0.9374716877937317),
 ('she', 0.9165962934494019),
 ('tyrion', 0.9107663631439209)]

In [14]:
# Nearest neighbours of "jon" in the model trained on book 5 only.
book_five_vectors = models["5 - A Dance with Dragons"].wv
book_five_vectors.most_similar("jon")

[('tyrion', 0.8901233077049255),
 ('theon', 0.8304246068000793),
 ('he', 0.8299280405044556),
 ('himself', 0.8217923045158386),
 ('davos', 0.8102646470069885),
 ('ramsay', 0.8090726137161255),
 ('boy', 0.8012819290161133),
 ('ygritte', 0.7944051623344421),
 ('reek', 0.7903745174407959),
 ('girl', 0.7759020328521729)]

**Combined: a single Word2Vec model trained on all five books**

In [17]:
import os
import nltk
from nltk import sent_tokenize
from gensim.utils import simple_preprocess
from gensim.models import Word2Vec
from sklearn.decomposition import PCA
import plotly.express as px

# Plain-text files of the five books, in reading order.
# The paths use a single leading slash: a path starting with exactly two
# slashes is implementation-defined on POSIX.
book_files = [
    "/content/archive/1 - A Game of Thrones.txt",
    "/content/archive/2 - A Clash of Kings.txt",
    "/content/archive/3 - A Storm of Swords.txt",
    "/content/archive/4 - A Feast for Crows.txt",
    "/content/archive/5 - A Dance with Dragons.txt"
]

def read_text_safely(file_path, encodings=("utf-8", "cp1252", "latin-1")):
    """Read a text file, trying several encodings until one decodes cleanly.

    The encodings are tried in order. The default order matters: 'latin-1'
    maps every possible byte to a character and therefore never fails, so it
    must come last; placing 'cp1252' before it lets Windows "smart"
    punctuation (bytes 0x80-0x9F) decode to the intended characters instead
    of control characters.

    Args:
        file_path: Path of the file to read.
        encodings: Encoding names to try, in order.

    Returns:
        The decoded file contents, or an empty string if every encoding
        raised UnicodeDecodeError.
    """
    for enc in encodings:
        try:
            with open(file_path, "r", encoding=enc) as f:
                return f.read()
        except UnicodeDecodeError:
            print(f"Failed with encoding {enc}, trying next...")
    print(f"Could not read file {file_path} with available encodings.")
    return ""

# --- Build one corpus from all books ---------------------------------------
# Each book's text is followed by a newline so consecutive books are separated.
book_chunks = []
for file_path in book_files:
    assert os.path.exists(file_path), f"File not found: {file_path}"
    book_text = read_text_safely(file_path)
    book_chunks.append(book_text + "\n")
merged_text = "".join(book_chunks)

print("All books combined successfully!")
print("Preview:\n", merged_text[:500])
print("Total characters:", len(merged_text))

# --- Sentence-split and tokenize -------------------------------------------
nltk.download("punkt_tab")
sent_chunks = sent_tokenize(merged_text)
token_clusters = list(map(simple_preprocess, sent_chunks))

print("Total sentences detected:", len(token_clusters))
print("Sample tokenized output:", token_clusters[0][:20])

# --- Train a single Word2Vec model on the merged corpus --------------------
got_all_vec = Word2Vec(vector_size=100, window=10, min_count=3, workers=4)
got_all_vec.build_vocab(token_clusters)
got_all_vec.train(
    token_clusters,
    total_examples=got_all_vec.corpus_count,
    epochs=got_all_vec.epochs,
)

# All queries below go through the same KeyedVectors object.
all_wv = got_all_vec.wv

# --- Sanity-check queries --------------------------------------------------
print("\nSimilar to 'king':")
print(all_wv.most_similar('king', topn=10))

print("\nSimilar to 'queen':")
print(all_wv.most_similar('queen', topn=10))

print("\nSimilar to 'winter':")
print(all_wv.most_similar('winter', topn=10))

print("\nOdd one out from ['king', 'queen', 'wolf', 'dragon']:")
print(all_wv.doesnt_match(['king', 'queen', 'wolf', 'dragon']))

print("\nSimilarity between 'winter' and 'snow':", all_wv.similarity('winter', 'snow'))

# --- 3-D PCA projection of the embeddings ----------------------------------
word_vectors_all = all_wv.get_normed_vectors()
print("Embedding matrix shape:", word_vectors_all.shape)

vocab_all = all_wv.index_to_key
pca_tool = PCA(n_components=3)
reduced_vectors_all = pca_tool.fit_transform(word_vectors_all)

# PCA is fitted on the whole vocabulary; only the first 500 entries are drawn.
plot_points = reduced_vectors_all[:500]
fig = px.scatter_3d(
    x=plot_points[:, 0],
    y=plot_points[:, 1],
    z=plot_points[:, 2],
    color=vocab_all[:500],
    title="3D PCA Visualization of GOT Word Embeddings (Safe Encodings)"
)
fig.show()


Failed with encoding utf-8, trying next...
Failed with encoding utf-8, trying next...
All books combined successfully!
Preview:
 A Game Of Thrones 
Book One of A Song of Ice and Fire 
By George R. R. Martin 
PROLOGUE 
"We should start back," Gared urged as the woods began to grow dark around them. "The wildlings are 
dead." 
"Do the dead frighten you?" Ser Waymar Royce asked with just the hint of a smile. 
Gared did not rise to the bait. He was an old man, past fifty, and he had seen the lordlings come and go. 
"Dead is dead," he said. "We have no business with the dead." 
"Are they dead?" Royce asked softly. "What proof 
Total characters: 9778338


[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


Total sentences detected: 145017
Sample tokenized output: ['game', 'of', 'thrones', 'book', 'one', 'of', 'song', 'of', 'ice', 'and', 'fire', 'by', 'george', 'martin', 'prologue', 'we', 'should', 'start', 'back', 'gared']

Similar to 'king':
[('baratheon', 0.7073614597320557), ('realm', 0.6656816601753235), ('prince', 0.6563164591789246), ('throne', 0.648015558719635), ('tourney', 0.6127120852470398), ('dragonstone', 0.5967557430267334), ('targaryen', 0.5965163111686707), ('council', 0.583748459815979), ('usurper', 0.5812526941299438), ('dorne', 0.5700675845146179)]

Similar to 'queen':
[('princess', 0.7732275724411011), ('margaery', 0.7234236598014832), ('daenerys', 0.7079919576644897), ('prince', 0.6794815063476562), ('myrcella', 0.6731645464897156), ('cersei', 0.6691210269927979), ('joffrey', 0.6690084934234619), ('stormborn', 0.6519742012023926), ('mother', 0.6444224119186401), ('elia', 0.6224506497383118)]

Similar to 'winter':
[('ruins', 0.7510684728622437), ('bracing', 0.74515956