In [None]:
import langchain
import nltk
import os
from nltk.tokenize import word_tokenize
import pandas as pd

In [None]:
# Create the directory and make sure it's in the path BEFORE downloading
os.makedirs("./nltk_data", exist_ok=True)
nltk.data.path.insert(0, "./nltk_data")  # Add to the beginning of the search path

In [10]:
# Download punkt and verify success
download_result = nltk.download('punkt', download_dir="./nltk_data")
print(f"Download successful: {download_result}")

Download successful: True


[nltk_data] Downloading package punkt to ./nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [11]:
# List available data to verify punkt is there
print(f"NLTK data path: {nltk.data.path}")
try:
    print(f"Available tokenizers: {os.listdir('./nltk_data/tokenizers')}")
except Exception as e:
    print(f"Error checking tokenizers: {e}")

NLTK data path: ['./nltk_data', './nltk_data', '/home/codespace/nltk_data', '/usr/local/python/3.12.1/nltk_data', '/usr/local/python/3.12.1/share/nltk_data', '/usr/local/python/3.12.1/lib/nltk_data', '/usr/share/nltk_data', '/usr/local/share/nltk_data', '/usr/lib/nltk_data', '/usr/local/lib/nltk_data', './nltk_data', './nltk_data']
Available tokenizers: ['punkt.zip', 'punkt']


In [12]:
# Try a simple tokenization to verify it works
try:
    test_tokens = word_tokenize("This is a test.")
    print(f"Test tokenization: {test_tokens}")
except Exception as e:
    print(f"Tokenization error: {e}")

Tokenization error: 
**********************************************************************
  Resource [93mpunkt_tab[0m not found.
  Please use the NLTK Downloader to obtain the resource:

  [31m>>> import nltk
  >>> nltk.download('punkt_tab')
  [0m
  For more information see: https://www.nltk.org/data.html

  Attempted to load [93mtokenizers/punkt_tab/english/[0m

  Searched in:
    - './nltk_data'
    - './nltk_data'
    - '/home/codespace/nltk_data'
    - '/usr/local/python/3.12.1/nltk_data'
    - '/usr/local/python/3.12.1/share/nltk_data'
    - '/usr/local/python/3.12.1/lib/nltk_data'
    - '/usr/share/nltk_data'
    - '/usr/local/share/nltk_data'
    - '/usr/lib/nltk_data'
    - '/usr/local/lib/nltk_data'
    - './nltk_data'
    - './nltk_data'
**********************************************************************



In [13]:
# Now continue with your code
corpus = [
    "best view comes after the hardest climb",
    "it always seems impossible until it's done",
]

In [14]:
def get_bow_representation(corpus, frequency=True):
    # Simplify vocabulary creation to avoid potential issues
    vocabulary = set()
    for sentence in corpus:
        words = sentence.lower().split()
        vocabulary.update(words)
    
    print(f"Vocabulary: {vocabulary}")
    
    bow_rep = []
    for sentence in corpus:
        sentence_rep = {v: 0 for v in vocabulary}
        # Use simple split as a fallback if tokenization fails
        try:
            words = word_tokenize(sentence.lower())
            print(f"Tokenized: {words}")
        except:
            words = sentence.lower().split()
            print(f"Fallback split: {words}")
            
        for word in words:
            if word in vocabulary:
                if frequency:
                    sentence_rep[word] += 1
                else:
                    sentence_rep[word] = 1
        bow_rep.append(sentence_rep)
    return bow_rep

In [None]:
# Try your function
bow_representation = get_bow_representation(corpus, True)
df = pd.DataFrame(bow_representation)
df.index = corpus
print(df)  # Use print instead of display for debugging

Vocabulary: {'climb', 'hardest', 'until', 'impossible', 'best', "it's", 'done', 'view', 'always', 'after', 'it', 'the', 'seems', 'comes'}
Fallback split: ['best', 'view', 'comes', 'after', 'the', 'hardest', 'climb']
Fallback split: ['it', 'always', 'seems', 'impossible', 'until', "it's", 'done']
                                            climb  hardest  until  impossible  \
best view comes after the hardest climb         1        1      0           0   
it always seems impossible until it's done      0        0      1           1   

                                            best  it's  done  view  always  \
best view comes after the hardest climb        1     0     0     1       0   
it always seems impossible until it's done     0     1     1     0       1   

                                            after  it  the  seems  comes  
best view comes after the hardest climb         1   0    1      0      1  
it always seems impossible until it's done      0   1    0      1      0  