In [1]:
import nltk
# Download the 'punkt' package into the local NLTK data directory.
# The call returns True on success, which the notebook echoes as the cell result.
nltk.download('punkt')




[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\akqp4\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.


True

In [2]:
from nltk.tokenize import TreebankWordTokenizer

# Quick sanity check of the tokenizer on a single sample question.
tokenizer = TreebankWordTokenizer()

text = "Why are viruses different from other microbes?"
# Lowercase before tokenizing; the trailing '?' comes out as its own token.
tokens = tokenizer.tokenize(text.lower())
print(tokens)


['why', 'are', 'viruses', 'different', 'from', 'other', 'microbes', '?']


In [9]:
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords, wordnet
from nltk.tokenize import TreebankWordTokenizer
from nltk import pos_tag
from nltk.stem import WordNetLemmatizer

# Fetch the NLTK data packages used below (do this once)
for resource_name in (
    'stopwords',
    'averaged_perceptron_tagger',
    'averaged_perceptron_tagger_eng',  # New English-specific tagger
    'wordnet',
    'omw-1.4',
):
    nltk.download(resource_name)

# Shared objects used by the preprocessing function
tokenizer = TreebankWordTokenizer()  # This tokenizer does NOT require 'punkt'
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

# Convert POS tags to WordNet format
def get_wordnet_pos(tag):
    """Translate a POS tag into the matching WordNet part-of-speech constant.

    The start of the tag decides the result: 'J' -> adjective, 'V' -> verb,
    'N' -> noun, 'R' -> adverb. Any other tag is treated as a noun.
    """
    prefix_to_pos = (
        ('J', wordnet.ADJ),
        ('V', wordnet.VERB),
        ('N', wordnet.NOUN),
        ('R', wordnet.ADV),
    )
    for prefix, wordnet_pos in prefix_to_pos:
        if tag.startswith(prefix):
            return wordnet_pos
    # Unrecognised tags fall back to noun
    return wordnet.NOUN

# Preprocessing function using lemmatization and TreebankWordTokenizer
def preprocess_text(text):
    """Normalize one text cell into a space-joined string of lemmas.

    Steps: lowercase, replace every non-letter character with a space,
    tokenize, drop English stopwords, POS-tag the remaining tokens, and
    lemmatize each token using its WordNet part of speech.

    Args:
        text: The cell value to clean. Missing values (None / NaN) yield "".
            Non-string values are converted with ``str`` before cleaning.

    Returns:
        str: The lemmatized tokens joined by single spaces, or "" when the
        value is missing or processing fails.

    Note:
        This is best-effort: any exception is reported with ``print`` and
        swallowed, so callers never see an exception from this function.
    """
    try:
        if pd.isna(text):
            return ""

        # 1. Lowercase (coerce first so non-string cells do not fail on .lower())
        text = str(text).lower()

        # 2. Replace non-alphabetic characters with a space (keep whitespace).
        #    Substituting a space instead of deleting keeps words separated by
        #    punctuation apart ("well-known" -> "well known", not "wellknown")
        #    and splits contractions into fragments ("isn't" -> "isn t") that
        #    NLTK's English stopword list contains, so step 4 can remove them.
        text = re.sub(r'[^a-z\s]', ' ', text)

        # 3. Tokenize using TreebankWordTokenizer (no punkt required)
        tokens = tokenizer.tokenize(text)

        # 4. Remove stopwords
        tokens = [word for word in tokens if word not in stop_words]

        # 5. POS tagging
        pos_tags = pos_tag(tokens)

        # 6. Lemmatize with POS
        lemmatized = [lemmatizer.lemmatize(word, get_wordnet_pos(pos)) for word, pos in pos_tags]

        # Return clean, lemmatized text string
        return ' '.join(lemmatized)

    except Exception as e:
        # Deliberate best-effort: report the failure and return an empty result
        print(f"Error processing text: {text}")
        print(f"Exception: {e}")
        return ""

# Load dataset
file_path = "science_qa.csv"  # Update this path if needed
df = pd.read_csv(file_path)

print(f"Original dataset shape: {df.shape}")
print(f"Columns: {df.columns.tolist()}")
print("First few rows of original data:")
print(df.head())
print("\n")

# Locate the question and answer columns by case-insensitive header match
available_cols = df.columns.tolist()
question_col = None
answer_col = None

for col in available_cols:
    if col.lower() == 'question':
        question_col = col
    elif col.lower() == 'answer':
        answer_col = col

if not question_col or not answer_col:
    # Raise instead of calling exit(): in a notebook exit() asks the kernel to
    # shut down (losing all state), whereas an exception just stops this cell
    # and shows the reason.
    raise ValueError(
        "Could not find 'question' and 'answer' columns! "
        f"Available columns: {available_cols}"
    )

print(f"Using columns: '{question_col}' and '{answer_col}'\n")

# Work on a copy that holds just the Question and Answer columns (skip timestamp)
columns_to_process = [question_col, answer_col]
df_processed = df[columns_to_process].copy()

total_rows = len(df_processed)
print("Starting preprocessing...\n")
print(f"Total rows to process: {total_rows}\n")

# Per-column tallies; zeroed again after each column's summary is printed
success_count = error_count = 0

# Clean every cell of each selected column
for col in columns_to_process:
    print(f"Processing column: {col}")
    processed_values = []

    for row_number, value in enumerate(df_processed[col], start=1):
        try:
            processed_values.append(preprocess_text(value))
            success_count += 1

            # Progress line after every 100th row
            if row_number % 100 == 0:
                print(f"  Processed {row_number}/{total_rows} rows...")
        except Exception as e:
            # Report the zero-based row position and keep an empty placeholder
            print(f"  Error at row {row_number - 1}: {e}")
            processed_values.append("")
            error_count += 1

    df_processed[col] = processed_values
    print(f"✓ Completed {col}")
    print(f"  Success: {success_count}, Errors: {error_count}\n")

    # Start the next column from zero
    success_count = error_count = 0

# Write the cleaned frame to disk without the index column
output_path = "preprocessed_science_qa.csv"
df_processed.to_csv(output_path, index=False)

# Completion banner framed by two rule lines
rule = "=" * 60
print(rule, "Preprocessing completed successfully!", rule, sep="\n")

# Summary of what was written, then a preview of both ends of the result
print(f"\nOutput saved to: {output_path}")
print(f"Total rows processed: {len(df_processed)}")
print(f"Original CSV had: {len(df)} rows")
print("\nFirst 10 rows of preprocessed data:")
print(df_processed.head(10))
print("\nLast 5 rows of preprocessed data:")
print(df_processed.tail(5))

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\akqp4\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\akqp4\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     C:\Users\akqp4\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_eng is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\akqp4\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\akqp4\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


Original dataset shape: (980, 2)
Columns: ['Question', 'Answer']
First few rows of original data:
                                            Question  \
0      What is the Heisenberg Uncertainty Principle?   
1  What is the chemical formula for rust?",Fe₂O₃ ...   
2  What is the name of the galaxy our solar syste...   
3                           What causes ocean tides?   
4                                What is a catalyst?   

                                              Answer  
0  It states that the position and momentum of a ...  
1  To generate energy (ATP) through cellular resp...  
2                              The Milky Way galaxy.  
3  Primarily the gravitational pull of the Moon a...  
4  A substance that speeds up a chemical reaction...  


Using columns: 'Question' and 'Answer'

Starting preprocessing...

Total rows to process: 980

Processing column: Question
  Processed 100/980 rows...
  Processed 200/980 rows...
  Processed 300/980 rows...
  Processed 400/980 rows..