In [None]:
import nltk
import pandas as pd
import re
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from collections import Counter

# Download the NLTK datasets used by this notebook (tokenizer data, stop-word lists)
for nltk_resource in ('punkt', 'stopwords'):
    nltk.download(nltk_resource)

In [None]:
import sys
import pandas as pd
import csv

# Increase field size limit
# csv.field_size_limit() raises OverflowError when the requested value does
# not fit in the platform's C long (sys.maxsize is too large where C long is
# 32-bit, e.g. on Windows). Shrink the request until it is accepted instead of
# letting the cell crash.
field_size_limit = sys.maxsize
while True:
    try:
        csv.field_size_limit(field_size_limit)
        break
    except OverflowError:
        field_size_limit //= 10

# Define chunk size and sampling ratio
CHUNK_SIZE = 100000  # Read 100,000 rows at a time
SAMPLE_RATIO = 0.10  # Extract 10% of total data
RANDOM_SEED = 42     # Base seed; every chunk derives its own seed from it

# Stream the CSV in chunks so the full file never has to fit in memory.
# The reader is used as a context manager so the underlying file handle is
# closed even if parsing or sampling fails part-way through.
with pd.read_csv(
    "news.csv",
    usecols=["content", "type"],
    dtype=str,
    encoding="utf-8",
    on_bad_lines="skip",
    low_memory=True,
    chunksize=CHUNK_SIZE,
    engine="python",
) as chunk_reader:
    # Sample 10% of each chunk. The seed is offset by the chunk index:
    # reusing one fixed seed for every chunk would draw the same row
    # positions from every equally sized chunk, so the per-chunk samples
    # would be correlated rather than independent. The run as a whole stays
    # reproducible because the seeds are deterministic.
    sampled_chunks = [
        chunk.sample(frac=SAMPLE_RATIO, random_state=RANDOM_SEED + chunk_index)
        for chunk_index, chunk in enumerate(chunk_reader)
    ]

# Combine all sampled chunks
df_sampled = pd.concat(sampled_chunks, ignore_index=True)

print(f"Final Sampled Dataset Size: {len(df_sampled)} rows")
print(df_sampled.head())


In [None]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import pandas as pd
from joblib import Parallel, delayed

# Fetch the NLTK stop-word corpus needed below
nltk.download('stopwords')

# Shared preprocessing resources: English stop-word lookup set and Porter stemmer
stop_words = set(stopwords.words('english'))
stemmer = PorterStemmer()

# df_sampled comes from the sampling cell above; discard (in place) every row
# that is missing either the article text or its label
df_sampled.dropna(inplace=True, subset=["content", "type"])

def preprocess_text(text):
    """Normalise one document into a string of space-separated stemmed tokens.

    Steps: lowercase, replace every character that is not ``a-z`` or
    whitespace with a space, split on whitespace, drop tokens found in the
    module-level ``stop_words`` set, stem the rest with the module-level
    ``stemmer``, and join with single spaces.

    Args:
        text: The raw document as a ``str``.

    Returns:
        The processed text; an empty string when no token survives.
    """
    # Non-letters (punctuation, digits, symbols) become a space rather than
    # being deleted. Deleting them glued neighbouring words together
    # ("well-known" -> "wellknown", "and/or" -> "andor", "end.Start" ->
    # "endstart") and collapsed contractions into new tokens ("don't" ->
    # "dont") before the stop-word check ran.
    letters_only = re.sub(r'[^a-z\s]', ' ', text.lower())
    # split() with no argument also collapses the extra whitespace that the
    # substitution introduces, so no empty tokens are produced.
    return ' '.join(
        stemmer.stem(word)
        for word in letters_only.split()
        if word not in stop_words
    )

def process_chunk(chunk):
    """Apply ``preprocess_text`` to the ``content`` column of one chunk.

    Args:
        chunk: DataFrame containing at least the ``content`` and ``type``
            columns. It is not modified.

    Returns:
        A new DataFrame with the columns ``content``, ``type`` and
        ``processed_content``, where ``processed_content`` holds
        ``preprocess_text`` applied to each ``content`` value.
    """
    # assign() builds a new frame instead of writing a column into `chunk`.
    # The chunk may be a slice of a larger DataFrame, so assigning into it
    # would mutate the caller's data and can trigger SettingWithCopyWarning.
    processed = chunk.assign(processed_content=chunk['content'].apply(preprocess_text))
    return processed[['content', 'type', 'processed_content']]

# Cut df_sampled into consecutive row slices that can be processed independently
chunk_size = 50000  # Rows per slice; adjust this depending on memory constraints
chunk_starts = range(0, len(df_sampled), chunk_size)
chunks = [df_sampled.iloc[start:start + chunk_size] for start in chunk_starts]

# Run process_chunk over every slice in parallel with joblib
run_parallel = Parallel(n_jobs=-1)
processed_chunks = run_parallel(delayed(process_chunk)(chunk) for chunk in chunks)

# Stitch the per-slice results back into a single frame with a fresh index
df_processed = pd.concat(processed_chunks, ignore_index=True)

# Persist the result as CSV (UTF-8, without the index column)
df_processed.to_csv("preprocessed_news.csv", encoding="utf-8", index=False)

print("Preprocessed data saved to 'preprocessed_news.csv'")
