In [2]:
pip install nltk

Collecting nltk
  Using cached nltk-3.9.1-py3-none-any.whl.metadata (2.9 kB)
Using cached nltk-3.9.1-py3-none-any.whl (1.5 MB)
Installing collected packages: nltk
Successfully installed nltk-3.9.1
Note: you may need to restart the kernel to use updated packages.


In [3]:
import pandas as pd
import nltk
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

# 1. Load Data

In [5]:
# Read the article CSV into a DataFrame (the path is relative to the current
# working directory) and report how many rows were loaded.
df = pd.read_csv("south_korea_news_all.csv")
print(f"Loaded {len(df)} articles.")

Loaded 4254 articles.


# 2. Handle Missing Values 

In [6]:
# Replace missing values with empty strings and force both text columns to str,
# so later string concatenation and cleaning never see NaN.
for text_col in ('headline', 'snippet'):
    df[text_col] = df[text_col].fillna('').astype(str)

# 3. Combine Text 

In [7]:
# Build one text field per article: headline, a single space, then the snippet.
df['full_text'] = df['headline'].add(" ").add(df['snippet'])
print("Combined headline and snippet into 'full_text'.")


Combined headline and snippet into 'full_text'.


# 4. Define Cleaning Function 

In [8]:
def clean_text(text):
    """Normalize raw article text ahead of tokenization.

    Steps, in order: lowercase; remove URLs; remove @mentions and '#' signs;
    remove apostrophes; replace every other punctuation character (and
    underscores) with a space; remove digits; collapse whitespace.

    Args:
        text: Raw text. ``None`` and NaN are treated as empty text; any other
            non-string value is converted with ``str()``.

    Returns:
        str: Lowercased text containing only letters separated by single
        spaces, with no leading or trailing whitespace.
    """
    # Missing values (None / float NaN) clean to an empty string instead of raising.
    if text is None or (isinstance(text, float) and text != text):
        return ''

    text = str(text).lower()  # Lowercase
    # Remove URLs. Require "http(s)://" or "www." so ordinary words that merely
    # start with "http"/"www" are not deleted.
    text = re.sub(r'\b(?:https?://|www\.)\S+', '', text)
    text = re.sub(r'@\w+|#', '', text)  # Remove @mentions and '#'
    # Drop apostrophes (straight and curly) without inserting a space so that
    # contractions such as "aren't" stay a single token ("arent").
    text = re.sub(r"['’‘]", '', text)
    # Replace remaining punctuation with a space rather than deleting it, so that
    # "plane-crash" becomes "plane crash" instead of the fused "planecrash".
    # '_' is listed explicitly because \w treats it as a word character.
    text = re.sub(r'[^\w\s]|_', ' ', text)
    text = re.sub(r'\d+', '', text)  # Remove numbers
    # Collapse the whitespace runs left behind by the removals above.
    return re.sub(r'\s+', ' ', text).strip()

# Announce the step, then run clean_text over every combined article string.
print("Cleaning text (lowercase, removing URLs, punctuation, numbers)...")
df['cleaned_text'] = df['full_text'].map(clean_text)

Cleaning text (lowercase, removing URLs, punctuation, numbers)...


In [13]:
import nltk

# Fetch every NLTK resource the cells below rely on, so the notebook runs on a
# machine with an empty nltk_data directory.
nltk.download('punkt')
nltk.download('punkt_tab')  # tokenizer data that word_tokenize loads in NLTK 3.9+
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger')
nltk.download('averaged_perceptron_tagger_eng')  # tagger model that nltk.pos_tag loads in NLTK 3.9+
nltk.download('wordnet')  # corpus required by WordNetLemmatizer
nltk.download('omw-1.4')

[nltk_data] Downloading package punkt to /Users/user/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /Users/user/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/user/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data] Downloading package omw-1.4 to /Users/user/nltk_data...


True

# 5-8. Define NLTK Processing Function

In [16]:
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords


# One shared lemmatizer instance, reused for every token.
lemmatizer = WordNetLemmatizer()
# English stopwords held in a set for fast membership tests.
stop_words = {*stopwords.words('english')}

In [17]:
def get_wordnet_pos(treebank_tag):
    """Translate a treebank-style POS tag into the matching WordNet POS constant.

    Args:
        treebank_tag: POS tag string; only its first letter is inspected.

    Returns:
        The WordNet constant for adjective ('J...'), verb ('V...') or
        adverb ('R...'); noun for 'N...' tags and for anything else.
    """
    wordnet = nltk.corpus.wordnet
    if treebank_tag.startswith('J'):
        return wordnet.ADJ
    if treebank_tag.startswith('V'):
        return wordnet.VERB
    if treebank_tag.startswith('R'):
        return wordnet.ADV
    # 'N...' tags and every unrecognised tag are both treated as nouns.
    return wordnet.NOUN

def process_text(text):
    """Turn cleaned text into a list of noun/adjective lemmas.

    The text is tokenized and POS-tagged; only tokens tagged as nouns ('NN*')
    or adjectives ('JJ*') are lemmatized. A lemma is kept when it is not a
    stopword and is longer than two characters.

    Args:
        text: Cleaned article text.

    Returns:
        list: Kept lemmas, in their original order.
    """
    tagged_tokens = nltk.pos_tag(word_tokenize(text))  # 5. Tokenize, 6. POS tag

    kept_lemmas = []
    for token, pos in tagged_tokens:
        # 6. Keep only nouns (NN*) and adjectives (JJ*)
        if not pos.startswith(('NN', 'JJ')):
            continue
        # 7. Lemmatize with the WordNet POS that matches the tag
        lemma = lemmatizer.lemmatize(token, pos=get_wordnet_pos(pos))
        # 8. Skip stopwords and words of two characters or fewer
        if lemma in stop_words or len(lemma) <= 2:
            continue
        kept_lemmas.append(lemma)

    return kept_lemmas

print("Processing text (tokenizing, POS tagging, filtering Nouns/Adjectives, lemmatizing, removing stopwords)...")
# This step can take a few minutes for 4000+ articles
df['processed_tokens'] = df['cleaned_text'].map(process_text)

# Show the first rows of the columns most useful for eyeballing the result.
print("\nPreprocessing Complete! Here's a sample:")
preview_columns = ['source', 'headline', 'processed_tokens']
print(df.loc[:, preview_columns].head())

# --- Optional: Save the Processed Data ---


Processing text (tokenizing, POS tagging, filtering Nouns/Adjectives, lemmatizing, removing stopwords)...

Preprocessing Complete! Here's a sample:
         source                                           headline  \
0  The Guardian  Trump v the world: Inside the 3 January Guardi...   
1  The Guardian  South Korea plane crash investigators turn to ...   
2  The Guardian  South Korea plane crash investigations focus o...   
3  The Guardian  ‘Sex strikes’ aren’t the feminist win they app...   
4  The Guardian  Green light: the boss of GB Railfreight with a...   

                                    processed_tokens  
0  [trump, world, january, guardian, weekly, glob...  
1  [korea, plane, crash, investigator, black, box...  
2  [korea, plane, crash, investigation, role, air...  
3  [sex, strike, feminist, radical, problem, move...  
4  [green, bos, railfreight, eye, environment, te...  


In [18]:
# Write the whole DataFrame (original columns plus the derived text columns) to
# CSV without the index column.
# NOTE(review): 'processed_tokens' holds Python lists, which to_csv stores as
# their string repr; a reader must parse them back (e.g. ast.literal_eval) —
# confirm downstream handling.
df.to_csv("south_korea_news_processed.csv", index=False)
print("\nSaved processed data to 'south_korea_news_processed.csv'")


Saved processed data to 'south_korea_news_processed.csv'
