In [7]:
import pandas as pd
import nltk
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

In [8]:

print("Loading r/KoreanFood data...")
try:
    # Raw scrape of the subreddit; expected in the notebook's working directory.
    df_kf = pd.read_csv("korean_food_subreddit_raw.csv")
except FileNotFoundError:
    print("Error: 'korean_food_subreddit_raw.csv' not found. Did you save it?")
    # Re-raise instead of calling exit(): exit() is the interactive-shell helper
    # and, inside a notebook, ends the session rather than reporting the failure.
    # Re-raising keeps the traceback and stops a "Run All" at this cell, so later
    # cells never run against an undefined df_kf.
    raise
else:
    print(f"Loaded {len(df_kf)} posts from r/KoreanFood.")

Loading r/KoreanFood data...
Loaded 500 posts from r/KoreanFood.


In [9]:
# --- 2. Handle Missing Values & Ensure 'full_text' Exists ---
if 'full_text' in df_kf.columns:
    # Column already present: blank out missing values and force string values.
    df_kf['full_text'] = df_kf['full_text'].fillna('').astype(str)
else:
    # Column absent: normalise the three source text fields the same way
    # (missing -> '', everything cast to str), then join them with single spaces.
    for text_column in ('title', 'selftext', 'comments'):
        df_kf[text_column] = df_kf[text_column].fillna('').astype(str)
    df_kf['full_text'] = (
        df_kf['title']
        + " "
        + df_kf['selftext']
        + " "
        + df_kf['comments']
    )
print("'full_text' column prepared.")

'full_text' column prepared.


In [10]:
# --- 3. Define Cleaning Function (Same as before) ---
def clean_text(text):
    """Normalise a raw post string for tokenisation.

    The text is lower-cased and then passed through a fixed sequence of
    regex substitutions, in this order:

    1. strip URLs (anything starting with ``http``, ``www`` or ``https``),
    2. strip ``@mentions`` and bare ``#`` characters,
    3. turn ``|`` into a space,
    4. drop every character that is neither a word character nor whitespace,
    5. drop runs of digits.

    Leading and trailing whitespace is removed at the end; interior
    whitespace is left as-is, so removed tokens can leave double spaces.

    Args:
        text: The string to clean.

    Returns:
        The cleaned, lower-cased string.
    """
    # (pattern, replacement, flags) -- order matters: URLs must go before the
    # punctuation pass, otherwise their separators would already be stripped.
    substitutions = (
        (r'http\S+|www\S+|https\S+', '', re.MULTILINE),
        (r'\@\w+|\#', '', 0),
        (r'\|', ' ', 0),
        (r'[^\w\s]', '', 0),
        (r'\d+', '', 0),
    )
    cleaned = text.lower()
    for pattern, replacement, flags in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned, flags=flags)
    return cleaned.strip()

print("Cleaning r/KoreanFood text...")
# Run clean_text over every row's combined text and store the result in a new
# column, leaving 'full_text' itself untouched.
df_kf['cleaned_text'] = df_kf['full_text'].map(clean_text)

Cleaning r/KoreanFood text...


In [11]:
# --- 4. Define NLTK Processing Function (Using Expanded Stopwords) ---
# Layer 1: generic subreddit chatter (travel/food words, requests for help,
# praise, filler verbs). 'im' appears twice; harmless because the lists are
# folded into a set below.
custom_stopwords_layer1 = [
    'korea', 'korean', 'seoul', 'visit', 'trip', 'travel', 'tourist',
    'food', 'eat', 'restaurant', 'place', 'market', 'drink',
    'recommend', 'suggestion', 'anyone', 'know', 'thanks', 'thank',
    'help', 'advice', 'question', 'please', 'looking', 'wondering',
    'like', 'good', 'nice', 'great', 'amazing', 'delicious', 'best',
    'really', 'also', 'get', 'go', 'try', 'went', 'im', 'ive', 'im',
    'one', 'day', 'time', 'people', 'lot', 'would', 'us', 'got', 'much',
    'experience', 'english', 'sure', 'want', 'think', 'things', 'make',
    'made', 'recipe', 'cook', 'cooking' # Added some cooking words
]
# Layer 2: place names, transport and sightseeing vocabulary.
custom_stopwords_layer2 = [
    'station', 'hotel', 'street', 'museum', 'myeongdong', 'area', 'park',
    'city', 'night', 'busan', 'village', 'tour', 'bus', 'hour', 'many',
    'palace', 'thing', 'map', 'cafe', 'store', 'airport', 'dinner', 'way',
    'hongdae', 'taxi', 'card', 'line', 'itinerary', 'local', 'walk', 'train',
    'jeju', 'island', 'ticket', 'need', 'plan', 'check', 'book', 'look',
    'youre', 'first', 'shopping', 'kid', 'free', 'beach', 'hanok', 'morning',
    'option', 'google', 'friend', 'traditional', 'open', 'flight', 'small',
    'temple', 'last', 'bit', 'app', 'tower', 'view', 'dont', 'car', 'stop'
]

# Start from NLTK's English stop words, then fold both custom layers into the
# same set so membership checks during token filtering are a single lookup.
stop_words = set(stopwords.words('english'))
stop_words.update(custom_stopwords_layer1, custom_stopwords_layer2)

# Shared lemmatizer instance reused for every token.
lemmatizer = WordNetLemmatizer()

def get_wordnet_pos(treebank_tag):
    """Map a POS tag string to the corresponding WordNet POS constant.

    Only the tag's leading letter is inspected: ``J`` -> adjective,
    ``V`` -> verb, ``N`` -> noun, ``R`` -> adverb. Any other tag
    (including an empty string) falls back to noun.

    Args:
        treebank_tag: POS tag string, e.g. one of the tags returned by
            ``nltk.pos_tag``.

    Returns:
        One of ``nltk.corpus.wordnet``'s ``ADJ``, ``VERB``, ``NOUN`` or
        ``ADV`` constants.
    """
    wordnet = nltk.corpus.wordnet
    # Checked in order; the first matching prefix decides the result.
    prefix_to_constant = (
        ('J', 'ADJ'),
        ('V', 'VERB'),
        ('N', 'NOUN'),
        ('R', 'ADV'),
    )
    for prefix, constant_name in prefix_to_constant:
        if treebank_tag.startswith(prefix):
            return getattr(wordnet, constant_name)
    # Unknown tag families are treated as nouns.
    return wordnet.NOUN

def process_text_reddit(text):
    """Tokenise cleaned text and return its filtered noun/adjective lemmas.

    The text is tokenised with ``word_tokenize`` and tagged with
    ``nltk.pos_tag``. Only tokens whose tag starts with ``NN`` or ``JJ``
    are kept; each is lemmatised using the WordNet POS derived from its
    tag. A lemma is dropped if it is in ``stop_words`` or has two or
    fewer characters.

    Args:
        text: Already-cleaned text for one post.

    Returns:
        List of surviving lemmas, in their original order (duplicates kept).
    """
    tagged_tokens = nltk.pos_tag(word_tokenize(text))
    # Lazily lemmatise only the noun- and adjective-tagged tokens.
    candidate_lemmas = (
        lemmatizer.lemmatize(token, pos=get_wordnet_pos(pos_tag))
        for token, pos_tag in tagged_tokens
        if pos_tag.startswith(('NN', 'JJ'))
    )
    # Filtering is done on the lemma, not the surface form, so inflected
    # variants of a stop word are removed as well.
    return [
        lemma
        for lemma in candidate_lemmas
        if len(lemma) > 2 and lemma not in stop_words
    ]

print("Processing r/KoreanFood text (NLTK)...")
# Each row of 'processed_tokens' holds the list returned by process_text_reddit
# for that row's cleaned text.
df_kf['processed_tokens'] = df_kf['cleaned_text'].apply(process_text_reddit)

print("\nPreprocessing Complete! Here's a sample:")
# Preview the first rows: original title next to its extracted tokens.
print(df_kf.loc[:, ['title', 'processed_tokens']].head())


# Persist the whole frame (all columns, no index column). Saving is best-effort:
# any failure is reported but does not stop the notebook.
# NOTE(review): 'processed_tokens' holds Python lists; confirm how downstream
# readers of this CSV expect that column to be encoded.
try:
    df_kf.to_csv("korean_food_subreddit_processed.csv", index=False)
    print("\nSaved processed r/KoreanFood data to 'korean_food_subreddit_processed.csv'")
except Exception as err:
    print(f"\nError saving CSV: {err}")

Processing r/KoreanFood text (NLTK)...

Preprocessing Complete! Here's a sample:
                                               title  \
0  Every Korean mom has made this for their child...   
1  My girl made me dinner, she cooks me dinner on...   
2                  Beef Bulgogi and homemade banchan   
3     Gimbap for my sons first day at his new school   
4                                  How is my Kimbap?   

                                    processed_tokens  
0  [mom, child, point, parent, family, poor, pare...  
1  [girl, month, shes, meal, material, right, mea...  
2  [beef, bulgogi, homemade, banchan, god, man, j...  
3  [gimbap, son, new, school, husband, son, secon...  
4  [kimbap, ingredient, imitation, crab, egg, fis...  

Saved processed r/KoreanFood data to 'korean_food_subreddit_processed.csv'
