In [1]:
!pip install nltk
import pandas as pd
import nltk
from nltk.tokenize import word_tokenize
from nltk import pos_tag
# Fetch the NLTK data packages requested by this notebook, in a fixed order.
# NOTE(review): presumably the "punkt_tab" / "..._eng" names are listed next
# to the plain ones to cover different NLTK releases -- confirm.
for _resource in (
    "punkt",
    "punkt_tab",
    "averaged_perceptron_tagger",
    "averaged_perceptron_tagger_eng",
):
    nltk.download(_resource)

#Imports raw pattern data, concatenates and applies POS tagging to the relevant fields, and exports the output to patterns_pos.csv
# POS tags treated as keywords: noun tags (NN*) and adjective tags (JJ*).
# A frozenset gives O(1) membership tests and is built only once.
_KEYWORD_TAGS = frozenset({"NN", "NNS", "NNP", "NNPS", "JJ", "JJR", "JJS"})

# Columns merged into the single text field, in concatenation order.
_TEXT_COLUMNS = ("description", "craft", "category", "yarn_weight", "tags")


def _pos_tag_text(text):
    """Tokenize *text* and return the list of (token, tag) pairs from pos_tag."""
    return pos_tag(word_tokenize(text))


def _extract_keywords(pos_tags):
    """Return the tokens tagged as nouns or adjectives, joined by spaces."""
    return " ".join(word for word, tag in pos_tags if tag in _KEYWORD_TAGS)


def preprocess_data(input_path="patterns_raw.csv", output_path="patterns_pos.csv"):
    """Build POS-tagged keyword columns for the pattern data and save them.

    Reads the raw CSV, merges the description and metadata columns into one
    lower-cased text field, strips punctuation, POS-tags the result, keeps
    only noun/adjective tokens, and writes the augmented table to a new CSV.

    The output contains every input column plus four added ones:
    ``combined_text``, ``cleaned_text``, ``pos_tags`` and
    ``filtered_keywords``.

    Args:
        input_path: CSV file to read. It must contain the columns
            ``description``, ``craft``, ``category``, ``yarn_weight`` and
            ``tags``. Defaults to ``"patterns_raw.csv"``.
        output_path: CSV file to write (without the index column).
            Defaults to ``"patterns_pos.csv"``.

    Returns:
        None. The result is only written to ``output_path``.

    Raises:
        KeyError: If any required column is missing from the input file.
    """
    # Load the dataset; missing values become "" so they do not turn into
    # the literal text "nan" when the columns are cast to str below.
    df = pd.read_csv(input_path).fillna("")

    # Fail early with one message naming every missing column, instead of
    # a bare KeyError on whichever column happens to be accessed first.
    missing = [col for col in _TEXT_COLUMNS if col not in df.columns]
    if missing:
        raise KeyError(f"input CSV is missing required column(s): {missing}")

    # Concatenate the description with the structured metadata, separated
    # by single spaces, each part lower-cased.
    parts = [df[col].astype(str).str.lower() for col in _TEXT_COLUMNS]
    combined = parts[0]
    for part in parts[1:]:
        combined = combined + " " + part
    df["combined_text"] = combined

    # Remove punctuation (anything that is neither a word character nor
    # whitespace).
    # NOTE(review): characters are deleted, not replaced by a space, so
    # "top-down" becomes "topdown" and "a,b" becomes "ab" -- confirm that
    # fusing such tokens is intended.
    df["cleaned_text"] = df["combined_text"].str.replace(
        r"[^\w\s]", "", regex=True
    )

    # Tokenize and POS-tag the cleaned text, one row at a time.
    df["pos_tags"] = df["cleaned_text"].apply(_pos_tag_text)

    # Keep only nouns and adjectives as the keyword string for each row.
    df["filtered_keywords"] = df["pos_tags"].apply(_extract_keywords)

    # Save the processed dataset.
    df.to_csv(output_path, index=False)


Preprocessing complete. Data saved to patterns_pos.csv.
