In [1]:
!pip install scikit-learn
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
import pickle

#Performs TF-IDF processing on preprocessed pattern data
def process_data(
    *,
    input_csv: str = "patterns_pos.csv",
    output_csv: str = "patterns_tfidf.csv",
    vectorizer_path: str = "tfidf_vectorizer.pkl",
    matrix_path: str = "tfidf_matrix.pkl",
    text_column: str = "filtered_keywords",
    max_features: int = 500,
) -> None:
    """Fit a TF-IDF model on a keyword column and persist the results.

    Reads ``input_csv``, vectorizes ``text_column`` with unigrams and
    bigrams (English stop words removed, vocabulary capped at
    ``max_features`` terms), then writes three artifacts:

    * ``output_csv`` - the dense TF-IDF matrix, one column per term.
    * ``vectorizer_path`` - the fitted ``TfidfVectorizer`` (pickle).
    * ``matrix_path`` - the sparse TF-IDF matrix as returned by
      ``fit_transform`` (pickle).

    Every parameter defaults to the value that used to be hard-coded, so
    calling ``process_data()`` with no arguments behaves as before.

    Args:
        input_csv: Path of the preprocessed CSV to read.
        output_csv: Path of the CSV that receives the dense TF-IDF table.
        vectorizer_path: Path of the pickle holding the fitted vectorizer.
        matrix_path: Path of the pickle holding the sparse matrix.
        text_column: Name of the column that contains the keyword text.
        max_features: Upper bound on the vocabulary size.

    Raises:
        FileNotFoundError: If ``input_csv`` does not exist.
        KeyError: If ``text_column`` is not a column of the CSV.
    """
    # Load preprocessed data
    df = pd.read_csv(input_csv)

    # Missing cells become empty documents; astype(str) guards against
    # cells that pandas parsed as numbers, which the vectorizer's default
    # lowercasing step cannot handle.
    documents = df[text_column].fillna("").astype(str)

    # Initialize TF-IDF Vectorizer
    tfidf_vectorizer = TfidfVectorizer(
        max_features=max_features,
        stop_words="english",
        ngram_range=(1, 2),
    )

    # Learn the vocabulary and produce the (sparse) document-term matrix
    tfidf_matrix = tfidf_vectorizer.fit_transform(documents)

    # Densify so each vocabulary term becomes a named DataFrame column
    tfidf_df = pd.DataFrame(
        tfidf_matrix.toarray(),
        columns=tfidf_vectorizer.get_feature_names_out(),
    )

    # Save DF data to new CSV file
    tfidf_df.to_csv(output_csv, index=False)

    # Save the vectorizer and matrix to files
    with open(vectorizer_path, "wb") as f:
        pickle.dump(tfidf_vectorizer, f)

    with open(matrix_path, "wb") as f:
        pickle.dump(tfidf_matrix, f)


TF-IDF transformation of preprocessed data completed and data saved to patterns_tfidf.csv
TF-IDF vectorizer saved as tfidf_vectorizer.pkl
TF-IDF matrix saved as tfidf_matrix.pkl
