In [None]:
import pandas as pd
import re
import glob

# Accumulators shared by every input file processed in the loop below
cleaned_dfs = []  # one cleaned pandas object is appended per input file; concatenated at the end
seen_conversations = set()  # first-10-word keys already kept, used to drop repeats across files


# Combined list of phrases indicating commercial intent.
# NOTE: matching is by plain, case-insensitive substring, so short entries such
# as "order" or "sell" also match inside longer words ("border", "counselling").
commercial_intent_phrases = [
    "buy", "purchase", "order", "checkout", "sell", "trade", "offer", "listing", "price",
    "show me", "looking for", "searching for", "recommend product", "need a service",
    "looking for service", "find service", "require service", "brand name", "store",
    "official store", "authorized retailer", "nearest store", "find store", "location of store",
    "shop nearby", "issue", "problem", "solution"
]


def has_commercial_intent(conversation):
    """Return True if the text contains any commercial-intent phrase.

    Parameters
    ----------
    conversation : str
        Text to inspect (the caller passes the content of the first message).
        Anything that is not a non-empty string (e.g. ``None``) is treated as
        having no commercial intent instead of raising.

    Returns
    -------
    bool
        True when at least one entry of ``commercial_intent_phrases`` occurs
        as a substring of the lower-cased text, otherwise False.
    """
    if not isinstance(conversation, str) or not conversation:
        return False
    # Lower-case once instead of once per phrase.
    lowered = conversation.lower()
    return any(phrase in lowered for phrase in commercial_intent_phrases)

# Words/phrases that mark a prompt as creative writing, role play or coding help.
# Conversations whose first message contains any of them are removed.
# Defined once, outside the per-file loop.
filter_words = [
    "story", "fiction", "hypothetical", "roleplay", "role play", "pokemon", "drama", "game",
    "seraphina", "natsuki", "kai", "sex", "personal account", "essay", "poem", "author",
    "compose", "once upon a time", "fanfic", "fan fic", "python", "c++", "assembly language",
    "sql", "html", "monika", "yuri", "sayori", "gnome", "luna", "undine", "wisp", "imagine",
    "backstory", "scenes", "dragon", "parody", "scene", "chapter", "drippler girl",
    "storywriting", "gabriel", "teleport", "[player]", "episode", "c program", "import java",
    "import pandas", "following code", "this code", "query select", "write a", "tale",
    "folktale", "folk", "proofread", "kingdom", "warlord"
]


def _bounded(word):
    r"""Return a regex for ``word`` that will not match inside a longer word.

    A look-around is added only on a side where ``word`` itself starts/ends
    with a word character. Wrapping every entry in ``\b...\b`` would make
    entries such as "c++" or "[player]" (which start or end with punctuation)
    nearly unmatchable, because ``\b`` next to punctuation demands an adjacent
    word character.
    """
    prefix = r'(?<!\w)' if re.fullmatch(r'\w', word[:1]) else ''
    suffix = r'(?!\w)' if re.fullmatch(r'\w', word[-1:]) else ''
    return prefix + re.escape(word) + suffix


# Single case-insensitive pattern covering every filter word, compiled once.
_FILTER_PATTERN = re.compile('|'.join(_bounded(w) for w in filter_words), re.IGNORECASE)


def _first_turn(conversation):
    """Return the first turn (a dict) of a conversation, or {} when there is none."""
    if conversation is None or len(conversation) == 0:
        return {}
    return conversation[0]


def _first_content(conversation):
    """Return the 'content' text of the first turn, or '' when missing or not a string."""
    content = _first_turn(conversation).get('content', '')
    return content if isinstance(content, str) else ''


def get_first_10_words(conversation):
    """Return the first 10 whitespace-separated words of the first turn's content.

    If the text contains the literal marker 'content:', only the part after its
    first occurrence is considered.
    """
    content = _first_content(conversation)
    start_index = content.find('content:')
    if start_index != -1:
        content = content[start_index + len('content:'):].strip()
    return ' '.join(content.split()[:10])


def filter_duplicate_conversations(conversation):
    """Return True the first time a 10-word opening is seen, False on repeats.

    Side effect: records the opening in the module-level ``seen_conversations``
    set, so duplicates are detected across files as well as within one file.
    """
    first_10_words = get_first_10_words(conversation)
    if first_10_words in seen_conversations:
        return False
    seen_conversations.add(first_10_words)
    return True


def contains_filter_words(conversation):
    """Return True if the text contains any entry of ``filter_words`` (case-insensitive)."""
    return _FILTER_PATTERN.search(conversation) is not None


# Load files and filter by language, country and commercial intent.
# sorted(): glob returns files in arbitrary order; a fixed order makes the
# "first occurrence wins" de-duplication reproducible between runs.
for data_file in sorted(glob.glob('Data/*')):
    df = pd.read_parquet(data_file)
    df_convo = df[['conversation', 'country']]

    # .astype(bool): .apply on an empty Series yields object dtype, which pandas
    # would not treat as a boolean mask.
    is_english = df_convo['conversation'].apply(
        lambda x: _first_turn(x).get('language') == 'English'
    ).astype(bool)
    df_convo = df_convo[is_english]
    df_convo = df_convo[df_convo['country'] == 'United States']
    shows_intent = df_convo['conversation'].apply(
        lambda x: has_commercial_intent(_first_content(x))
    ).astype(bool)
    df_convo = df_convo[shows_intent]

    # Drop conversations whose first 10 words were already seen (in any file so far)
    is_new = df_convo['conversation'].apply(filter_duplicate_conversations).astype(bool)
    df_convo = df_convo[is_new]

    # Keep only the text of the first message. Built as a new Series rather than
    # assigned back into the filtered frame, which avoids SettingWithCopyWarning.
    cleaned_df = df_convo['conversation'].apply(_first_content)

    # Drop exact duplicates (if any remain). Done on the extracted strings:
    # the raw conversation cells are lists/arrays, which pandas cannot hash.
    cleaned_df = cleaned_df.drop_duplicates()

    # Remove creative-writing / coding prompts once per file, before appending,
    # instead of re-filtering every accumulated Series on each iteration.
    is_unwanted = cleaned_df.apply(contains_filter_words).astype(bool)
    cleaned_dfs.append(cleaned_df[~is_unwanted])

english_convo = pd.concat(cleaned_dfs)
print(english_convo)

In [None]:
# Save the output to a CSV file (index=False leaves the row index out of the file)
english_convo.to_csv('CleanWildChat.csv', index=False)