In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input tree and print the full path of every file found.
for root, _subdirs, names in os.walk('/kaggle/input'):
    full_paths = [os.path.join(root, name) for name in names]
    for path in full_paths:
        print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [1]:
import re
import csv

# Sample inputs
# Raw user messages used to demonstrate the normalization pipeline.
examples = list((
    "Hey, can u plz tell me where’s my order??",
    "i didn’t receive my parcel yet!!!!",
    "Whr’s my ordr 😡😡",
    "delivery late af... i want refund now",
))

# Lookup table: informal token -> expanded replacement text.
slang_dict = dict(
    u='you',
    plz='please',
    pls='please',
    whr='where',
    ordr='order',
    af='as f***',
    idk='i don’t know',
    btw='by the way',
    recvd='received',
    msg='message',
)

# Words and phrases whose presence marks a message as frustrated.
emotion_keywords = [
    'refund',
    'late',
    'angry',
    'mad',
    'not happy',
    'never arrived',
]

# Emoji substitution using simple mapping
def substitute_emojis(text):
    """Replace known emojis in ``text`` with space-padded placeholder tags.

    Each supported emoji is matched both with and without a trailing
    variation selector (U+FE0F), so the heart is tagged whether it arrives
    as "\u2764\ufe0f" (emoji presentation) or as the bare "\u2764".

    Args:
        text: The string to scan.

    Returns:
        ``text`` with every supported emoji replaced by ``" <tag> "``.
        Emojis that are not in the mapping are left untouched.
    """
    variation_selector = "\ufe0f"
    emoji_replacements = {
        "\U0001F621": "<angry_face>",  # 😡
        "\U0001F622": "<sad_face>",    # 😢
        "\U0001F602": "<laugh_face>",  # 😂
        "\u2764": "<heart>",           # ❤ (base code point, selector handled below)
    }
    for emo, tag in emoji_replacements.items():
        padded_tag = f" {tag} "
        # Replace the selector-qualified form first so no stray U+FE0F is
        # left behind, then the bare code point.
        text = text.replace(emo + variation_selector, padded_tag)
        text = text.replace(emo, padded_tag)
    return text

# Normalize elongated words like "sooo" to "soo"
def reduce_elongation(word):
    """Collapse runs of three or more identical letters down to two.

    Only letters are collapsed. Digits, underscores, whitespace and
    punctuation are left alone so that numeric tokens such as "10000"
    are not silently rewritten to "100".

    Args:
        word: The token (or any string) to normalize.

    Returns:
        The string with every run of 3+ identical letters shortened to
        exactly two, e.g. "sooo" -> "soo".
    """
    # [^\W\d_] == "a word character that is neither a digit nor an underscore",
    # i.e. a letter (Unicode-aware for str patterns).
    return re.sub(r'([^\W\d_])\1{2,}', r'\1\1', word)

# Replace repeated punctuation with tags
def handle_repeated_punctuation(text):
    """Swap runs of repeated '!' or '?' for space-padded marker tags.

    Two or more consecutive exclamation marks become ``" <emphasis> "`` and
    two or more consecutive question marks become ``" <confusion> "``.
    Single marks and alternating sequences such as "?!?!" are left as-is.
    """
    run_to_tag = (
        (r'!{2,}', ' <emphasis> '),
        (r'\?{2,}', ' <confusion> '),
    )
    for run_pattern, tag in run_to_tag:
        text = re.sub(run_pattern, tag, text)
    return text

# Full preprocessing pipeline
def normalize_text(text):
    """Run the full normalization pipeline on one user message.

    Steps, in order:
      1. Lowercase the text.
      2. Replace known emojis with tags (``substitute_emojis``).
      3. Replace repeated '!' / '?' with tags (``handle_repeated_punctuation``).
      4. Drop every character that is not a word character, whitespace,
         '<' or '>'.
      5. Per token: collapse letter elongation, then expand slang via
         ``slang_dict``.
      6. Append ``" <frustrated>"`` when any entry of ``emotion_keywords``
         occurs as a whole word or phrase.

    Args:
        text: The raw message.

    Returns:
        The normalized message with surrounding whitespace stripped.
    """
    text = text.lower()
    text = substitute_emojis(text)
    text = handle_repeated_punctuation(text)
    text = re.sub(r'[^\w\s<>]', '', text)  # Remove all except tags and words

    # Tokenize, collapse elongation, and expand slang.
    normalized_words = []
    for word in text.split():
        # Only purely alphabetic tokens are collapsed, so numbers and
        # <tags> pass through unchanged.
        reduced = reduce_elongation(word) if word.isalpha() else word
        # Fall back to the *reduced* form: previously the reduced form was
        # used only as the lookup key and then thrown away, so "sooo"
        # was never actually normalized to "soo".
        normalized_words.append(slang_dict.get(reduced, reduced))
    text = ' '.join(normalized_words)

    # Tag frustration only on whole-word / whole-phrase matches; a plain
    # substring test also fires on "made" (mad) or "later" (late).
    if any(
        re.search(r'\b' + re.escape(keyword) + r'\b', text)
        for keyword in emotion_keywords
    ):
        text += ' <frustrated>'

    return text.strip()

# Preprocess and store original vs cleaned versions
# Pair every raw example with its normalized form.
processed_data = list(zip(examples, map(normalize_text, examples)))

# Write the pairs to a CSV file: header row first, then one row per example.
csv_path = "processed_user_inputs.csv"
with open(csv_path, "w", newline='', encoding="utf-8") as csv_file:
    csv.writer(csv_file).writerows([["Original", "Processed"], *processed_data])

# Write the same pairs to a plain-text report, one numbered entry per example.
txt_path = "processed_user_inputs.txt"
with open(txt_path, "w", encoding="utf-8") as txt_file:
    for number, (raw, cleaned) in enumerate(processed_data, start=1):
        txt_file.writelines((
            f"Example {number}:\n",
            f"Original : {raw}\n",
            f"Processed: {cleaned}\n\n",
        ))

# Report which files were produced.
print("✅ Files saved:")
for saved_path in (csv_path, txt_path):
    print(f"- {saved_path}")


✅ Files saved:
- processed_user_inputs.csv
- processed_user_inputs.txt
