# In [None]:
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

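# Download the necessary NLTK data once before the first run (uncomment and execute).
# 'stopwords' and 'wordnet' back the stopword list and the WordNet lemmatizer used below;
# 'punkt' is only needed for NLTK's own tokenizers (this file tokenizes with str.split).
# nltk.download('punkt')
# nltk.download('stopwords')
# nltk.download('wordnet')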

def simulate_data(num_records=1000, seed=42):
    """
    Simulates a customer feedback dataset with a mix of positive, negative, and neutral reviews.

    Records are generated in index order, not shuffled: the first 40% are
    'Positive', the next 30% 'Negative' and the remainder 'Neutral'. Each text
    is "The <phrase><noise>." lower-cased and stripped, where the noise token
    may be punctuation, a mention ('@user ') or a hashtag ('#tag ').

    Because the phrases and noise tokens come from small fixed lists, at most
    19 phrases x 6 noise tokens = 114 distinct texts can occur, so large
    datasets contain many naturally repeated texts in addition to the
    explicitly injected duplicates.

    Args:
        num_records: Number of base records to generate. For every index that
            is a positive multiple of 100, one extra exact copy of the previous
            record is inserted, so the returned frame can have more rows than
            ``num_records``.
        seed: Value passed to ``np.random.seed`` before generation. The default
            (42) reproduces the historical output. Passing ``None`` lets NumPy
            reseed itself, giving a non-reproducible dataset.

    Returns:
        pandas.DataFrame with columns ``feedback_id`` (0..n-1),
        ``feedback_text`` (about 1% of rows set to NaN) and ``sentiment``.

    Note:
        This seeds NumPy's *global* random state, which also affects later
        ``np.random`` calls made by the caller.
    """
    # The global RNG is seeded (rather than a private generator) because the
    # NaN injection below calls DataFrame.sample without a random_state, which
    # draws from the global np.random state as well.
    np.random.seed(seed)

    # Simple list of words/phrases for simulation
    positive_terms = ['great service', 'excellent product', 'highly recommend', 'fast delivery', 'love it', 'very helpful', 'amazing quality']
    negative_terms = ['slow response', 'broken item', 'terrible experience', 'unacceptable quality', 'never again', 'frustrated with delay', 'poor support']
    neutral_terms = ['no change noticed', 'it works as expected', 'standard delivery time', 'average performance', 'received the package']

    feedbacks = []
    sentiments = []

    for i in range(num_records):
        # Determine sentiment and associated phrase
        if i < num_records * 0.4:  # 40% Positive
            sentiment = 'Positive'
            term = np.random.choice(positive_terms)
        elif i < num_records * 0.7:  # 30% Negative
            sentiment = 'Negative'
            term = np.random.choice(negative_terms)
        else:  # 30% Neutral
            sentiment = 'Neutral'
            term = np.random.choice(neutral_terms)

        # Add noise and complexity (probabilities line up with the token list)
        noise = np.random.choice(['!', '??', '... ', ' ', '@user ', '#tag '], p=[0.1, 0.1, 0.2, 0.5, 0.05, 0.05])
        text = f"The {term}{noise}."

        # Introduce a few duplicates: before appending record i, repeat the
        # previous record once. i > 0 guarantees a previous record exists.
        if i % 100 == 0 and i > 0:
            feedbacks.append(feedbacks[-1])  # Add exact duplicate
            sentiments.append(sentiments[-1])

        feedbacks.append(text.lower().strip())
        sentiments.append(sentiment)

    df = pd.DataFrame({'feedback_id': range(len(feedbacks)), 'feedback_text': feedbacks, 'sentiment': sentiments})
    # Add some missing values for robust cleaning (1% of rows, chosen at random)
    df.loc[df.sample(frac=0.01).index, 'feedback_text'] = np.nan

    return df

def clean_and_preprocess(df):
    """
    Performs data cleaning and preprocessing steps.

    Steps, in order:
      1. Drop rows whose ``feedback_text`` is missing.
      2. Drop rows whose raw ``feedback_text`` repeats an earlier row (first kept).
      3. Build ``cleaned_text``: URLs, @mentions, #hashtags and every character
         that is not an ASCII letter or whitespace are removed, then the text
         is lower-cased and stripped.
      4. Build ``processed_text``: ``cleaned_text`` split on whitespace, English
         stopwords removed, remaining tokens lemmatized and re-joined with
         single spaces.

    Duplicates are detected on the raw text *before* cleaning, so rows that
    differ only in removed characters stay as separate rows with identical
    ``cleaned_text``.

    The input frame is not modified; all work happens on a new DataFrame.

    Args:
        df: DataFrame with a ``feedback_text`` column. Other columns are
            carried through unchanged.

    Returns:
        A new DataFrame with the surviving rows plus the ``cleaned_text`` and
        ``processed_text`` columns.

    Note:
        Needs the NLTK 'stopwords' and 'wordnet' data to be installed (see the
        download commands near the imports).
    """
    print(f"Initial shape: {df.shape}")

    # 1. Handle missing data (Dropping rows with missing feedback text).
    # Rebinding instead of inplace=True keeps the caller's frame untouched.
    df = df.dropna(subset=['feedback_text'])
    print(f"Shape after handling missing values: {df.shape}")

    # 2. Remove duplicates
    df = df.drop_duplicates(subset=['feedback_text'], keep='first')
    print(f"Shape after removing duplicates: {df.shape}")

    # Explicit copy so the column assignments below write to a frame we own
    # (no SettingWithCopyWarning, no aliasing with the input).
    df = df.copy()

    # 3. Clean Text (Remove special characters, tags, links)
    url_pattern = re.compile(r'http\S+|www\S+|https\S+', flags=re.MULTILINE)
    mention_hashtag_pattern = re.compile(r'@\w+|#\w+')
    non_letter_pattern = re.compile(r'[^a-zA-Z\s]')

    def clean_text(text):
        # Remove URLs
        text = url_pattern.sub('', text)
        # Remove mentions and hashtags
        text = mention_hashtag_pattern.sub('', text)
        # Remove special characters and numbers (keeping only letters and spaces)
        text = non_letter_pattern.sub('', text)
        # Normalize case/edges here too so the function does not depend on
        # the input having been lower-cased upstream.
        return text.lower().strip()

    df['cleaned_text'] = df['feedback_text'].apply(clean_text)

    # 4. Tokenization, Stopword Removal, and Lemmatization
    stop_words = set(stopwords.words('english'))  # set -> O(1) membership tests
    lemmatizer = WordNetLemmatizer()

    def preprocess_text(text):
        # Tokenization (split by whitespace after cleaning), stopword removal
        # and lemmatization in a single pass over the tokens.
        return ' '.join(
            lemmatizer.lemmatize(word)
            for word in text.split()
            if word not in stop_words
        )

    # Note: For BERT/Transformer models (Part 2), you often skip stopword removal/lemmatization
    # and rely on the model's tokenizer. We'll keep this step here for traditional models,
    # but for a BERT-only approach, we'd typically use 'cleaned_text' directly.
    df['processed_text'] = df['cleaned_text'].apply(preprocess_text)

    return df

# --- Main Execution ---
# Script entry point: generate the synthetic feedback, run the cleaning
# pipeline, print previews and write the result to CSV.
if __name__ == '__main__':
    # Stage 1: build the simulated dataset and show its first rows.
    feedback_data = simulate_data(num_records=1100)
    print("--- Initial Data Sample ---")
    initial_preview = feedback_data.head()
    print(initial_preview)

    # Stage 2: run the cleaning / preprocessing pipeline.
    cleaned_data = clean_and_preprocess(feedback_data)

    # Show five random rows with the raw text next to both derived columns.
    print("\n--- Cleaned and Processed Data Sample ---")
    preview_columns = ['feedback_text', 'cleaned_text', 'processed_text', 'sentiment']
    cleaned_preview = cleaned_data[preview_columns].sample(n=5)
    print(cleaned_preview)

    # Deliverable: the cleaned dataset on disk.
    # 'cleaned_text' is the column typically used for BERT training, while
    # 'processed_text' suits LSTM/traditional models.
    cleaned_data.to_csv('cleaned_customer_feedback.csv', index=False)
    print("\nCleaned dataset saved to 'cleaned_customer_feedback.csv'.")

    # The 'cleaned_customer_feedback.csv' file is your deliverable.
