In [3]:
import pandas as pd
import random
import numpy as np

problem_labels = [
    "Weak battery", "Worn brake pads", "Unbalanced wheels", "Loose wheel alignment",
    "Dirty throttle body", "Faulty spark plug", "Fuel pump weak",
    "AC gas leak", "Oil leak"
]

# Base complaint phrases (shared globally, causing overlap)
base_sentences = [
    "Car vibrates while driving",
    "Car takes long time to start",
    "Engine makes noise randomly",
    "Steering shakes at times",
    "Burning smell inside cabin",
    "Car hesitates when accelerating",
    "Dashboard lights keep flickering",
    "Car becomes unstable on highway",
    "Car stops suddenly",
    "Strange noise coming from engine area"
]

# Unrelated sentences (forces heavy confusion)
random_noise_sentences = [
    "I went to the market and my phone died",
    "Weather is very hot today",
    "My laptop is running slow",
    "I like to eat biryani",
    "The street dog is barking loudly",
    "My friend plays football everyday",
    "Bus was late today",
    "I bought a new mobile phone",
    "Sometimes I feel sleepy while working"
]

slang_sentences = [
    "car bro acting weird da",
    "engine oda sound mass ah varuthu",
    "bhai gadi chalate waqt hil raha hai",
    "bro car full shaking machan",
    "lol car acting sus",
    "idk car just stops da",
    "engine bhohot ajeeb sound de raha"
]

mixed_language_sentences = [
    "car start nahi ho raha properly",
    "engine la konjam smell varudhu",
    "gadi ekdam slow pick-up de raha",
    "car me kuch burning jaisa lag raha",
    "acceleration time la car jerkk agudhu"
]

words_to_add_noise = ["maybe", "idk", "pls", "randomly", "sometimes", "weird", "??", "lol", "??"]

def add_strong_noise(text):
    # Add noise words
    if random.random() < 0.40:
        text = random.choice(words_to_add_noise) + " " + text

    # jumbled words
    if random.random() < 0.40:
        words = text.split()
        random.shuffle(words)
        text = " ".join(words)

    # heavy typos
    if random.random() < 0.30:
        chars = list(text)
        for _ in range(8):
            pos = random.randint(0, len(chars) - 1)
            chars[pos] = random.choice("abcdefghijklmnopqrstuvwxyz")
        text = "".join(chars)

    return text

rows = []

for _ in range(10000):

    rand_choice = random.random()

    # 30% unrelated sentences
    if rand_choice < 0.30:
        complaint = random.choice(random_noise_sentences)

    # 20% slang
    elif rand_choice < 0.50:
        complaint = random.choice(slang_sentences)

    # 15% mixed languages
    elif rand_choice < 0.65:
        complaint = random.choice(mixed_language_sentences)

    # 25% normal base complaints
    else:
        complaint = random.choice(base_sentences)

    # 25% multi-issue (makes accuracy worse)
    if random.random() < 0.25:
        complaint += " and also " + random.choice(base_sentences)

    # Apply strong noise
    complaint = add_strong_noise(complaint)

    # 45% WRONG LABEL purposely
    if random.random() < 0.45:
        label = random.choice(problem_labels)  # wrong random
    else:
        label = random.choice(problem_labels)  # still random to avoid clear patterns

    rows.append([complaint, label])

df = pd.DataFrame(rows, columns=["Complaint", "Problem"])
df.to_csv("car_complaints_very_noise_10000.csv", index=False)
df.to_excel("car_complaints_very_noise_10000.xlsx", index=False)

print("Severely noisy dataset generated! Accuracy expected 40–60%.")
print(df.sample(10))


Severely noisy dataset generated! Accuracy expected 40–60%.
                                              Complaint            Problem
8337  flocyering Dashbbarc alao stops keep suddenly ...    Worn brake pads
6086                            car bro acting weird da           Oil leak
3357                     idk Burning smell inside cabin  Unbalanced wheels
4849  randomly I like tooeat biryani anddalsouCae he...       Weak battery
9599                       lol Steering shakes at times  Faulty spark plug
7435              noise area coming Strange from engine           Oil leak
1004                 ?? unojwpick-up gtdi ekdaj bm raha  Unbalanced wheels
7416  sometimes acceleration time la car jerkk agudh...     Fuel pump weak
8575                              ?? lol car acting sus       Weak battery
4583            engine varudhu randomly la smell konjam        AC gas leak
