In [4]:
import warnings
warnings.filterwarnings('ignore')

# Dataset Creation

In [5]:
import pandas as pd
import random

# Number of samples
n_samples = 1000

# Generate random ticket IDs
ticket_ids = range(10001, 10001 + n_samples)

# Dummy customer texts (just for illustration, in a real-world scenario these will be more diverse and meaningful)
customer_texts = [
    "I can't log into my account.",
    "The application is crashing frequently.",
    "How do I reset my password?",
    "The system is too slow.",
    "Data is not syncing across devices.",
    "I received a wrong bill.",
    "The UI is not user-friendly.",
    "My payment got declined.",
    "I am facing issues with the new update.",
    "How do I backup my data?"
]

# Randomly assign customer texts to ticket IDs
texts = [random.choice(customer_texts) for _ in range(n_samples)]

# Severity levels
severities = ["low", "medium", "high", "critical"]

# Randomly assign severity levels to ticket IDs
severity_labels = [random.choice(severities) for _ in range(n_samples)]

# Create a DataFrame
df = pd.DataFrame({
    "ticket_id": ticket_ids,
    "customer_text": texts,
    "severity": severity_labels
})

df.head()


Unnamed: 0,ticket_id,customer_text,severity
0,10001,My payment got declined.,low
1,10002,My payment got declined.,critical
2,10003,The system is too slow.,critical
3,10004,How do I backup my data?,critical
4,10005,My payment got declined.,high


# Text cleaning or Text Pre-processing

In [6]:
import re
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from langdetect import detect

# Download necessary resources for NLTK
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

def preprocess_text(text):
    # 2. Convert to lowercase
    text = text.lower()
    
    # 7. Remove HTML tags
    clean = re.compile('<.*?>')
    text = re.sub(clean, '', text)
    
    # 9. Remove URLs
    text = re.sub(r'http\S+', '', text)
    
    # 8. Remove special characters (excluding space)
    text = re.sub(r'[^a-z\s]', '', text)
    
    # 10. Remove numbers
    text = re.sub(r'\d+', '', text)
    
    # 3. Tokenize
    tokens = word_tokenize(text)
    
    # 1. Stopword removal
    stop_words = set(stopwords.words('english'))
    tokens = [word for word in tokens if word not in stop_words]
    
    # 4. Stemming
    stemmer = PorterStemmer()
    tokens = [stemmer.stem(word) for word in tokens]
    
    # 5. Lemmatizing
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(word) for word in tokens]
    
    # 11. Remove extra whitespaces
    text = " ".join(tokens)
    
    # 12. Sentence segmentation
    sentences = sent_tokenize(text)
    
    # 13. Language detection (just an example, you might need a more robust solution)
    lang = detect(text)
    
    # 14. Code switching detection (not implementing as it's quite involved and requires additional tools)
    # 15. N-gram generation (example for bigrams; you can adjust for other n-grams)
    bigrams = list(nltk.bigrams(tokens))
    
    # 16. Noise reduction (not implementing as it's specific to the dataset and might need manual rules)
    
    # Rejoining cleaned tokens to form the cleaned text
    cleaned_text = ' '.join(tokens)
    
    return cleaned_text, lang, bigrams

# Example:
text = "Your sample text with <html> tags </html> and https://example.com URLs."
cleaned_text, detected_lang, generated_bigrams = preprocess_text(text)

print(f"Cleaned Text: {cleaned_text}")
print(f"Detected Language: {detected_lang}")
print(f"Generated Bigrams: {generated_bigrams}")


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Administrator.DESKTOP-
[nltk_data]     QSF3VEN\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Administrator.DESKTOP-
[nltk_data]     QSF3VEN\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Administrator.DESKTOP-
[nltk_data]     QSF3VEN\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Cleaned Text: sampl text tag url
Detected Language: sv
Generated Bigrams: [('sampl', 'text'), ('text', 'tag'), ('tag', 'url')]


In [7]:
# Assuming the function preprocess_text is already defined as above and df is our dataframe

# Applying the function to our dataframe's "complaints" column
df['customer_text'] = df['customer_text'].apply(lambda x: preprocess_text(x)[0])

# If you're interested in capturing detected language and bigrams as well:
df['detected_lang'] = df['customer_text'].apply(lambda x: preprocess_text(x)[1])
df['bigrams'] = df['customer_text'].apply(lambda x: preprocess_text(x)[2])

df.head()

Unnamed: 0,ticket_id,customer_text,severity,detected_lang,bigrams
0,10001,payment got declin,low,fr,"[(payment, got), (got, declin)]"
1,10002,payment got declin,critical,fr,"[(payment, got), (got, declin)]"
2,10003,system slow,critical,cs,"[(system, slow)]"
3,10004,backup data,critical,id,"[(backup, data)]"
4,10005,payment got declin,high,fr,"[(payment, got), (got, declin)]"


# EDA for Supervised Text Classification