In [2]:
!pip install emoji



# Noise removal for textual data and remove regular expression

In [None]:
import re

def remove_noise(text):
    text = re.sub(r'#\w+', '', text)  # Remove hashtags
    text = re.sub(r'http\S+|www\S+', '', text)  # Remove URLs
    text = re.sub(r'[^\w\s]', '', text)  # Remove special characters
    return text.strip()

sentence = "This is a sample tweet with #hashtag and a link http://example.com!"
cleaned_text = remove_noise(sentence)

print("Original Sentence:", sentence)
print("Cleaned Sentence:", cleaned_text)

Original Sentence: This is a sample tweet with #hashtag and a link http://example.com!
Cleaned Sentence: This is a sample tweet with  and a link


# Remove emojis and emoticons from text

In [5]:
import re
import emoji

def remove_noise(text):
    # Remove hashtags
    text = re.sub(r'#\w+', '', text)
    
    # Remove URLs
    text = re.sub(r'http\S+|www\S+', '', text)
    
    # Remove special characters
    text = re.sub(r'[^\w\s]', '', text)
    
    # Remove emojis
    text = emoji.replace_emoji(text, replace='')  
    
    # Remove common emoticons like :) or :-(
    text = re.sub(r'[:;=8][\-o*\']?[)D(\[]', '', text) 
    
    return text.strip()

sentence = "This is a sample tweet with #hashtag, a link http://example.com, and an emoji 😊, along with an emoticon :D."
cleaned_text = remove_noise(sentence)

print("Original Sentence:", sentence)
print("Cleaned Sentence:", cleaned_text)

Original Sentence: This is a sample tweet with #hashtag, a link http://example.com, and an emoji 😊, along with an emoticon :D.
Cleaned Sentence: This is a sample tweet with  a link  and an emoji  along with an emoticon D


# Normalize text by removing whitespaces

In [12]:
import re

def normalize_text(text):
    text = text.lower()  # Convert to lowercase
    text = re.sub(r'\s+', ' ', text).strip()  # Remove extra whitespace
    return text

# Example usage
sentence = "  This   is   a   sample   Text  with  EXTRA   spaces.  "
normalized_text = normalize_text(sentence)

# Displaying output
print("Original Sentence:", sentence)
print("Normalized Sentence:", normalized_text)

Original Sentence:   This   is   a   sample   Text  with  EXTRA   spaces.  
Normalized Sentence: this is a sample text with extra spaces.


# Extract all dates in various formats

In [6]:
import re

def extract_dates(text):
    date_patterns = [
        r'\b\d{1,2}/\d{1,2}/\d{4}\b',  # DD/MM/YYYY
        r'\b\d{1,2}-\d{1,2}-\d{4}\b',  # MM-DD-YYYY
        r'\b(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec) \d{1,2}, \d{4}\b'  # Month Day, Year
    ]
    dates = []
    for pattern in date_patterns:
        dates.extend(re.findall(pattern, text))
    return dates

# Example usage
sentence = "The event is on 12/05/2023, another meeting is scheduled for Dec 25, 2024, and 03-15-2022 was a holiday."
dates = extract_dates(sentence)

# Displaying output
print("Original Sentence:", sentence)
print("Extracted Dates:", dates)

Original Sentence: The event is on 12/05/2023, another meeting is scheduled for Dec 25, 2024, and 03-15-2022 was a holiday.
Extracted Dates: ['12/05/2023', '03-15-2022', 'Dec 25, 2024']


# Extract phone numbers in various formats and standardize to a certain format

In [7]:
import re

def extract_phone_numbers(text):
    phone_pattern = r'\b(?:\+\d{1,2}\s?)?(?:\(\d{3}\)|\d{3})[-.\s]?\d{3}[-.\s]?\d{4}\b'
    phone_numbers = re.findall(phone_pattern, text)
    standardized_numbers = [re.sub(r'[-.\s]', '-', num) for num in phone_numbers]
    return standardized_numbers

# Example usage
sentence = "Contact me at (123) 456-7890 or +1 987-654-3210. Call 415.555.2678 for support."
phone_numbers = extract_phone_numbers(sentence)

# Displaying output
print("Original Sentence:", sentence)
print("Extracted and Standardized Phone Numbers:", phone_numbers)


Original Sentence: Contact me at (123) 456-7890 or +1 987-654-3210. Call 415.555.2678 for support.
Extracted and Standardized Phone Numbers: ['987-654-3210', '415-555-2678']
