<a href="https://colab.research.google.com/github/srikanthpurimitla/Python-/blob/main/IPLNlp.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

<a href="https://colab.research.google.com/github/srikanthpurimitla/Python-/blob/main/NLP_usecase.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import re
import string
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.stem import PorterStemmer, WordNetLemmatizer

In [None]:
# Download required NLTK data.
# 'punkt_tab' is fetched in addition to 'punkt': NLTK 3.9 and later load the
# tokenizer models used by word_tokenize / sent_tokenize from 'punkt_tab',
# while older releases still read 'punkt'. Requesting both keeps the notebook
# runnable on a fresh kernel regardless of the installed NLTK version.
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\DELL\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\DELL\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\DELL\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [None]:
# Sample IPL news text. It deliberately mixes prose with digits, punctuation,
# a URL and an e-mail address so that the cleaning steps have input to act on.
# The value starts and ends with a newline.
text = (
    "\n"
    "The Indian Premier League (IPL) 2024 season is set to start in April.\n"
    "Chennai Super Kings will face Mumbai Indians in the opening match at Wankhede Stadium.\n"
    "Rohit scored a magnificent century last season. Stay tuned for live updates!\n"
    "Visit https://www.iplt20.com for more details. Contact us at info@ipl.com.\n"
)

In [None]:
# Build the shared NLP helpers once; the preprocessing function reads these
# module-level names rather than constructing them on every call.
stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()

# English stop words, held in a set because the list is probed once per token
# during stop-word removal.
stop_words = set(stopwords.words('english'))


In [None]:

def preprocess_text(text, verbose=True):
    """Clean raw English text and return its stemmed tokens as one string.

    The steps run in a fixed order: lowercase, strip URLs, strip e-mail
    addresses, strip punctuation, strip digits, strip any remaining
    non-letter characters, tokenize, drop stop words, then stem.

    Relies on the module-level ``stop_words``, ``stemmer`` and ``lemmatizer``
    objects and on NLTK's ``word_tokenize``.

    Args:
        text: Raw input string.
        verbose: When True (the default) the result of every step is printed,
            which is the walkthrough behaviour this function always had.
            Pass False to use it as a silent preprocessing utility.

    Returns:
        The stop-word-free, stemmed tokens joined by single spaces. The
        lemmatized tokens are only displayed (when ``verbose``) for
        comparison; they are not part of the return value.
    """

    def show(label, value):
        # Print one step of the walkthrough; does nothing when verbose is off.
        if verbose:
            print(label, value)

    show("Original Text:\n", text)

    # Convert to lowercase
    text = text.lower()
    show("\nLowercased Text:\n", text)

    # Remove URLs.
    # \S+ matches one or more NON-whitespace characters, so each alternative
    # consumes everything up to the next space or newline. `http\S+` already
    # covers https links, so no separate https alternative is needed.
    text = re.sub(r'http\S+|www\S+', '', text)
    show("\nText without URLs:\n", text)

    # Remove email addresses.
    # \S* matches zero or more non-whitespace characters on either side of
    # the '@'; \s? also removes one whitespace character that follows.
    text = re.sub(r'\S*@\S*\s?', '', text)
    show("\nText without Email Addresses:\n", text)

    # Remove punctuation.
    # string.punctuation includes the apostrophe and the hyphen, so the two
    # halves of words like "don't" or "well-known" are glued together here.
    text = text.translate(str.maketrans('', '', string.punctuation))
    show("\nText without Punctuation:\n", text)

    # Remove numbers: \d+ matches sequences of one or more digits.
    text = re.sub(r'\d+', '', text)
    show("\nText without Numbers:\n", text)

    # Remove special characters: [^A-Za-z\s] matches any character that is
    # not an ASCII letter or a whitespace character.
    text = re.sub(r'[^A-Za-z\s]', '', text)
    show("\nText without Special Characters:\n", text)

    # Tokenize into words
    tokens = word_tokenize(text)
    show("\nTokenized Text:\n", tokens)

    # Remove stop words
    tokens = [word for word in tokens if word not in stop_words]
    show("\nText without Stop Words:\n", tokens)

    # Stemming: these tokens make up the returned string.
    stemmed_tokens = [stemmer.stem(word) for word in tokens]
    show("\nStemmed Tokens:\n", stemmed_tokens)

    # Lemmatization is shown only for comparison with stemming, so it is
    # skipped entirely when nothing is going to be printed.
    if verbose:
        lemmatized_tokens = [lemmatizer.lemmatize(word) for word in tokens]
        print("\nLemmatized Tokens:\n", lemmatized_tokens)

    # Join tokens back to string
    processed_text = ' '.join(stemmed_tokens)
    show("\nProcessed Text:\n", processed_text)

    return processed_text

In [None]:
# Preprocess the sample text. The call prints each intermediate stage of the
# pipeline and returns the stemmed tokens joined into a single string.
processed_text = preprocess_text(text)

Original Text:
 
The Indian Premier League (IPL) 2024 season is set to start in April.
Chennai Super Kings will face Mumbai Indians in the opening match at Wankhede Stadium.
Rohit scored a magnificent century last season. Stay tuned for live updates!
Visit https://www.iplt20.com for more details. Contact us at info@ipl.com.


Lowercased Text:
 
the indian premier league (ipl) 2024 season is set to start in april.
chennai super kings will face mumbai indians in the opening match at wankhede stadium.
rohit scored a magnificent century last season. stay tuned for live updates!
visit https://www.iplt20.com for more details. contact us at info@ipl.com.


Text without URLs:
 
the indian premier league (ipl) 2024 season is set to start in april.
chennai super kings will face mumbai indians in the opening match at wankhede stadium.
rohit scored a magnificent century last season. stay tuned for live updates!
visit  for more details. contact us at info@ipl.com.


Text without Email Addresses:
 


In [None]:
#RegEX
import re
import nltk
from nltk.tokenize import sent_tokenize

# Download required NLTK data
# NOTE(review): the code visible in this cell does all extraction and sentence
# splitting with `re`; `sent_tokenize` is imported above but never called
# here, so this download looks unnecessary for this cell - confirm before
# removing it.
nltk.download('punkt')

# Sample IPL news text
text = """
The IPL 2024 season is set to begin with a match between Mumbai Indians and Chennai Super Kings.
Contact the organizers at +91 8096696726 for more details.
For media inquiries, reach out at media@example.com or visit our website at https://www.iplt20.com.
In other news, Delhi Capitals announced their new captain.
Follow the latest updates on our official site http://iplupdates.com or call us at 9573471012.
"""

# Define regex patterns

# Indian mobile number: ten digits starting with 6-9, optionally preceded by
# the country code (+91 or 91) and ONE separator (-, . or whitespace).
# The separator sits inside the optional country-code group, so a bare number
# no longer drags in the space before it. The (?<!\d) / (?!\d) guards stop the
# pattern from matching ten digits taken from the middle of a longer digit run.
phone_pattern = re.compile(r'(?<!\d)(?:(?:\+91|91)[-.\s]?)?[6789]\d{9}(?!\d)')

# E-mail address: local part, '@', domain, and a TLD of at least two letters.
email_pattern = re.compile(r'[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}')

# http/https URL: a run of non-whitespace characters that does not END in
# sentence punctuation. The trailing lookbehind makes the match back off, so
# the period that closes a sentence is not treated as part of the URL.
url_pattern = re.compile(r'https?://\S+(?<![.,;:!?])')

# Extract  phone numbers
phone_numbers = phone_pattern.findall(text)
print(" Phone Numbers:", phone_numbers)

# Extract email addresses
email_addresses = email_pattern.findall(text)
print("Email Addresses:", email_addresses)

# Extract URLs
urls = url_pattern.findall(text)
print("URLs:", urls)

# Mask Indian phone numbers
masked_text = phone_pattern.sub('[PHONE NUMBER]', text)

# Mask email addresses
masked_text = email_pattern.sub('[EMAIL ADDRESS]', masked_text)

# Mask URLs
masked_text = url_pattern.sub('[URL]', masked_text)

# Print masked text
print("\nMasked Text:\n", masked_text)

# Split text into sentences.
# The split point is whitespace that follows '.', '?' or '!', except when the
# two negative lookbehinds fire: (?<!\w\.\w.) skips dotted abbreviations such
# as "e.g." and (?<![A-Z][a-z]\.) skips titles such as "Mr.".
# The text is stripped first and empty pieces are dropped, so the result has
# no leading-newline sentence and no trailing empty string.
sentence_pattern = re.compile(r'(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=[.?!])\s+')
sentences = [part for part in sentence_pattern.split(text.strip()) if part]
print("\nSentences:", sentences)


 Phone Numbers: ['+91 8096696726', ' 9573471012']
Email Addresses: ['media@example.com']
URLs: ['https://www.iplt20.com.', 'http://iplupdates.com']

Masked Text:
 
The IPL 2024 season is set to begin with a match between Mumbai Indians and Chennai Super Kings.
Contact the organizers at [PHONE NUMBER] for more details.
For media inquiries, reach out at [EMAIL ADDRESS] or visit our website at [URL]
In other news, Delhi Capitals announced their new captain.
Follow the latest updates on our official site [URL] or call us at[PHONE NUMBER].


Sentences: ['\nThe IPL 2024 season is set to begin with a match between Mumbai Indians and Chennai Super Kings.', 'Contact the organizers at +91 8096696726 for more details.', 'For media inquiries, reach out at media@example.com or visit our website at https://www.iplt20.com.', 'In other news, Delhi Capitals announced their new captain.', 'Follow the latest updates on our official site http://iplupdates.com or call us at 9573471012.', '']


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\DELL\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
