<a href="https://colab.research.google.com/github/sheemapatel/nlp--/blob/main/22_8_25.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import spacy
import re
import numpy as np

# Toy corpus of paired texts. One "text_1" entry is NaN so that the
# null-handling steps below have something to detect and drop.
data = dict(
    text_1=[
        "Japan economy slides to recession. The Japanese economy has officially gone back into recession for the first time since 2001.",
        "Optimism remains over UK housing. The UK property market remains robust despite the recent slowdown.",
        "Lufthansa flies back to profit. German airline Lufthansa has returned to profit in 2004 after posting a loss in 2003.",
        "The quick brown fox jumps over the lazy dog.",
        "A study found that 10% of users never log in.",
        np.nan,
        "This is another example text for cleaning.",
    ],
    text_2=[
        "Japan economy slides to recession the japanese economy has officially gone back into recession for t",
        "The UK property market remains robust despite the recent slowdown.",
        "German airline Lufthansa has returned to profit in 2004 after posting",
        "The fast brown fox leaps over the lethargic canine.",
        "10% of users never log in, a study found.",
        "A paired text.",
        "More text.",
    ],
)
df = pd.DataFrame(data)

# Preview the raw first column before any rows are removed.
print("### First 5 entries of 'text_1' (Before Cleaning) ###")
print(df.loc[:, "text_1"].head(5).to_string(index=False))

# Per-column count of missing values.
print("\n### Checking for Null Values ###", df.isna().sum(), sep="\n")

# Drop every row that has a missing value in any column; `df` itself is left intact.
df_cleaned = df.dropna(axis=0, how="any")
print(f"\nDataFrame size before cleaning: {df.shape[0]}")
print(f"DataFrame size after cleaning: {df_cleaned.shape[0]}")

# First five surviving "text_1" sentences, used by the POS-tagging cell.
pos_sentences = df_cleaned["text_1"].iloc[:5].tolist()

### First 5 entries of 'text_1' (Before Cleaning) ###
Japan economy slides to recession. The Japanese...
Optimism remains over UK housing. The UK proper...
Lufthansa flies back to profit. German airline ...
      The quick brown fox jumps over the lazy dog.
     A study found that 10% of users never log in.

### Checking for Null Values ###
text_1    1
text_2    0
dtype: int64

DataFrame size before cleaning: 7
DataFrame size after cleaning: 6


In [None]:
# Load the small English pipeline, which provides the part-of-speech and
# dependency annotations read from each token below.
try:
    nlp = spacy.load("en_core_web_sm")
except OSError:
    # Model package not installed. Fall back to a tokenizer-only pipeline so the
    # cell still runs, but say so explicitly: a blank pipeline has no tagger or
    # parser, so the POS / dependency columns and the extracted sets will be empty.
    print(
        "WARNING: spaCy model 'en_core_web_sm' is not installed; falling back to "
        "spacy.blank('en'). POS and dependency labels will be empty. "
        "Install the model with: python -m spacy download en_core_web_sm"
    )
    nlp = spacy.blank("en")


# Unique surface forms collected across all sentences, grouped by word class.
all_nouns = set()
all_verbs = set()
all_adjectives = set()

# Coarse POS tag -> set that collects it. Proper nouns are counted as nouns.
pos_buckets = {
    "NOUN": all_nouns,
    "PROPN": all_nouns,
    "VERB": all_verbs,
    "ADJ": all_adjectives,
}
row_format = "{:<15} {:<10} {:<10}"

for i, sentence in enumerate(pos_sentences):
    doc = nlp(sentence)
    print(f"\n--- Sentence {i+1} ---: **{sentence}**")

    # One table row per token: text, coarse POS tag, dependency label.
    print(row_format.format("Token", "POS", "Dependency"))
    print("-" * 35)
    for token in doc:
        print(row_format.format(token.text, token.pos_, token.dep_))

        # Tokens with any other tag (or an empty tag) are not collected.
        bucket = pos_buckets.get(token.pos_)
        if bucket is not None:
            bucket.add(token.text)

print("\n### Extracted Parts of Speech ###")
print(f"**Nouns**: {sorted(all_nouns)}")
print(f"**Verbs**: {sorted(all_verbs)}")
print(f"**Adjectives**: {sorted(all_adjectives)}")


--- Sentence 1 ---: **Japan economy slides to recession. The Japanese economy has officially gone back into recession for the first time since 2001.**
Token           POS        Dependency
-----------------------------------
Japan           PROPN      compound  
economy         NOUN       compound  
slides          NOUN       ROOT      
to              ADP        prep      
recession       NOUN       pobj      
.               PUNCT      punct     
The             DET        det       
Japanese        ADJ        amod      
economy         NOUN       nsubj     
has             AUX        aux       
officially      ADV        advmod    
gone            VERB       ROOT      
back            ADV        advmod    
into            ADP        prep      
recession       NOUN       pobj      
for             ADP        prep      
the             DET        det       
first           ADJ        amod      
time            NOUN       pobj      
since           SCONJ      prep      
2001          

In [None]:
import re

# Sample sentences mixing phone numbers, e-mail addresses, URLs and noisy punctuation.
q2_sentences = [
    "My phone number is 1234567890 and my email is test@domain.com",
    "Visit https://example.com for more info!!!",
    "HELLO!!! This is SOOOOO exciting :))",
    "Contact us at info@company.org or call +91 98765-43210",
    "Python's regex is very useful!!! #Coding #Fun"
]

# The patterns do not depend on the loop variable, so compile them once.
#
# Phone: optional country code, then either a 3-3-4(+) digit grouping
# (1234567890, (123) 456-7890) or a 5-5 grouping (98765-43210).
# All groups are non-capturing so that findall() returns the full match;
# with a capturing group it returns only that group's text (often '').
phone_pattern = re.compile(
    r"(?:\+?\d{1,3}[-. ]?)?"
    r"(?:\(?\d{3}\)?[-. ]?\d{3}[-. ]?\d{4,}|\d{5}[-. ]?\d{5})"
)
# E-mail: the TLD class is [A-Za-z]. Inside [...] a "|" is a literal pipe,
# not alternation, so it must not appear there.
email_pattern = re.compile(r"\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b")
# URL: anything starting with http(s):// or www. up to the next whitespace.
url_pattern = re.compile(r"https?://\S+|www\.\S+")


for i, text in enumerate(q2_sentences):
    print(f"\n--- Text {i+1}: '{text}' ---")

    found_phones = phone_pattern.findall(text)
    print(f"**Found Phone Numbers**: {found_phones}")

    found_emails = email_pattern.findall(text)
    print(f"**Found Emails**: {found_emails}")

    found_urls = url_pattern.findall(text)
    print(f"**Found URLs**: {found_urls}")

    # Strip the three kinds of contact information from the text.
    cleaned_text = phone_pattern.sub("", text)
    cleaned_text = email_pattern.sub("", cleaned_text)
    cleaned_text = url_pattern.sub("", cleaned_text)

    # Drop remaining punctuation, then collapse the whitespace gaps that the
    # removals above leave behind (e.g. "is  and" -> "is and").
    final_cleaned_text = re.sub(r"[^\w\s]", "", cleaned_text)
    final_cleaned_text = re.sub(r"\s+", " ", final_cleaned_text)

    print(f"**Text After Removal**: {final_cleaned_text.strip()}")


--- Text 1: 'My phone number is 1234567890 and my email is test@domain.com' ---
**Found Phone Numbers**: ['']
**Found Emails**: ['test@domain.com']
**Found URLs**: []
**Text After Removal**: My phone number is  and my email is

--- Text 2: 'Visit https://example.com for more info!!!' ---
**Found Phone Numbers**: []
**Found Emails**: []
**Found URLs**: ['https://example.com']
**Text After Removal**: Visit  for more info

--- Text 3: 'HELLO!!! This is SOOOOO exciting :))' ---
**Found Phone Numbers**: []
**Found Emails**: []
**Found URLs**: []
**Text After Removal**: HELLO This is SOOOOO exciting

--- Text 4: 'Contact us at info@company.org or call +91 98765-43210' ---
**Found Phone Numbers**: []
**Found Emails**: ['info@company.org']
**Found URLs**: []
**Text After Removal**: Contact us at  or call 91 9876543210

--- Text 5: 'Python's regex is very useful!!! #Coding #Fun' ---
**Found Phone Numbers**: []
**Found Emails**: []
**Found URLs**: []
**Text After Removal**: Pythons regex is ver