<a href="https://colab.research.google.com/github/saikirankesoju/NLP/blob/main/NLP-22-08-25.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:

import pandas as pd
import re
import spacy

# 1. Read the Kaggle CSV; it has to be uploaded to /content beforehand
df = pd.read_csv("/content/train.csv")   # point this at your file if it is named differently

print("Columns:", df.columns)
print("\nFirst 5 rows:\n", df.head())

# Discard every row whose "text" value is missing
has_text = df["text"].notna()
df = df[has_text]
print("\nDataset after dropping nulls:", df.shape)

# Use the text of the first 5 remaining rows as sample sentences
sample_texts = df["text"].iloc[:5].tolist()
print("\nSample sentences from text column:")
for t in sample_texts:
    print("-", t)

# ========================
# Task 2: POS Tagging with spaCy
# ========================

# Load the spaCy English model that tags each sample sentence
nlp = spacy.load("en_core_web_sm")

print("\nPOS Tagging results:\n")
for idx, sentence in enumerate(sample_texts, start=1):
    doc = nlp(sentence)

    # Sort token texts into noun / verb / adjective lists in a single pass
    nouns, verbs, adjs = [], [], []
    by_pos = {"NOUN": nouns, "VERB": verbs, "ADJ": adjs}
    for token in doc:
        target = by_pos.get(token.pos_)
        if target is not None:   # tokens with any other tag are ignored
            target.append(token.text)

    print(f"Sentence {idx}: {sentence}")
    print("Nouns:", nouns)
    print("Verbs:", verbs)
    print("Adjectives:", adjs)
    print("-" * 50)

# ========================
# Q2: Regex Cleaning Task
# ========================

# Hand-written sample inputs for clean_text(). Between them they contain a
# phone number, e-mail addresses, a URL, repeated punctuation, an emoticon,
# an apostrophe and hashtags.
# NOTE(review): the first sample looks like it may hold a real phone number
# and e-mail address -- confirm, and swap in placeholders if so.
texts = [
    "My phone number is 9059020516 and my email is saitejaginne@gmail.com",
    "Visit https://example.com for more info!!!",
    "HELLO!!! This is SOOOOO exciting :))",
    "Contact us at info@company.org or call +91 98765-43210",
    "Python's regex is very useful!!!  #Coding #Fun"
]

def clean_text(text):
    """Strip contact details, links and punctuation from ``text``.

    The steps run in this order:

    1. e-mail addresses are removed;
    2. URLs (anything starting with ``http`` or ``www.``) are removed;
    3. phone-like runs are removed: an optional ``+`` followed by at least
       10 characters of digits, dashes or whitespace that start and end
       with a digit;
    4. every character that is not an ASCII letter, digit or whitespace is
       replaced by a space;
    5. runs of whitespace are collapsed to one space and the ends trimmed.

    E-mails and URLs are removed *before* phone numbers on purpose. If the
    phone pattern ran first it would cut a long digit run out of an address
    or link (e.g. ``user1234567890@gmail.com``), and the leftover pieces
    would no longer match the e-mail/URL patterns and would leak into the
    cleaned text.

    Args:
        text: The raw string to clean.

    Returns:
        The cleaned string; it may be empty.
    """
    # Remove emails first, while each address is still one unbroken token
    text = re.sub(r'\S+@\S+\.\S+', ' ', text)

    # Remove URLs, also before any digit run inside them can be cut out
    text = re.sub(r'http\S+|www\.\S+', ' ', text)

    # Remove phone numbers (patterns like 10 digits, or +91 ...)
    text = re.sub(r'\+?\d[\d\-\s]{8,}\d', ' ', text)

    # Replace special characters (anything but letters, digits, whitespace)
    text = re.sub(r'[^A-Za-z0-9\s]', ' ', text)

    # Collapse the whitespace left behind by the substitutions above
    text = re.sub(r'\s+', ' ', text).strip()
    return text

print("\nText Cleaning Results:\n")
separator = "-" * 60   # divider printed after every original/cleaned pair
for t in texts:
    # Show each raw sample directly above its cleaned form
    print("Original:", t)
    print("Cleaned :", clean_text(t))
    print(separator)

Columns: Index(['id', 'keyword', 'location', 'text', 'target'], dtype='object')

First 5 rows:
    id keyword location                                               text  \
0   1     NaN      NaN  Our Deeds are the Reason of this #earthquake M...   
1   4     NaN      NaN             Forest fire near La Ronge Sask. Canada   
2   5     NaN      NaN  All residents asked to 'shelter in place' are ...   
3   6     NaN      NaN  13,000 people receive #wildfires evacuation or...   
4   7     NaN      NaN  Just got sent this photo from Ruby #Alaska as ...   

   target  
0       1  
1       1  
2       1  
3       1  
4       1  

Dataset after dropping nulls: (7613, 5)

Sample sentences from text column:
- Our Deeds are the Reason of this #earthquake May ALLAH Forgive us all
- Forest fire near La Ronge Sask. Canada
- All residents asked to 'shelter in place' are being notified by officers. No other evacuation or shelter in place orders are expected
- 13,000 people receive #wildfires evacuati