In [2]:

import nltk
import pandas as pd
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# Download required NLTK resources.
# 'stopwords' backs stopwords.words('english'); 'punkt' / 'punkt_tab' back
# word_tokenize. Recent NLTK releases load the tokenizer model from
# 'punkt_tab' instead of the pickled 'punkt' package, so both are fetched
# to keep the script working on old and new NLTK versions alike.
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('punkt_tab')

# Step 1: Lowercase Conversion
def lower_case_text(text):
    """Return a copy of ``text`` with every cased character lowercased."""
    lowered = text.lower()
    return lowered

# Step 2: Remove Punctuations using regex
def remove_punctuations(text):
    """Return ``text`` with all punctuation characters deleted.

    Every character that is neither a word character nor whitespace is
    removed, and so is the underscore. Characters are deleted rather than
    replaced by a space, so "well-known" becomes "wellknown".
    """
    # \w matches "_" as a word character, so the negated class alone would
    # let underscores through; the extra alternative strips them as well.
    return re.sub(r'[^\w\s]|_', '', text)

# Step 3: Tokenization
def tokenize(text):
    """Split ``text`` into tokens by delegating to NLTK's ``word_tokenize``."""
    tokens = word_tokenize(text)
    return tokens

# Step 4: Stopword Removal
# The English stop-word list is loaded once into a set so that each
# membership test below is a constant-time lookup.
stop_words = set(stopwords.words('english'))
def remove_stopwords(tokens):
    """Return a new list holding the tokens that are not in ``stop_words``.

    The comparison is an exact string match, and the order of the
    surviving tokens is preserved.
    """
    kept = []
    for candidate in tokens:
        if candidate in stop_words:
            continue
        kept.append(candidate)
    return kept

# Step 5: Preprocessing Pipeline
def pre_process_pipeline(text):
    """Run the full text-cleaning pipeline on ``text`` and return the tokens.

    The steps run in a fixed order: lowercase, strip punctuation,
    tokenize, then drop stop words.
    """
    # NOTE(review): punctuation is stripped before stop-word filtering, so a
    # contraction such as "don't" reaches the filter as "dont" - confirm
    # whether such forms are still expected to be removed.
    cleaned = remove_punctuations(lower_case_text(text))
    return remove_stopwords(tokenize(cleaned))

# Example Usage
# Mixed case, repeated punctuation and parentheses exercise every
# stage of the preprocessing pipeline.
sample_text = (
    "Hello!! he said, and went..: (alphonso) mango "
    "IS My favorite FruiT in whole World!"
)
processed_tokens = pre_process_pipeline(text=sample_text)
print("Processed Tokens:", processed_tokens)

# Step 6: DataFrame Filtering
# Build a small customer table, then keep only the rows whose City is
# exactly 'Mumbai'. The original row labels are retained in the result.
data = {
    'Customer_ID': [101, 102, 103, 104, 105],
    'City': ['Mumbai', 'Pune', 'Mumbai', 'Delhi', 'Bangalore'],
}
df = pd.DataFrame(data)
filtered_df = df.loc[df['City'].eq('Mumbai')]
print("\nFiltered Data (City == Mumbai):")
print(filtered_df)

# Step 7: Script Validation
def validate_script(script, word):
    """Report whether ``word`` occurs in ``script``, ignoring case.

    Returns the string "Valid" when ``word`` appears anywhere in
    ``script`` and "Not Valid" otherwise. This is a substring test, not a
    whole-word test: "cat" is found inside "concatenate". An empty
    ``word`` is a substring of every script and therefore yields "Valid".
    """
    # casefold() is the caseless-matching form of lower(): it also equates
    # pairs such as "ß" and "SS" that lower() leaves different.
    if word.casefold() in script.casefold():
        return "Valid"
    return "Not Valid"

# Taking user input
# Read the script first, then the word to look for, both from stdin.
script_input = input("\nEnter the script: ")
word_input = input("Enter the word to check: ")

# Run the check on what the user typed and report the outcome.
result = validate_script(script=script_input, word=word_input)
print("Validation Result:", result)


[nltk_data] Downloading package stopwords to
[nltk_data]     /home/codespace/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/codespace/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Processed Tokens: ['hello', 'said', 'went', 'alphonso', 'mango', 'favorite', 'fruit', 'whole', 'world']

Filtered Data (City == Mumbai):
   Customer_ID    City
0          101  Mumbai
2          103  Mumbai
Validation Result: Valid
