# NLP Assignment
This notebook provides hands-on practice with text processing techniques in Python, including tokenization, lemmatization, and stemming. It also includes loading, analyzing, and scraping textual data from different sources.

## Setup
1. Install a Python programming environment (e.g., PyCharm, Jupyter Notebook).
2. Install the necessary Python libraries: `nltk`, `spaCy`, and `BeautifulSoup` (published on PyPI as `beautifulsoup4`).

```python
!pip install nltk spacy beautifulsoup4

import nltk
nltk.download('punkt')
nltk.download('wordnet')

import spacy
spacy.cli.download('en_core_web_sm')
```

## Data Loading & Basic Analysis
First, let's load the dataset and print the required basic statistics.

In [None]:
# Import necessary libraries
import pandas as pd
import nltk
from nltk.tokenize import word_tokenize
from collections import Counter
import spacy

# word_tokenize needs the Punkt tokenizer models. Fetch them in this cell so it
# also works on a fresh kernel (Restart & Run All) instead of relying on a
# download that only happens in a later cell.
nltk.download('punkt', quiet=True)
nltk.download('punkt_tab', quiet=True)  # resource name looked up by newer NLTK releases

# Load the spam.csv dataset
file_path = 'spam.csv'  # Replace with your actual file path
raw_df = pd.read_csv(file_path, encoding='latin-1')

# Keep only the label and text columns and give them descriptive names.
# The raw frame is left untouched; spam_df is an independent copy, so the
# columns added to it below never affect raw_df.
spam_df = raw_df[['v1', 'v2']].rename(columns={'v1': 'label', 'v2': 'message'}).copy()

# Total number of SMS messages
total_messages = len(spam_df)

# Number of spam/ham messages (one pass over the label column; 0 if a label is absent)
label_counts = spam_df['label'].value_counts()
spam_messages = int(label_counts.get('spam', 0))
ham_messages = int(label_counts.get('ham', 0))

# Average number of words per message.
# Note: word_tokenize emits punctuation marks as separate tokens, so they are counted too.
spam_df['word_count'] = spam_df['message'].apply(lambda x: len(word_tokenize(x)))
average_word_count = spam_df['word_count'].mean()

# Most frequent words (case-insensitive: the whole corpus is lower-cased before tokenizing)
all_words = ' '.join(spam_df['message']).lower()
all_words_tokenized = word_tokenize(all_words)
word_freq = Counter(all_words_tokenized)
most_common_words = word_freq.most_common(5)

# Number of words that only appear once
unique_words_count = sum(1 for count in word_freq.values() if count == 1)

# Print the statistics
print(f"Total number of SMS messages: {total_messages}")
print(f"Number of spam messages: {spam_messages}")
print(f"Number of ham messages: {ham_messages}")
print(f"Average number of words per message: {average_word_count}")
print(f"5 most frequent words: {most_common_words}")
print(f"Number of words that only appear once: {unique_words_count}")


## Text Processing
Now, let's perform tokenization and lemmatization using both `nltk` and `spaCy`, and stemming using `nltk` (spaCy does not include a stemmer).

### Tokenization

In [None]:
# Importing necessary libraries for text processing
import nltk
from nltk.tokenize import word_tokenize
import spacy
from time import time

# Download NLTK data
nltk.download('punkt')
nltk.download('wordnet')

# Load spaCy model
nlp = spacy.load('en_core_web_sm')

# NLTK Tokenization
start_time = time()
spam_df['nltk_tokens'] = spam_df['message'].apply(word_tokenize)
nltk_tokenization_time = time() - start_time

# spaCy Tokenization
# nlp.make_doc runs only the tokenizer. Calling nlp(x) would also run every
# other component of the loaded pipeline, so the measured time would not be
# a tokenization time and the comparison with NLTK would be misleading.
start_time = time()
spam_df['spacy_tokens'] = spam_df['message'].apply(lambda x: [token.text for token in nlp.make_doc(x)])
spacy_tokenization_time = time() - start_time

# Compare tokenization times
print(f"NLTK Tokenization Time: {nltk_tokenization_time} seconds")
print(f"spaCy Tokenization Time: {spacy_tokenization_time} seconds")


### Lemmatization

In [None]:
from nltk.stem import WordNetLemmatizer

# Initialize lemmatizers
nltk_lemmatizer = WordNetLemmatizer()
# Warm-up call: NLTK loads the WordNet corpus lazily on first use, so trigger
# that one-off load here to keep it out of the timed section below.
nltk_lemmatizer.lemmatize('warmup')

# NLTK Lemmatization
# lemmatize() is called without a part-of-speech tag, so every token is treated as a noun.
start_time = time()
spam_df['nltk_lemmas'] = spam_df['nltk_tokens'].apply(lambda tokens: [nltk_lemmatizer.lemmatize(token) for token in tokens])
nltk_lemmatization_time = time() - start_time

# spaCy Lemmatization
# Process the original messages rather than the re-joined token lists:
# joining tokens with spaces changes the text, so spaCy may tokenize it
# differently and the lemmas would no longer line up with 'spacy_tokens'.
# nlp.pipe processes the texts as a stream instead of one call per message.
start_time = time()
spam_df['spacy_lemmas'] = [[token.lemma_ for token in doc] for doc in nlp.pipe(spam_df['message'])]
spacy_lemmatization_time = time() - start_time

# Compare lemmatization times
print(f"NLTK Lemmatization Time: {nltk_lemmatization_time} seconds")
print(f"spaCy Lemmatization Time: {spacy_lemmatization_time} seconds")


### Stemming

In [None]:
from nltk.stem import PorterStemmer

# Set up NLTK's Porter stemmer (only NLTK is timed here)
nltk_stemmer = PorterStemmer()


def _stem_all(tokens):
    """Return the Porter stem of every token in `tokens`, in the same order."""
    return [nltk_stemmer.stem(token) for token in tokens]


# Stem each message's NLTK tokens and time the whole pass
start_time = time()
spam_df['nltk_stems'] = spam_df['nltk_tokens'].apply(_stem_all)
nltk_stemming_time = time() - start_time

# Report the elapsed time
print(f"NLTK Stemming Time: {nltk_stemming_time} seconds")


### Comparison

In [None]:
# Qualitative notes on the nltk and spaCy implementations,
# stored as (heading, remark) pairs and printed in order.
comparison_notes = [
    (
        "Tokenization Comparison:",
        "NLTK produces tokens as a list of strings while spaCy produces tokens as token objects with rich attributes.",
    ),
    (
        "Lemmatization Comparison:",
        "NLTK lemmatizer is simpler and language-agnostic but less powerful than spaCy's lemmatizer which handles context and part-of-speech better.",
    ),
    (
        "Stemming Comparison:",
        "NLTK has a built-in stemmer, whereas spaCy does not include stemming as it focuses on more advanced NLP features.",
    ),
]

for heading, remark in comparison_notes:
    print(heading)
    print(remark)


## Web Scraping

In [None]:
# Web Scraping using BeautifulSoup
import requests
from bs4 import BeautifulSoup

# URL of a public profile
url = 'https://example.com/profile'  # Replace with the URL of the profile you want to analyze

# Send a GET request to the URL.
# - timeout: without one, requests can wait indefinitely on an unresponsive
#   server and hang the notebook.
# - raise_for_status: stop on an HTTP error (4xx/5xx) instead of silently
#   analysing the text of an error page.
response = requests.get(url, timeout=10)
response.raise_for_status()
soup = BeautifulSoup(response.content, 'html.parser')

# Extract text content from the page.
# A separator keeps the text of adjacent elements apart; without it, strings
# from neighbouring tags are concatenated and words get fused together.
profile_text = soup.get_text(separator=' ')

# Tokenization, Lemmatization, and Stemming of the scraped text
# (word_tokenize, nltk_lemmatizer and nltk_stemmer come from the earlier cells)
profile_tokens = word_tokenize(profile_text)
profile_lemmas = [nltk_lemmatizer.lemmatize(token) for token in profile_tokens]
profile_stems = [nltk_stemmer.stem(token) for token in profile_tokens]

# Print word statistics
profile_word_freq = Counter(profile_tokens)
print(f"Profile Text Word Count: {len(profile_tokens)}")
print(f"Profile Text Most Common Words: {profile_word_freq.most_common(5)}")


## WhatsApp Analysis

In [None]:
# Location of the exported WhatsApp chat (.txt)
whatsapp_file_path = 'path/to/whatsapp/messages.txt'  # Point this at your own chat export

# Read the whole export into memory as a single string
with open(whatsapp_file_path, 'r', encoding='utf-8') as chat_file:
    whatsapp_text = chat_file.read()

# Tokenize once, then derive lemmas and stems from the same token list
# (word_tokenize, nltk_lemmatizer and nltk_stemmer come from the earlier cells)
whatsapp_tokens = word_tokenize(whatsapp_text)
whatsapp_lemmas = list(map(nltk_lemmatizer.lemmatize, whatsapp_tokens))
whatsapp_stems = list(map(nltk_stemmer.stem, whatsapp_tokens))

# Token frequencies and summary figures
whatsapp_word_freq = Counter(whatsapp_tokens)
whatsapp_word_total = len(whatsapp_tokens)
whatsapp_top_words = whatsapp_word_freq.most_common(5)

print(f"WhatsApp Messages Word Count: {whatsapp_word_total}")
print(f"WhatsApp Messages Most Common Words: {whatsapp_top_words}")


In [None]:

import json
import re
import string
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from gensim.models import Word2Vec
import gensim.downloader as api
from nltk.grammar import CFG
from nltk.parse.chart import ChartParser

# Load the file content.
# An .ipynb file is a JSON document, so the standard library is enough to read it.
file_path = '/mnt/data/NLP_Assignment.ipynb'  # Replace with the path to your notebook
with open(file_path, 'r', encoding='utf-8') as file:
    notebook = json.load(file)

# Extracting the source text of the markdown and code cells
text_data = []
for cell in notebook.get('cells', []):
    if cell.get('cell_type') in ('markdown', 'code'):
        source = cell.get('source', '')
        # The notebook format stores `source` either as one string or as a list of line strings.
        text_data.append(source if isinstance(source, str) else ''.join(source))

# Join all text data into a single corpus
corpus = '\n'.join(text_data)

# White space tokenizer
white_space_tokens = corpus.split()

# Regex tokenizer
regex_tokens = re.findall(r'\b\w+\b', corpus)

# Simplified word tokenizer (same pattern as the regex tokenizer: runs of word characters)
word_tokens = re.findall(r'\b\w+\b', corpus)

# Sentence tokenizer (splitting by period); blank fragments are dropped so they
# do not become empty documents in the vectorizers below
sentences = [fragment.strip() for fragment in corpus.split('.') if fragment.strip()]

# Stemming (naive suffix stripping: removes a trailing "ing", "ed" or "s")
stemmed_words = [re.sub(r'(ing|ed|s)$', '', word) for word in word_tokens]

# Lemmatization (using a simple rule-based approach: drop a trailing "s")
lemmatized_words = [word[:-1] if word.endswith('s') else word for word in word_tokens]

# Remove stop words (manually defining a list of common stop words)
stop_words = {'and', 'or', 'but', 'if', 'while', 'with', 'a', 'an', 'the'}
filtered_words = [word for word in word_tokens if word.lower() not in stop_words]

# Bag of Words (BOW): one row per sentence, one column per vocabulary term
vectorizer = CountVectorizer()
X_bow = vectorizer.fit_transform(sentences)
bow_array = X_bow.toarray()

# TF-IDF
tfidf_vectorizer = TfidfVectorizer()
X_tfidf = tfidf_vectorizer.fit_transform(sentences)
tfidf_array = X_tfidf.toarray()

# Word Embedding by Word2Vec (the whole corpus is treated as a single "sentence")
query_word = 'text'
word2vec_model = Word2Vec([word_tokens], vector_size=100, window=5, min_count=1, workers=4)
# most_similar raises KeyError for a word outside the vocabulary, so check first.
if query_word in word2vec_model.wv.key_to_index:
    word2vec_similar = word2vec_model.wv.most_similar(query_word, topn=5)
else:
    word2vec_similar = []

# GloVe model (pre-trained vectors fetched through gensim's downloader)
glove_vectors = api.load("glove-wiki-gigaword-100")
glove_similar = glove_vectors.most_similar(query_word, topn=5)

# Select 5 sentences to parse with the grammar below
selected_sentences = ["John saw the man with the telescope.", "Mary walked in the park.", "The dog ate a cat.", "A man saw a dog.", "Bob walked by the park."]

# Define a simple CFG
cfg_expanded = CFG.fromstring("""
    S -> NP VP
    VP -> V NP | V NP PP | V PP
    PP -> P NP
    V -> "saw" | "ate" | "walked" | "is"
    NP -> "John" | "Mary" | "Bob" | Det N | Det N PP | "man" | "dog" | "cat" | "park" | "telescope"
    Det -> "a" | "an" | "the" | "my" | "A" | "The"
    N -> "man" | "dog" | "cat" | "telescope" | "park"
    P -> "in" | "on" | "by" | "with"
""")

# Create a chart parser
parser_expanded = ChartParser(cfg_expanded)

# Remove punctuation from sentences
punctuation_table = str.maketrans('', '', string.punctuation)
cleaned_sentences = [sentence.translate(punctuation_table) for sentence in selected_sentences]

# Parse the cleaned sentences with expanded CFG (each entry holds every parse tree found)
parsed_sentences_expanded = [
    list(parser_expanded.parse(sentence.split()))
    for sentence in cleaned_sentences
]

# Display results.
# The columns have different lengths (20 tokens vs. 5 sentences); wrapping each
# list in a Series lets pandas pad the shorter columns with NaN instead of
# raising "All arrays must be of the same length".
tokenization_results = pd.DataFrame({
    column: pd.Series(values)
    for column, values in {
        "White Space Tokens": white_space_tokens[:20],
        "Regex Tokens": regex_tokens[:20],
        "Word Tokens": word_tokens[:20],
        "Sentences": sentences[:5],
        "Stemmed Words": stemmed_words[:20],
        "Lemmatized Words": lemmatized_words[:20],
        "Filtered Words": filtered_words[:20],
    }.items()
})
print("Tokenization and Normalization Results")
display(tokenization_results)  # `display` is provided by the IPython/Jupyter kernel

# Prepare results for feature extraction (last expression, so the notebook renders it)
{
    "BOW": bow_array,
    "TF-IDF": tfidf_array,
    "Word2Vec Similar": word2vec_similar,
    "GloVe Similar": glove_similar,
    "Selected Sentences": selected_sentences
}
