# NLP Assignment
This notebook provides hands-on practice with text processing techniques in Python, including tokenization, lemmatization, and stemming. It also includes loading, analyzing, and scraping textual data from different sources.

## Setup
1. Install a Python programming environment (e.g., PyCharm, Jupyter Notebook).
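2. Install the necessary Python libraries: `nltk`, `spaCy`, and `beautifulsoup4` (BeautifulSoup), plus `pandas`, `numpy`, `requests`, `scikit-learn`, and `gensim`, which later sections import.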

```python
!pip install nltk spacy beautifulsoup4

import nltk
nltk.download('punkt')
nltk.download('wordnet')

import spacy
spacy.cli.download('en_core_web_sm')
```

## Data Loading & Basic Analysis
First, let's load the dataset and print the required basic statistics.

In [None]:
# Import necessary libraries
import pandas as pd
import nltk
from nltk.tokenize import word_tokenize
from collections import Counter
import spacy

# Load the spam.csv dataset
file_path = 'spam.csv'  # Replace with your actual file path
raw_spam_df = pd.read_csv(file_path, encoding='latin-1')

# Keep only the label and text columns under readable names. Working on an
# explicit copy keeps the raw frame available and unmodified.
spam_df = raw_spam_df[['v1', 'v2']].copy()
spam_df.columns = ['label', 'message']

# Total number of SMS messages
total_messages = len(spam_df)

# Number of spam/ham messages
label_counts = spam_df['label'].value_counts()
spam_messages = int(label_counts.get('spam', 0))
ham_messages = int(label_counts.get('ham', 0))


def _is_word(token):
    """Return True if ``token`` contains at least one letter or digit.

    word_tokenize emits punctuation marks as separate tokens; this predicate
    keeps them out of the *word* statistics below.
    """
    return any(char.isalnum() for char in token)


# Tokenize every message once and keep only real words (no bare punctuation),
# so that the counts below describe words rather than all tokens.
message_words = spam_df['message'].apply(
    lambda text: [token for token in word_tokenize(text) if _is_word(token)]
)

# Average number of words per message
spam_df['word_count'] = message_words.apply(len)
average_word_count = spam_df['word_count'].mean()

# Most frequent words (case-insensitive)
all_words = ' '.join(spam_df['message']).lower()
all_words_tokenized = [word.lower() for words in message_words for word in words]
word_freq = Counter(all_words_tokenized)
most_common_words = word_freq.most_common(5)

# Number of words that only appear once
unique_words_count = sum(1 for count in word_freq.values() if count == 1)

# Print the statistics
print(f"Total number of SMS messages: {total_messages}")
print(f"Number of spam messages: {spam_messages}")
print(f"Number of ham messages: {ham_messages}")
print(f"Average number of words per message: {average_word_count}")
print(f"5 most frequent words: {most_common_words}")
print(f"Number of words that only appear once: {unique_words_count}")


## Text Processing
Now, let's perform tokenization and lemmatization using both `nltk` and `spaCy`, and stemming using `nltk` (spaCy does not include a stemmer).

### Tokenization

In [None]:
# Importing necessary libraries for text processing
import nltk
from nltk.tokenize import word_tokenize
import spacy
from time import time

# Download NLTK data
nltk.download('punkt')
nltk.download('punkt_tab')  # tokenizer data required by newer NLTK releases
nltk.download('wordnet')

# Load spaCy model
nlp = spacy.load('en_core_web_sm')

# NLTK Tokenization
start_time = time()
spam_df['nltk_tokens'] = spam_df['message'].apply(word_tokenize)
nltk_tokenization_time = time() - start_time

# spaCy Tokenization
# nlp.tokenizer runs only the tokenizer. Calling nlp(x) would also run every
# other pipeline component on each message, which is wasted work for this step
# and would make the timing comparison with NLTK unfair.
start_time = time()
spam_df['spacy_tokens'] = spam_df['message'].apply(lambda x: [token.text for token in nlp.tokenizer(x)])
spacy_tokenization_time = time() - start_time

# Compare tokenization times
print(f"NLTK Tokenization Time: {nltk_tokenization_time} seconds")
print(f"spaCy Tokenization Time: {spacy_tokenization_time} seconds")


### Lemmatization

In [None]:
from nltk.stem import WordNetLemmatizer

# Initialize lemmatizers
nltk_lemmatizer = WordNetLemmatizer()

# NLTK Lemmatization
# NOTE: lemmatize() is called without a part-of-speech argument, so every
# token is looked up with the lemmatizer's default part of speech (noun).
start_time = time()
spam_df['nltk_lemmas'] = spam_df['nltk_tokens'].apply(lambda tokens: [nltk_lemmatizer.lemmatize(token) for token in tokens])
nltk_lemmatization_time = time() - start_time

# spaCy Lemmatization
# Run the pipeline on the original messages. Re-joining spam_df['spacy_tokens']
# with spaces can change the text spaCy sees (spacing, split contractions), so
# the lemmas may no longer line up with the tokens. nlp.pipe processes the
# messages as a stream instead of one nlp() call per row.
start_time = time()
spam_df['spacy_lemmas'] = pd.Series(
    [[token.lemma_ for token in doc] for doc in nlp.pipe(spam_df['message'])],
    index=spam_df.index,
)
spacy_lemmatization_time = time() - start_time

# Compare lemmatization times
print(f"NLTK Lemmatization Time: {nltk_lemmatization_time} seconds")
print(f"spaCy Lemmatization Time: {spacy_lemmatization_time} seconds")


### Stemming

In [None]:
from nltk.stem import PorterStemmer

# Porter stemmer instance (also used by the scraping and WhatsApp cells)
nltk_stemmer = PorterStemmer()


def _stem_tokens(tokens):
    """Return the Porter stem of every token in ``tokens`` as a list."""
    return list(map(nltk_stemmer.stem, tokens))


# Time how long NLTK needs to stem the whole token column
start_time = time()
spam_df['nltk_stems'] = spam_df['nltk_tokens'].apply(_stem_tokens)
nltk_stemming_time = time() - start_time

# Report the stemming time
print(f"NLTK Stemming Time: {nltk_stemming_time} seconds")


### Comparison

In [None]:
# Comparison of nltk and spaCy implementations

print("Tokenization Comparison:")
print("NLTK produces tokens as a list of strings while spaCy produces tokens as token objects with rich attributes.")

print("Lemmatization Comparison:")
# WordNetLemmatizer is built on the English WordNet and defaults to the noun
# part of speech, so it is neither language-agnostic nor context-aware.
print("NLTK's WordNet lemmatizer is a simple English-only dictionary lookup that treats every token as a noun unless a part-of-speech tag is supplied, whereas spaCy's lemmatizer uses the part-of-speech tags predicted by its pipeline and so handles context better.")

print("Stemming Comparison:")
print("NLTK has a built-in stemmer, whereas spaCy does not include stemming as it focuses on more advanced NLP features.")


## Web Scraping

In [None]:
# Web Scraping using BeautifulSoup
import requests
from bs4 import BeautifulSoup

# URL of a public profile
url = 'https://example.com/profile'  # Replace with the URL of the profile to analyze

# Send a GET request to the URL.
# The timeout keeps the cell from hanging on an unresponsive server, and
# raise_for_status() stops on an HTTP error instead of analyzing an error page.
response = requests.get(url, timeout=10)
response.raise_for_status()
soup = BeautifulSoup(response.content, 'html.parser')

# Extract text content from the page.
# <script>/<style> elements are removed defensively so their contents can never
# end up in the text (whether get_text() skips them depends on the
# BeautifulSoup version). The separator keeps words from adjacent elements
# from being fused together.
for non_text_tag in soup(['script', 'style']):
    non_text_tag.decompose()
profile_text = soup.get_text(separator=' ')

# Tokenization, Lemmatization, and Stemming of the scraped text
profile_tokens = word_tokenize(profile_text)
profile_lemmas = [nltk_lemmatizer.lemmatize(token) for token in profile_tokens]
profile_stems = [nltk_stemmer.stem(token) for token in profile_tokens]

# Print word statistics
profile_word_freq = Counter(profile_tokens)
print(f"Profile Text Word Count: {len(profile_tokens)}")
print(f"Profile Text Most Common Words: {profile_word_freq.most_common(5)}")


## WhatsApp Analysis

In [None]:
# Location of the exported WhatsApp chat (.txt)
whatsapp_file_path = 'path/to/whatsapp/messages.txt'  # Replace with the path to your own export

# Read the whole export into a single string
with open(whatsapp_file_path, mode='r', encoding='utf-8') as file:
    whatsapp_text = file.read()

# Tokenize once, then derive lemmas and stems from the same token list
whatsapp_tokens = word_tokenize(whatsapp_text)
whatsapp_lemmas = list(map(nltk_lemmatizer.lemmatize, whatsapp_tokens))
whatsapp_stems = list(map(nltk_stemmer.stem, whatsapp_tokens))

# Token count and the five most frequent tokens
whatsapp_word_freq = Counter(whatsapp_tokens)
print(f"WhatsApp Messages Word Count: {len(whatsapp_tokens)}")
print(f"WhatsApp Messages Most Common Words: {whatsapp_word_freq.most_common(5)}")



## Tokenization
In this section, we will apply different tokenization methods: a whitespace tokenizer, a regex tokenizer, NLTK's word tokenizer, and NLTK's sentence tokenizer.


In [None]:

# Tokenization Section

# White Space Tokenizer
def whitespace_tokenizer(text):
    """Split ``text`` on runs of whitespace and return the pieces as a list."""
    pieces = text.split()
    return pieces

# Whitespace-tokenize every message and store the result in its own column
spam_df['whitespace_tokens'] = spam_df['message'].map(whitespace_tokenizer)

# Regex Tokenizer
import re

def regex_tokenizer(text):
    """Return every run of word characters found in ``text``, in order of appearance."""
    word_pattern = re.compile(r'\b\w+\b')
    return [match.group(0) for match in word_pattern.finditer(text)]

# Apply the regex tokenizer to every message
spam_df['regex_tokens'] = spam_df['message'].map(regex_tokenizer)

# NLTK word- and sentence-level tokenizers
from nltk.tokenize import word_tokenize
from nltk.tokenize import sent_tokenize

spam_df['word_tokens'] = spam_df['message'].map(word_tokenize)
spam_df['sentence_tokens'] = spam_df['message'].map(sent_tokenize)

# Show the original message next to each tokenization for the first rows
token_columns = ['message', 'whitespace_tokens', 'regex_tokens', 'word_tokens', 'sentence_tokens']
print(spam_df[token_columns].head())



## Normalization
In this section, we will apply normalization techniques including stemming and lemmatization.


In [None]:

# Normalization Section

# NLTK normalizers used in this section
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer

stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()


def _normalize_tokens(normalize, tokens):
    """Apply ``normalize`` to each token in ``tokens`` and return the results as a list."""
    return [normalize(token) for token in tokens]


# Stemming of the NLTK word tokens
spam_df['stemmed_tokens'] = spam_df['word_tokens'].apply(
    lambda tokens: _normalize_tokens(stemmer.stem, tokens)
)

# Lemmatization of the same tokens
spam_df['lemmatized_tokens'] = spam_df['word_tokens'].apply(
    lambda tokens: _normalize_tokens(lemmatizer.lemmatize, tokens)
)

# Display normalization results
normalized_columns = ['message', 'stemmed_tokens', 'lemmatized_tokens']
print(spam_df[normalized_columns].head())



## Stop Words Removal
In this section, we will remove stop words including conjunctions and articles.


In [None]:

# Stop Words Removal Section

import nltk
from nltk.corpus import stopwords

# The stop-word lists are a separate NLTK resource that no earlier cell
# downloads; without it stopwords.words() raises LookupError.
nltk.download('stopwords')

# A set gives constant-time membership tests in remove_stopwords
stop_words = set(stopwords.words('english'))

def remove_stopwords(tokens):
    """Return the tokens whose lowercase form is not in ``stop_words``, in their original order."""
    kept = []
    for token in tokens:
        if token.lower() in stop_words:
            continue
        kept.append(token)
    return kept

# Filter the NLTK word tokens of every message
spam_df['tokens_no_stopwords'] = spam_df['word_tokens'].map(remove_stopwords)

# Show each message next to its filtered tokens for the first rows
filtered_columns = ['message', 'tokens_no_stopwords']
print(spam_df[filtered_columns].head())



## Feature Extraction
In this section, we will apply feature extraction techniques including Bag of Words (BOW), TF-IDF, and Word2Vec.


In [None]:

# Feature Extraction Section

# Bag of Words (BOW)
from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer()
X_bow = vectorizer.fit_transform(spam_df['message'])

# TF-IDF
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf_vectorizer = TfidfVectorizer()
X_tfidf = tfidf_vectorizer.fit_transform(spam_df['message'])

# Word2Vec
from gensim.models import Word2Vec

sentences = [word_tokenize(message) for message in spam_df['message']]
# Word2Vec training is randomized. A fixed seed together with a single worker
# thread makes the learned vectors repeatable between runs; several workers
# would reintroduce ordering jitter from thread scheduling.
word2vec_model = Word2Vec(sentences, vector_size=100, window=5, min_count=1, workers=1, seed=42)
word2vec_vectors = word2vec_model.wv

# Display feature extraction results
print("Bag of Words shape:", X_bow.shape)
print("TF-IDF shape:", X_tfidf.shape)
print("Word2Vec vocab size:", len(word2vec_vectors))



## GloVe Explanation and Application
GloVe (Global Vectors for Word Representation) is an unsupervised learning algorithm for obtaining vector representations for words. Training is performed on aggregated global word-word co-occurrence statistics from a corpus, and the resulting representations showcase interesting linear substructures of the word vector space.

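To apply GloVe to our data, we need to download pre-trained GloVe embeddings (the code below reads `glove.6B.100d.txt` from the working directory) and map our words to these vectors. Because these embeddings were pre-trained on a large corpus, they usually carry richer semantic information than a Word2Vec model trained on a small corpus such as this one.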


In [None]:

import numpy as np

# Read the pre-trained GloVe table. Each line is parsed as a token followed by
# its vector components, all separated by whitespace.
glove_embeddings = {}
with open("glove.6B.100d.txt", mode="r", encoding="utf8") as f:
    for line in f:
        values = line.split()
        word, coefficients = values[0], values[1:]
        glove_embeddings[word] = np.asarray(coefficients, dtype="float32")

# Function to get GloVe vector for a word
def get_glove_vector(word, embeddings=None, dim=100):
    """Look up the GloVe vector for ``word``.

    The glove.6B vocabulary is lowercase, so a token that is not found as
    written is looked up again in lowercase before it is treated as unknown.

    Args:
        word: Token to look up.
        embeddings: Optional mapping from token to vector. Defaults to the
            notebook-level ``glove_embeddings`` dictionary.
        dim: Length of the zero vector returned for unknown tokens.

    Returns:
        The stored vector for ``word``, or a float32 zero vector of length
        ``dim`` (same dtype as the loaded embeddings) if the token is unknown.
    """
    table = glove_embeddings if embeddings is None else embeddings
    vector = table.get(word)
    if vector is None:
        vector = table.get(word.lower())
    if vector is None:
        vector = np.zeros(dim, dtype="float32")
    return vector

# Look up a GloVe vector for every remaining token of every message
spam_df['glove_vectors'] = spam_df['tokens_no_stopwords'].map(
    lambda tokens: list(map(get_glove_vector, tokens))
)

# Show each message next to its list of token vectors for the first rows
glove_columns = ['message', 'glove_vectors']
print(spam_df[glove_columns].head())



## CYK Tagging
In this section, we will select 5 messages and build a CYK parse table for each of them using a small example grammar.


In [None]:

# CYK Tagging Section
# Select 5 sentences for CYK tagging.
# A fixed random_state makes the sample, and therefore the output of this
# section, the same on every run.
sample_sentences = spam_df['message'].sample(5, random_state=42).tolist()

# Example CYK tagging (simplified version)
# Here we use a dummy grammar for demonstration; replace with an actual grammar for real use cases
# Each key is a left-hand-side symbol and each value lists its right-hand sides:
# two-element lists such as ['NP', 'VP'] are pairs of symbols, plain strings are words.
grammar = {
    'S': [['NP', 'VP']],      # S   -> NP VP
    'NP': [['Det', 'N']],     # NP  -> Det N
    'VP': [['V', 'NP']],      # VP  -> V NP
    'Det': ['a', 'the'],      # Det -> 'a' | 'the'
    'N': ['dog', 'cat'],      # N   -> 'dog' | 'cat'
    'V': ['chased', 'sat']    # V   -> 'chased' | 'sat'
}

def cyk_parse(sentence, grammar):
    """Fill a CYK chart for ``sentence`` using ``grammar``.

    Args:
        sentence: Text to parse; it is split on whitespace into words.
        grammar: Mapping from a left-hand-side symbol to a list of its
            productions. A production is either a string (a terminal word)
            or a two-element sequence of symbols (a binary rule).

    Returns:
        An n x n list of lists of sets, where n is the number of words.
        ``table[i][j]`` with ``i <= j`` holds every symbol that derives the
        words from position i to position j inclusive; cells with ``i > j``
        stay empty. An empty sentence yields ``[]``.
    """
    words = sentence.split()
    n = len(words)
    table = [[set() for _ in range(n)] for _ in range(n)]

    # Split the grammar into its two kinds of productions once, up front.
    # Each production is inspected individually: checking the length of the
    # whole production list (as opposed to one production) would never match
    # a binary rule.
    terminal_rules = []  # (lhs, word)
    binary_rules = []    # (lhs, left symbol, right symbol)
    for lhs, productions in grammar.items():
        for production in productions:
            if isinstance(production, str):
                terminal_rules.append((lhs, production))
            elif len(production) == 2:
                binary_rules.append((lhs, production[0], production[1]))

    for j in range(n):
        # Diagonal cell: symbols that produce the j-th word directly
        for lhs, word in terminal_rules:
            if word == words[j]:
                table[j][j].add(lhs)
        # Spans ending at j, shortest first, so that table[k + 1][j] is
        # already complete when it is read
        for i in range(j - 1, -1, -1):
            for k in range(i, j):
                for lhs, left, right in binary_rules:
                    if left in table[i][k] and right in table[k + 1][j]:
                        table[i][j].add(lhs)

    return table

# Build one CYK table per sampled sentence
cyk_results = list(map(lambda text: cyk_parse(text, grammar), sample_sentences))

# Print every sentence followed by its table
for position, sentence in enumerate(sample_sentences):
    result = cyk_results[position]
    print(f"Sentence: {sentence}")
    print(f"CYK Parse Table: {result}")
