# NLP Final Project 

### By: Idan Dunsky, Yaniv Kaveh-Shtul

# Imports

In [1]:
import spacy
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt


from wordcloud import WordCloud
from collections import Counter
from ntscraper import Nitter
from sklearn.feature_extraction.text import TfidfVectorizer,CountVectorizer
from gensim.models import Word2Vec
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense

# Data

## Creating Dataset

In [None]:
# Shared Nitter scraper instance; get_tweets() below reads it as a global.
scraper = Nitter()

In [None]:
def get_tweets(name, modes, no):
    """Scrape tweets through the module-level ``scraper`` and tabulate them.

    Args:
        name (str): search target handed to the scraper (the original notes
            describe it as a username; presumably a hashtag or term for the
            other modes - verify against the scraper)
        modes (str): scraping mode - one of hashtag, user, term
        no (int): number of tweets to request

    Returns:
        DataFrame: one row per scraped tweet, with the columns
            link, text, date, No_of_Likes, No_of_tweets
    """
    column_names = ['link', 'text','date','No_of_Likes','No_of_tweets']

    scraped = scraper.get_tweets(name, mode = modes, number = no)

    # NOTE(review): the 'No_of_tweets' column is filled from stats['comments'].
    rows = [
        [item['link'], item['text'], item['date'], item['stats']['likes'], item['stats']['comments']]
        for item in scraped['tweets']
    ]
    return pd.DataFrame(rows, columns=column_names)

In [None]:
BBC = 'BBCNews'    # BBC News Twitter account
NYTIMES = 'nytimes' # New York Times Twitter account
MODE = 'user'       # scraping mode
NUM_OF_TWEETS = 900 # Maximum number of tweets allowed


In [None]:
# Request NUM_OF_TWEETS tweets from each account, using the MODE scraping mode
bbc_df = get_tweets(BBC, MODE, NUM_OF_TWEETS)
nytimes_df = get_tweets(NYTIMES, MODE, NUM_OF_TWEETS)

In [None]:
bbc_df

In [None]:
nytimes_df

### Saving data to files

In [None]:
# index=False keeps the DataFrame's positional index out of the files, so
# reading them back with pd.read_csv does not add an extra "Unnamed: 0" column.
bbc_df.to_csv("bbc_tweets.csv", index=False)
nytimes_df.to_csv("nytimes_tweets.csv", index=False)

## Reading data

In [None]:
# Paths of the CSV files written in the "Saving data to files" step.
# NOTE(review): BBC was previously bound to the account name 'BBCNews'; from
# here on it holds a file path, so re-running the scraping cell afterwards
# would pass this path to get_tweets - consider a distinct name.
BBC = './bbc_tweets.csv'
NYT = './nytimes_tweets.csv'

In [None]:
# Load the saved tweets back into DataFrames
# (the New York Times frame is named nyt_df from here on, not nytimes_df).
bbc_df = pd.read_csv(BBC)
nyt_df = pd.read_csv(NYT)

In [None]:
import re

# Compiled once and reused for both corpora: matches a link starting with
# "http" up to the next whitespace character.
URL_PATTERN = re.compile(r'http\S+')

# Strip links from every tweet. A tweet whose text is missing comes back from
# the CSV as NaN (a float), which re.sub would reject with a TypeError, so
# missing values become empty strings; this keeps each corpus the same length
# as its DataFrame.
bbc_corpus = [URL_PATTERN.sub('', text) for text in bbc_df['text'].fillna('').astype(str)]
nyt_corpus = [URL_PATTERN.sub('', text) for text in nyt_df['text'].fillna('').astype(str)]

In [None]:
bbc_corpus

In [None]:
nyt_corpus

# Pre-processing Data

## Tokenization

In [None]:
# spaCy pipeline shared by the later steps: its processed documents supply the
# token flags used in tokenize, the lemmas used in lemmatize and the entities
# printed in the NER section.
nlp = spacy.load('en_core_web_sm')

In [None]:
# helper function to tokenize corpus 
def process(nlp, corpus):
    """Run every text of ``corpus`` through ``nlp`` and collect the results.

    Args:
        nlp (callable): called once per text; in this notebook it is the
            loaded spaCy pipeline
        corpus (iterable[str]): texts to process

    Returns:
        list: the outputs of ``nlp``, in corpus order
    """
    return list(map(nlp, corpus))

def tokenize(processed_corpus):
    """Keep only the content tokens of every processed document.

    Args:
        processed_corpus (iterable): documents, each an iterable of tokens
            exposing the ``is_punct``, ``is_stop`` and ``is_space`` flags

    Returns:
        list[list]: per document, the tokens for which all three flags are
            false (original token objects, original order)
    """
    def is_content(token):
        # A token is dropped as soon as any one of the three flags is set
        return not (token.is_punct or token.is_stop or token.is_space)

    return [[token for token in sent if is_content(token)] for sent in processed_corpus]


In [None]:
# Run the spaCy pipeline over both corpora (one processed document per tweet text)
bbc_processed_corpus = process(nlp, bbc_corpus)
nyt_processed_corpus = process(nlp, nyt_corpus)

# Drop punctuation, stop words and whitespace tokens from every document
bbc_tokens = tokenize(bbc_processed_corpus)
nyt_tokens = tokenize(nyt_processed_corpus)

## Lemmatization

In [None]:
def lemmatize(tokens):
    """Collect the distinct lemmas of a tokenized corpus.

    Args:
        tokens (iterable): documents, each an iterable of tokens exposing
            a ``lemma_`` attribute

    Returns:
        set: every ``lemma_`` value that occurs, without duplicates
    """
    return {token.lemma_ for sent in tokens for token in sent}

In [None]:
# Distinct lemmas of each corpus (lemmatize returns a set, so duplicates are gone)
bbc_lemmas = lemmatize(bbc_tokens)
nyt_lemmas = lemmatize(nyt_tokens)

In [None]:
bbc_lemmas

In [None]:
nyt_lemmas

# Statistics

## Most frequent words

We will first find the most frequent words in each corpus the naive, manual way: split the text into words with a regular expression, drop the stopwords, and count what remains.

In [None]:
from nltk.corpus import stopwords

def get_most_frequent_words(corpus, top_n=5):
    """Count the non-stopword words of a corpus and return the most common ones.

    Args:
        corpus (list[str]): documents to analyse
        top_n (int, optional): how many words to return. Defaults to 5.

    Returns:
        list[tuple[str, int]]: (word, count) pairs, most frequent first
    """
    # Treat the corpus as one lower-cased text and pull out its \w+ runs as words
    all_words = re.findall(r'\b\w+\b', ' '.join(corpus).lower())

    # English stopword list from NLTK, as a set for fast membership tests
    english_stopwords = set(stopwords.words('english'))

    counts = Counter(word for word in all_words if word not in english_stopwords)
    return counts.most_common(top_n)

In [None]:
def most_freq(corpus, name, number = 5):
    """Print the ``number`` most frequent words of a corpus with their counts.

    Args:
        corpus (list[str]): corpus
        name (str): corpus name, used in the printed header
        number (int): number of words to print. Defaults to 5.
    """
    # (word, count) pairs, most frequent first
    top_words = get_most_frequent_words(corpus, top_n=number)

    # The header reports the requested number instead of a hard-coded 5
    print(f"Top {number} most frequent words in the {name}:")
    for word, freq in top_words:
        print(f"{word}: {freq}")

In [None]:
# Print the most frequent words of each corpus (default: 5 words each),
# with a blank line between the two listings
most_freq(bbc_corpus, 'BBC')
print()
most_freq(nyt_corpus,'New-York Times')

## TF-IDF most frequent


TF-IDF (Term Frequency-Inverse Document Frequency) is a statistical measure used to evaluate the importance of a word in a document relative to a collection of documents (corpus). It combines two components: Term Frequency (TF), which measures how often a word appears in a document, and Inverse Document Frequency (IDF), which measures how common or rare a word is across the entire corpus. TF-IDF helps in identifying key terms and improving text analysis tasks like information retrieval and document classification.


We will now use `TfidfVectorizer` to find the most frequent words in the corpus.

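We extract the summed `tf` of each word by dividing the output of `TfidfVectorizer` by the fitted `TfidfVectorizer.idf_` weights, which gives the frequency of each word in the corpus. Note that this recovers the raw frequencies exactly only when the vectorizer's row normalization is turned off (`norm=None`); with the default `norm='l2'` every document row is rescaled first, so the result is a length-normalized frequency rather than a plain count.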

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer


def tf_idf(corpus: list[str]) -> pd.DataFrame:
    """Compute the corpus-wide term frequency of every word via ``TfidfVectorizer``.

    The vectorizer is fitted without row normalisation, so every matrix entry
    is exactly ``tf * idf``; dividing the per-word column sums by ``idf_``
    therefore yields how often each word occurs in the whole corpus.

    Args:
        corpus (list[str]): the documents to analyse

    Returns:
        DataFrame: columns ``word`` and ``tf_score`` (summed term frequency),
            sorted by ``tf_score`` in descending order
    """
    # norm=None switches off the default L2 row normalisation; with it enabled,
    # dividing by idf_ would give length-normalised weights, not frequencies.
    vectorizer = TfidfVectorizer(stop_words='english', norm=None)

    # Fit and transform the corpus (sparse documents x words matrix)
    X = vectorizer.fit_transform(corpus)

    # Get feature names (words), in the same order as the matrix columns
    feature_names = vectorizer.get_feature_names_out()

    # Sum each column on the sparse matrix (no dense copy), then undo the idf
    # weighting: sum_d(tf * idf) / idf == sum_d(tf).
    tf_scores = np.asarray(X.sum(axis=0)).ravel() / vectorizer.idf_

    # Create a DataFrame for better visualization
    tf_df = pd.DataFrame({'word': feature_names, 'tf_score': tf_scores})

    # Most frequent words first
    return tf_df.sort_values(by='tf_score', ascending=False)

In [None]:
# Per-word score tables for both corpora (tf_idf returns them sorted by tf_score, highest first)
bbc_tf = tf_idf(bbc_corpus)
nyt_tf = tf_idf(nyt_corpus)

In [None]:
bbc_tf

In [None]:
nyt_tf

### Frequency Bar Chart

In [None]:
def plot_tfidf_bar_chart(tf_df1, tf_df2, top_n=10, label1='BBC', label2='New-York Times'):
    """Plot a horizontal bar chart of the top_n highest-scoring words of two corpora.

    Args:
        tf_df1 (DataFrame): score table of corpus #1 with the columns
            'word' and 'tf_score', sorted with the highest score first
        tf_df2 (DataFrame): score table of corpus #2, same layout
        top_n (int, optional): number of words to take from each table. Defaults to 10.
        label1 (str, optional): legend label for corpus #1. Defaults to 'BBC'.
        label2 (str, optional): legend label for corpus #2. Defaults to 'New-York Times'.
    """
    from matplotlib.lines import Line2D

    color_by_source = {'DF1': 'skyblue', 'DF2': 'salmon'}

    # Tag each table with its source before combining, so the tagging also
    # works when a table holds fewer than top_n rows.
    top_tf_df1 = tf_df1.head(top_n).assign(source='DF1')
    top_tf_df2 = tf_df2.head(top_n).assign(source='DF2')

    # Combine both tables and order the bars by score
    combined_df = pd.concat([top_tf_df1, top_tf_df2], ignore_index=True)
    combined_df = combined_df.sort_values(by='tf_score', ascending=False)

    # Bars are placed by position rather than by word: a word that is in the
    # top list of both corpora would otherwise share a single categorical slot
    # and the two bars would be drawn on top of each other.
    positions = list(range(len(combined_df)))
    colors = [color_by_source[source] for source in combined_df['source']]

    # Create a bar chart
    plt.figure(figsize=(12, 8))
    plt.barh(positions, combined_df['tf_score'], color=colors)
    plt.yticks(positions, combined_df['word'])
    plt.xlabel('TF Score')
    plt.title(f'Top {top_n} Words by TF Score')
    # Highest score at the top of the chart
    plt.gca().invert_yaxis()

    # Create a legend that maps the bar colors to the two corpora
    legend_elements = [Line2D([0], [0], color=color_by_source['DF1'], lw=4, label=label1),
                       Line2D([0], [0], color=color_by_source['DF2'], lw=4, label=label2)]
    plt.legend(handles=legend_elements, loc='lower right')

    plt.show()


plot_tfidf_bar_chart(bbc_tf, nyt_tf, top_n=5)

In [None]:
# merged_df = pd.merge(bbc_tf,nyt_tf, on='word', suffixes=('_bbc', '_nyt'))
# merged_df

### Word Cloud Frequency Chart

In [None]:
def plot_word_clouds(tf_df1, tf_df2, top_n=5):
    """Show side-by-side word clouds of the top_n highest-scoring words of two corpora.

    Args:
        tf_df1 (DataFrame): score table of the BBC corpus with the columns
            'word' and 'tf_score', sorted with the highest score first
        tf_df2 (DataFrame): score table of the New-York Times corpus, same layout
        top_n (int, optional): number of words per cloud. Defaults to 5.
    """
    panels = [
        (tf_df1, f'{top_n} Top Words in BBC'),
        (tf_df2, f'{top_n} Top Words in New-York Times'),
    ]

    # Build each cloud exactly once. The earlier version first generated a
    # 'Blues'/'Reds' colored pair and then immediately overwrote it with the
    # clouds below, so that first pair was never displayed.
    # NOTE(review): no mask is passed, so the contour settings presumably have
    # no visible effect - confirm against the wordcloud documentation.
    clouds = []
    for tf_df, _ in panels:
        top_tf_df = tf_df.head(top_n)
        word_freq = dict(zip(top_tf_df['word'], top_tf_df['tf_score']))
        clouds.append(
            WordCloud(width=800, height=400, background_color='white', contour_color='black', contour_width=10).generate_from_frequencies(word_freq)
        )

    # Plot word clouds, one subplot per corpus
    plt.figure(figsize=(14, 7))
    for position, (cloud, (_, title)) in enumerate(zip(clouds, panels), start=1):
        plt.subplot(1, 2, position)
        plt.imshow(cloud, interpolation='bilinear')
        plt.axis('off')
        plt.title(title)

    plt.show()


plot_word_clouds(bbc_tf, nyt_tf, top_n=5)

## Frequency Extraction using Word2Vec

Word2Vec is a type of neural network model used to learn vector representations of words from large text corpora. It operates in two main architectures: Continuous Bag of Words (CBOW) and Skip-gram. CBOW predicts a target word from its surrounding context words, while Skip-gram predicts context words from a given target word. These vector representations capture semantic meanings and relationships between words, making Word2Vec useful for tasks such as word similarity, sentiment analysis, and language translation.

Using Word2Vec to find the most frequent words in a corpus is not its primary function, as Word2Vec is designed to create word embeddings based on the context in which words appear rather than to count word frequencies. However, you can still retrieve the most frequent words from the vocabulary built during the training of a Word2Vec model.

### Explanation:
- Training the Model: The Word2Vec model is trained on the corpus to create word embeddings.
- Vocabulary and Counts: The key_to_index attribute of the model's wv (word vectors) object provides access to the vocabulary. The get_vecattr method retrieves the count of each word.
- Sorting and Displaying: The words are sorted by their counts in descending order, and the top N most frequent words are displayed.

This method leverages the internal vocabulary built during the Word2Vec training process to find the most frequent words in the corpus.

In [None]:
def get_top_n_frequent_words(token_list, n=10, vector_size=100, window=5, min_count=1, workers=4):
    """
    Train a Word2Vec model on the tokenized corpus and read the N most
    frequent words off the vocabulary counts it collected.

    Parameters:
    - token_list: List of documents, each a list of tokens exposing a `.text` attribute.
    - n: Number of top frequent words to return.
    - vector_size: Size of the word vectors.
    - window: Maximum distance between the current and predicted word within a sentence.
    - min_count: Ignores all words with a total frequency lower than this.
    - workers: Number of worker threads to train the model.

    Returns:
    - List of tuples (word, count) for the top N most frequent words.
    - Trained Word2Vec model
    """
    # Word2Vec is fed plain strings, so reduce every token to its text first
    sentences = [[token.text for token in doc] for doc in token_list]
    model = Word2Vec(sentences=sentences, vector_size=vector_size, window=window, min_count=min_count, workers=workers)

    # Pair every vocabulary word with the count stored on the model
    word_vectors = model.wv
    counted = [(word, word_vectors.get_vecattr(word, "count")) for word in word_vectors.key_to_index]

    # Highest count first, then keep the first n entries
    counted.sort(key=lambda pair: pair[1], reverse=True)
    return counted[:n], model


def print_top_n_frequent_words(token_list, top_n=5):
    """ Print the top_n most frequent words found via a Word2Vec vocabulary.

    Parameters:
        - token_list: List of documents, each a list of tokens exposing a `.text` attribute
        - top_n: number of words to print. Defaults to 5.

    Returns:
        - Trained Word2Vec model
    """
    frequent, trained_model = get_top_n_frequent_words(token_list, n=top_n)

    print(f"\nTop {top_n} Most Frequent Words:")
    for entry in frequent:
        word, count = entry
        print(f'Word: {word}, Count: {count}')

    return trained_model

In [None]:
w2v_model_bbc = print_top_n_frequent_words(bbc_tokens, top_n=15)

In [None]:
w2v_model_nyt = print_top_n_frequent_words(nyt_tokens, top_n=15)


## AutoEncoder Significance Extraction

An autoencoder is a type of neural network designed for unsupervised learning that compresses input data into a lower-dimensional representation and then reconstructs it. It consists of two main parts: an encoder that reduces the data's dimensionality and a decoder that reconstructs the original data from the compressed form. Autoencoders are used for tasks such as dimensionality reduction, feature learning, and anomaly detection.

While autoencoders are not typically used for identifying the most significant words in a corpus, it is possible to adapt them for this purpose with some modifications.


### Explanation:
- We start by tokenizing our corpus and training a Word2Vec model to get word embeddings.
- We create an autoencoder with an input layer, a hidden layer (encoding), and an output layer (decoding).
- The autoencoder is trained to reconstruct the word embeddings.
- After training, we use the autoencoder to reconstruct the embeddings and calculate the reconstruction error for each word.
- Words with higher reconstruction errors are considered potentially more significant, as they might be harder for the autoencoder to encode and decode accurately.
- We sort the words by their reconstruction error and print the `top_n` words with the highest error (10 by default).

In [None]:
def autoencoder_significance_analysis(w2v_model, tokenized_corpus, top_n=10):
    """Rank words by how poorly a small autoencoder reconstructs their embeddings.

    A single-hidden-layer autoencoder is trained on the Word2Vec vectors of
    the corpus vocabulary; the top_n words with the highest reconstruction
    error (mean squared error) are printed.

    Args:
        w2v_model: trained Word2Vec model whose ``wv`` holds the word vectors
        tokenized_corpus: list of documents, each a list of tokens exposing
            a ``.text`` attribute
        top_n (int, optional): number of words to print. Defaults to 10.
    """
    word_vectors = w2v_model.wv

    # Create word embeddings for each distinct word in the corpus
    word_embeddings = {}
    for doc in tokenized_corpus:
        for token in doc:
            text = token.text
            # The dict is keyed by text, so membership must be tested with the
            # text too; testing the token object never matched, so every
            # occurrence repeated the vector lookup.
            if text in word_embeddings:
                continue
            # Skip words the model holds no vector for instead of raising KeyError
            if text not in word_vectors.key_to_index:
                continue
            word_embeddings[text] = word_vectors[text]

    # Without embeddings there is no matrix to train on
    if not word_embeddings:
        print("No word embeddings available; nothing to analyse.")
        return

    # Convert word embeddings to a (words x vector size) matrix
    embedding_list = list(word_embeddings.values())
    embedding_matrix = np.array(embedding_list)

    # Define the autoencoder: input -> 32-unit ReLU encoding -> linear decoding
    input_dim = embedding_matrix.shape[1]
    encoding_dim = 32

    input_layer = Input(shape=(input_dim,))
    encoded = Dense(encoding_dim, activation='relu')(input_layer)
    decoded = Dense(input_dim, activation='linear')(encoded)

    autoencoder = Model(input_layer, decoded)
    autoencoder.compile(optimizer='adam', loss='mse')

    # Train the autoencoder to reproduce its own input
    autoencoder.fit(embedding_matrix, embedding_matrix, epochs=100, batch_size=16, shuffle=True, verbose=0)

    # Get the reconstructed embeddings
    reconstructed_embeddings = autoencoder.predict(embedding_matrix)

    # Calculate reconstruction error (mean squared error) for each word
    reconstruction_errors = np.mean(np.square(embedding_matrix - reconstructed_embeddings), axis=1)

    # Dict order matches the matrix row order, so words and errors line up
    word_errors = {word: error for word, error in zip(word_embeddings.keys(), reconstruction_errors)}

    # Sort words by reconstruction error (higher error might indicate more significant words)
    sorted_words = sorted(word_errors.items(), key=lambda x: x[1], reverse=True)

    # Print the top_n words with highest reconstruction error
    print(f"\nTop {top_n} potentially significant words based on reconstruction error:\n")
    for word, error in sorted_words[:top_n]:
        print(f"{word}: {error}")


In [None]:
autoencoder_significance_analysis(w2v_model_bbc, bbc_tokens)

In [None]:
autoencoder_significance_analysis(w2v_model_nyt, nyt_tokens)

##################################### SKIPPED SOME STUFF ( COMPARISON ) ############################################ 

##################################### SKIPPED SOME STUFF ( COMPARISON ) ############################################ 

##################################### SKIPPED SOME STUFF ( COMPARISON ) ############################################ 

##################################### SKIPPED SOME STUFF ( COMPARISON ) ############################################ 

##################################### SKIPPED SOME STUFF ( COMPARISON ) ############################################ 

# NER extraction

In [None]:
def show_ents(doc):
    """Print every named entity of a processed document.

    For each entity the text, its label and spaCy's explanation of that label
    are printed on one line; a notice is printed when there are no entities.

    Args:
        doc: processed document exposing an ``ents`` collection whose items
            have ``text`` and ``label_`` attributes
    """
    entities = doc.ents
    if not entities:
        print("No named entities found.")
        return

    for entity in entities:
        explanation = str(spacy.explain(entity.label_))
        print(f"Word: {entity.text: <35} NER: {entity.label_: <35} Explanation:" + explanation)
        
def print_NER(corpus):
    """Print the named entities of every processed document in ``corpus``.

    Args:
        corpus (iterable): processed documents, each passed to ``show_ents``
    """
    # Iterate the documents directly instead of indexing through range(len(...))
    for doc in corpus:
        show_ents(doc)
    
        

In [None]:
print_NER(bbc_processed_corpus)