# Word2Vec (Gensim)

In [None]:
import pandas as pd
from gensim.models.word2vec import Word2Vec
import ast  # Used to convert strings to lists

# Load the pre-normalized dataset from disk.
df = pd.read_csv('normalized_data.csv')

# Each 'normalization' cell is a stringified Python literal; parse every one
# back into a real Python object so the corpus is a list of token lists.
corpus = [ast.literal_eval(row) for row in df['normalization']]

In [None]:
# Train the embeddings on the tokenized corpus and keep track of the loss.
model = Word2Vec(
    corpus,
    vector_size=250,
    window=5,
    min_count=5,
    workers=10,
    compute_loss=True,
)

# Report the loss value recorded by the model during training.
training_loss = model.get_latest_training_loss()
print(training_loss)

In [None]:
# Look up the three stems used for the analogy. The variable names follow the
# classic "king - man + woman" pattern, but the stems are 'creat', 'financi'
# and 'decis'. ('harri' would fill the "queen" slot if it is ever needed.)
king_vector, man_vector, woman_vector = (
    model.wv[stem] for stem in ('creat', 'financi', 'decis')
)

# Combine the vectors: creat - financi + decis.
vector_relation = king_vector - man_vector + woman_vector

# Retrieve the five vocabulary entries closest to the combined vector.
similar_words = model.wv.similar_by_vector(vector_relation, topn=5)

# Show the (word, similarity) pairs.
print(similar_words)

In [None]:
def most_similar(w2v_model, words, topn=20):
    """Build a side-by-side table of the nearest neighbours of several words.

    Parameters:
    w2v_model -- object exposing ``wv.most_similar(word, topn=...)``
    words -- iterable of query words
    topn -- number of neighbours to fetch per word (default is 20)

    Returns:
    A DataFrame with two columns per found word: one named after the word
    (holding its neighbours) and one named 'cos' (holding the scores).
    Words missing from the vocabulary are reported and skipped; the result
    is an empty DataFrame when no word is found.
    """
    frames = []
    for word in words:
        try:
            neighbours = w2v_model.wv.most_similar(word, topn=topn)
        except KeyError:
            # Only an out-of-vocabulary lookup is expected here; any other
            # error is a real problem and must not be silently swallowed.
            print(word, "not found in Word2Vec model!")
            continue
        frames.append(pd.DataFrame(neighbours, columns=[word, 'cos']))

    if not frames:
        return pd.DataFrame()
    # Concatenate once instead of re-copying the growing frame on every word.
    return pd.concat(frames, axis=1)

In [None]:
# Tabulate the nearest neighbours of a handful of stems of interest.
most_similar(
    model,
    [
        'trump', 'harri', 'elect', 'presid', 'maga', 'tweet',
        'cycl', 'ground', 'head', 'strategi', 'us',
    ],
)

In [None]:
import pandas as pd
from gensim.models import Word2Vec
from sklearn.manifold import TSNE
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.cm as cm

# Stems whose neighbourhoods will be plotted as separate clusters.
keys = ['trump', 'elect', 'presid', 'maga', 'cycl', 'ground', 'head', 'strategi', 'us', 'misinform', 'result', 'news']

# For every key stem, collect the 30 most similar vocabulary entries ...
word_clusters = [
    [neighbour for neighbour, _score in model.wv.most_similar(key, topn=30)]
    for key in keys
]

# ... and the embedding of each of those entries, stacked into one array of
# shape (number of keys, neighbours per key, embedding dimension).
embedding_clusters = np.array(
    [[model.wv[neighbour] for neighbour in cluster] for cluster in word_clusters]
)

# Flatten the clusters, project every vector to 2-D with t-SNE, then restore
# the per-key grouping.
n, m, k = embedding_clusters.shape
# NOTE(review): newer scikit-learn releases rename `n_iter` to `max_iter`;
# confirm against the installed version.
tsne_model_en_2d = TSNE(perplexity=15, n_components=2, init='pca', n_iter=3500, random_state=32)
flat_embeddings = embedding_clusters.reshape(n * m, k)
embeddings_en_2d = np.array(tsne_model_en_2d.fit_transform(flat_embeddings)).reshape(n, m, 2)

In [None]:
from sklearn.manifold import TSNE
import numpy as np

# Re-run the 2-D t-SNE projection of the keyword clusters (same settings as
# the cell that builds `embedding_clusters`).
embedding_clusters = np.array(embedding_clusters)
n, m, k = embedding_clusters.shape

# NOTE(review): newer scikit-learn releases rename `n_iter` to `max_iter`;
# confirm against the installed version.
tsne_model_en_2d = TSNE(perplexity=15, n_components=2, init='pca', n_iter=3500, random_state=32)

# Flatten to (n * m, k) for t-SNE, then regroup the 2-D points per keyword.
flat_embeddings = embedding_clusters.reshape(n * m, k)
projected = tsne_model_en_2d.fit_transform(flat_embeddings)
embeddings_en_2d = np.array(projected).reshape(n, m, 2)

In [None]:
import matplotlib.pyplot as plt
import matplotlib.cm as cm
%matplotlib inline


def tsne_plot_similar_words(title, labels, embedding_clusters, word_clusters, a, filename=None):
    """Scatter-plot several clusters of 2-D word embeddings, one colour each.

    Parameters:
    title -- title shown above the plot
    labels -- one legend label per cluster
    embedding_clusters -- one 2-D array per cluster; column 0 is used as x
                          and column 1 as y
    word_clusters -- one sequence of words per cluster, annotated next to the
                     corresponding points
    a -- alpha (opacity) of the scatter markers
    filename -- if given, the figure is also saved there as a PNG
    """
    plt.figure(figsize=(16, 9))
    colors = cm.rainbow(np.linspace(0, 1, len(labels)))
    for label, embeddings, words, color in zip(labels, embedding_clusters, word_clusters, colors):
        x = embeddings[:, 0]
        y = embeddings[:, 1]
        # Pass the colour as a single-row 2-D array: a bare RGBA sequence is
        # ambiguous for `c` and would be treated as colormap values whenever
        # a cluster happens to contain exactly four points.
        plt.scatter(x, y, c=np.atleast_2d(color), alpha=a, label=label)
        for i, word in enumerate(words):
            plt.annotate(word, alpha=0.5, xy=(x[i], y[i]), xytext=(5, 2),
                         textcoords='offset points', ha='right', va='bottom', size=8)
    plt.legend(loc=4)
    plt.title(title)
    plt.grid(True)
    if filename:
        plt.savefig(filename, format='png', dpi=150, bbox_inches='tight')
    plt.show()


# Plot the keyword neighbourhoods and save the figure next to the notebook.
tsne_plot_similar_words(
    title='Similar words from tweets',
    labels=keys,
    embedding_clusters=embeddings_en_2d,
    word_clusters=word_clusters,
    a=0.7,
    filename='similar_words.png',
)

In [None]:
# Preview the first ten vocabulary entries, in the model's index order.
for index, word in enumerate(model.wv.index_to_key[:10]):
    print(f"word #{index}/{len(model.wv.index_to_key)} is {word}")

In [None]:
# Show the ten nearest neighbours of the stem "trump" (topn=10 is the
# library default, spelled out here for clarity).
model.wv.most_similar("trump", topn=10)

In [None]:
import numpy as np
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
import matplotlib.cm as cm
import random

# Every vocabulary entry, in the iteration order of key_to_index.
words_ak = list(model.wv.key_to_index)

# The embedding of each entry, stacked row by row in the same order.
embeddings_ak = np.array([model.wv[entry] for entry in words_ak])

# Project the whole vocabulary to 2-D with t-SNE.
# NOTE(review): newer scikit-learn releases rename `n_iter` to `max_iter`;
# confirm against the installed version.
tsne_ak_2d = TSNE(perplexity=30, n_components=2, init='pca', n_iter=3500, random_state=32)
embeddings_ak_2d = tsne_ak_2d.fit_transform(embeddings_ak)

# Define a function to plot the t-SNE results
def tsne_plot_2d(label, embeddings, words=None, a=1, n_labels=50, filename="hhh.png"):
    """Scatter-plot 2-D embeddings and annotate a random subset of the points.

    Parameters:
    label -- legend label for the scatter series
    embeddings -- 2-D array; column 0 is used as x and column 1 as y
    words -- sequence of words, where words[i] labels point i (default: none)
    a -- alpha (opacity) of the scatter markers (default is 1)
    n_labels -- maximum number of randomly chosen points to annotate
                (default is 50)
    filename -- path the figure is saved to as a PNG; pass None or '' to
                skip saving (default is "hhh.png")
    """
    # Avoid a shared mutable default argument.
    words = [] if words is None else words

    plt.figure(figsize=(16, 9))
    colors = cm.rainbow(np.linspace(0, 1, 1))
    x = embeddings[:, 0]
    y = embeddings[:, 1]
    plt.scatter(x, y, c=colors, alpha=a, label=label)

    # Annotate a random subset of points. The sample size is capped at the
    # number of available words, because random.sample raises ValueError
    # when asked for more items than the population holds.
    sample_size = min(n_labels, len(words))
    selected_indices = random.sample(range(len(words)), sample_size)
    for i in selected_indices:
        plt.annotate(words[i], alpha=0.7, xy=(x[i], y[i]), xytext=(10, 4),
                     textcoords='offset points', ha='right', va='bottom', size=12)

    plt.legend(loc=4)
    plt.grid(True)
    if filename:
        plt.savefig(filename, format='png', dpi=450, bbox_inches='tight')  # Save the plot as an image
    plt.show()  # Display the plot

# Plot the 2-D projection of the whole vocabulary with faint markers.
tsne_plot_2d(
    label='All key words',
    embeddings=embeddings_ak_2d,
    words=words_ak,
    a=0.1,
)

## K-Means Clustering

In [None]:
import pandas as pd  # Import pandas library for data processing and DataFrame operations
import numpy as np  # Import NumPy library for numerical computations
from sklearn.cluster import KMeans  # Import KMeans from sklearn for clustering analysis

def word_cluster(wv, n_clusters=80, random_state=None, output_path='word_cluster_.xlsx'):
    """
    Perform KMeans clustering on L2-normalized word vectors and save the result to an Excel file.

    Parameters:
    wv -- Word vector model exposing `vectors` (2-D array, one row per word)
          and `index_to_key` (words in the same row order)
    n_clusters -- Number of clusters to form (default is 80)
    random_state -- Seed forwarded to KMeans; set it for reproducible
                    clusters (default is None)
    output_path -- Excel file the result is written to
                   (default is 'word_cluster_.xlsx')

    Returns:
    DataFrame with columns 'word' and 'label', sorted by cluster label.
    """
    vectors = np.asarray(wv.vectors)

    # Calculate the L2 norm (magnitude) of each word vector, kept as a column
    norms = np.linalg.norm(vectors, axis=1, keepdims=True)

    # An all-zero vector would otherwise divide by zero and feed NaN to KMeans
    norms[norms == 0] = 1

    # Normalize into a separate array. Writing the result back to wv.vectors
    # would silently change the caller's model, and any norms the model has
    # already cached would no longer match the stored vectors.
    unit_vectors = vectors / norms

    # Perform KMeans clustering and get the labels for each word
    labels = KMeans(n_clusters=n_clusters, random_state=random_state).fit(unit_vectors).labels_

    # Create a DataFrame combining words and their corresponding cluster labels
    df = pd.DataFrame(list(zip(wv.index_to_key, labels)), columns=['word', 'label'])

    # Sort the DataFrame by cluster labels
    df.sort_values(by='label', inplace=True)

    # Save the results to an Excel file without using the encoding parameter
    df.to_excel(output_path, index=False)

    return df

# Cluster the trained model's word vectors into 80 groups and export them.
word_cluster(model.wv, n_clusters=80)