In [7]:
import pandas as pd
import fasttext
import numpy as np

# Load the tokenized Reddit comments
df = pd.read_csv('Reddit_Tokenization.csv')

# 'lemmatized_comment' is expected to hold space-separated lemmatized tokens.
# read_csv parses empty cells as NaN (a float with no .split()), so missing
# comments are first replaced by '' and therefore yield an empty token list
# instead of raising AttributeError.
df['tokens'] = df['lemmatized_comment'].fillna('').astype(str).str.split()

# Load a pre-trained FastText model (e.g., English)
model = fasttext.load_model('cc.en.300.bin')

# Get word vectors for each token (one vector per token; an empty token list
# produces an empty list of vectors)
df['word_vectors'] = df['tokens'].apply(
    lambda tokens: [model.get_word_vector(token) for token in tokens]
)

# Function to average word vectors
def average_word_vectors(word_vectors, dim=None):
    """Return the element-wise mean of one document's token vectors.

    Args:
        word_vectors: A list of equal-length 1-D vectors, or a 2-D array with
            one row per token. May be empty.
        dim: Length of the zero vector returned for an empty document. When
            omitted, the module-level FastText ``model`` is asked for its
            dimension.

    Returns:
        A 1-D numpy array: the mean over axis 0, or all zeros when
        ``word_vectors`` is empty.
    """
    # Use len() rather than truthiness: `if word_vectors` raises ValueError
    # ("truth value of an array is ambiguous") when given an ndarray.
    if len(word_vectors) == 0:
        if dim is None:
            dim = model.get_dimension()
        # float32 to match the dtype fastText's get_word_vector is documented
        # to return, so empty and non-empty documents share one dtype.
        return np.zeros(dim, dtype=np.float32)
    return np.mean(word_vectors, axis=0)

# Collapse each document's per-token vectors into a single averaged vector
per_token_vectors = df['word_vectors']
df['document_vector'] = per_token_vectors.apply(average_word_vectors)

# Show each comment next to its document vector
preview_columns = ['comment', 'document_vector']
print(df[preview_columns])

                                                 comment  \
0      subsection retarded hungarians ohh boy brace l...   
1      hiii just got work Foundation and grounding ma...   
2                        wow guess soyboys every country   
3      owen benjamins soyboy song goes every country ...   
4       yall hear sumn means live small town rn for w...   
...                                                  ...   
22206          op stop faggot post videos next time hard   
22207  minute long video top hate champagne goes need...   
22208  clue whos ecelebs are point time  need get alo...   
22209                        didn’t insult you insult me   
22210                                         living lie   

                                         document_vector  
0      [0.014043219, -0.01809359, 0.017145459, 0.0806...  
1      [-0.0030388932, -0.035133556, 0.020659983, 0.0...  
2      [0.017362628, 0.005587179, 0.0297773, 0.109146...  
3      [0.018085241, 0.0011954829, 2.895947

In [8]:
# Serialize every document vector as one comma-separated string so it fits in
# a single CSV cell
df['document_vector_flat'] = df['document_vector'].apply(
    lambda vec: ','.join(str(component) for component in vec)
)

# Write the comments, labels and flattened vectors to disk (row index omitted)
export_columns = ['comment', 'hate_speech', 'lemmatized_comment', 'document_vector_flat']
df.to_csv('encoded_dataset.csv', columns=export_columns, index=False)

# Preview the first rows of the flattened vectors
print(df[['comment', 'document_vector_flat']].head())

                                             comment  \
0  subsection retarded hungarians ohh boy brace l...   
1  hiii just got work Foundation and grounding ma...   
2                    wow guess soyboys every country   
3  owen benjamins soyboy song goes every country ...   
4   yall hear sumn means live small town rn for w...   

                                document_vector_flat  
0  0.014043219,-0.01809359,0.017145459,0.08062436...  
1  -0.0030388932,-0.035133556,0.020659983,0.07383...  
2  0.017362628,0.005587179,0.0297773,0.109146975,...  
3  0.018085241,0.0011954829,2.8959475e-05,0.07601...  
4  0.023993038,-0.00060867134,0.005239945,0.05989...  
