In [None]:
# Word Tokenization with NLTK
import pandas as pd
import nltk
from nltk.tokenize import word_tokenize

# Ensure text column is of type string. Missing values are replaced with an
# empty string first: astype(str) on its own turns NaN into the literal text
# 'nan', which would then be tokenized as if it were a real word.
train_data['text'] = train_data['text'].fillna('').astype(str)

# Download required NLTK resources (if not already downloaded).
# Recent NLTK releases load the tokenizer from 'punkt_tab' while older ones use
# 'punkt', so both are requested; by default download() reports an unknown
# resource name instead of raising.
for nltk_resource in ('punkt', 'punkt_tab'):
    nltk.download(nltk_resource)

# Apply word tokenization to each text entry in the DataFrame
train_data['tokenized_text'] = train_data['text'].apply(word_tokenize)

# Preview the first rows (with the new tokenized_text column) instead of
# dumping the entire DataFrame
print(train_data.head())


Embedding

In [None]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer

# Remove, directly on train_data itself, every row whose 'text' entry is missing
train_data.dropna(axis=0, how='any', subset=['text'], inplace=True)

# Function to compute TF-IDF embeddings
def compute_tfidf_embeddings(corpus, max_features=5000):
    """Fit a TfidfVectorizer on ``corpus`` and return the transformed corpus.

    Args:
        corpus: Collection of raw text documents to fit on and transform.
        max_features: Forwarded to ``TfidfVectorizer(max_features=...)``.
            Defaults to 5000, the value that used to be hard-coded here.

    Returns:
        Whatever ``TfidfVectorizer.fit_transform(corpus)`` returns.

    Note:
        The fitted vectorizer is discarded, so unseen documents cannot be
        projected into the same feature space through this helper.
    """
    vectorizer = TfidfVectorizer(max_features=max_features)
    return vectorizer.fit_transform(corpus)

# Pull the documents out of the 'text' column and vectorize them with the
# TF-IDF helper
text_data = train_data['text'].values
tfidf_embeddings = compute_tfidf_embeddings(corpus=text_data)

# Show the TF-IDF result
print(tfidf_embeddings)


In [None]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Step 1: TF-IDF Encoding
def compute_tfidf_embeddings(corpus, max_features=5000):
    """Fit a TfidfVectorizer on ``corpus`` and return the transformed corpus.

    Args:
        corpus: Collection of raw text documents to fit on and transform.
        max_features: Forwarded to ``TfidfVectorizer(max_features=...)``.
            Defaults to 5000, the value that used to be hard-coded here.

    Returns:
        Whatever ``TfidfVectorizer.fit_transform(corpus)`` returns.

    Note:
        The fitted vectorizer is discarded, so unseen documents cannot be
        projected into the same feature space through this helper.
    """
    vectorizer = TfidfVectorizer(max_features=max_features)
    return vectorizer.fit_transform(corpus)

# Extract text and target variable
texts = train_data['text'].values
target = train_data['hd'].values

# Step 2: Split the RAW texts into training and testing sets before any
# vectorization. Fitting TF-IDF on the full corpus first would let the
# vocabulary and IDF weights be learned from test documents too, leaking
# test-set information into the training features.
texts_train, texts_test, y_train, y_test = train_test_split(
    texts, target, test_size=0.2, random_state=42
)

# Fit TF-IDF on the training texts only, then reuse the fitted vectorizer to
# transform the held-out texts into the same feature space.
tfidf_vectorizer = TfidfVectorizer(max_features=5000)
X_train = tfidf_vectorizer.fit_transform(texts_train)
X_test = tfidf_vectorizer.transform(texts_test)

# Step 3: Train Random Forest model
random_forest_model = RandomForestClassifier(random_state=42)
random_forest_model.fit(X_train, y_train)

# Step 4: Predictions and Evaluation
y_pred = random_forest_model.predict(X_test)

# Calculate evaluation metrics (precision/recall/F1 are averaged with
# average='weighted')
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted')
recall = recall_score(y_test, y_pred, average='weighted')
f1 = f1_score(y_test, y_pred, average='weighted')

# Print evaluation metrics
print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1 Score: {f1:.4f}")


In [None]:
import pandas as pd
import numpy as np
from gensim.models import Word2Vec
from nltk.tokenize import word_tokenize

# Extract text data from the training DataFrame
text_data = train_data['text'].values

# Replace NaN values with empty strings so every entry handed to the
# tokenizer is a string
text_data = np.where(pd.isnull(text_data), '', text_data)

# Tokenize the text data
tokenized_texts = [word_tokenize(text) for text in text_data]

# Train a Word2Vec model on the tokenized texts.
# Training is stochastic, so the seed is pinned to the same value (42) used
# for random_state elsewhere in this notebook.
# NOTE(review): gensim documents that a fully deterministic run also needs
# workers=1 (and a fixed PYTHONHASHSEED); workers=4 is kept here for speed —
# confirm whether exact reproducibility is required.
word2vec_model = Word2Vec(
    sentences=tokenized_texts,
    vector_size=100,
    window=5,
    min_count=1,
    workers=4,
    seed=42,
)

# Function to compute average Word2Vec embeddings for each document
def average_word2vec(tokens, model, vocab, vector_dim):
    """Return the mean word vector of the tokens that appear in ``vocab``.

    Args:
        tokens: Iterable of tokens for one document.
        model: Object whose ``wv[token]`` lookup yields the token's vector.
        vocab: Container used for the ``token in vocab`` membership test.
        vector_dim: Length of the zero vector the accumulation starts from.

    Returns:
        The element-wise average of the vectors of all in-vocabulary tokens,
        or the untouched zero vector of length ``vector_dim`` when no token
        is in ``vocab``.
    """
    running_total = np.zeros((vector_dim,), dtype="float32")
    known_tokens = [tok for tok in tokens if tok in vocab]

    # Accumulate one vector at a time, in token order
    for tok in known_tokens:
        running_total = running_total + model.wv[tok]

    # Nothing matched: hand back the zero vector rather than dividing by zero
    if not known_tokens:
        return running_total
    return running_total / len(known_tokens)

# Generate average embeddings for each text in the training set.
# The vector width is read from the trained model's keyed vectors instead of
# repeating the literal 100, so it cannot drift from the vector_size the
# model was trained with.
vocabulary_set = set(word2vec_model.wv.index_to_key)
embedding_dim = word2vec_model.wv.vector_size
text_embeddings = np.array([
    average_word2vec(tokens, word2vec_model, vocabulary_set, embedding_dim)
    for tokens in tokenized_texts
])

# Print the resulting embeddings
print(text_embeddings)


In [None]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder

# Display the first two rows of the DataFrame and its info
print(train_data.head(2))
train_data.info()

# Ensure 'label' column is treated as string type. The converted values are
# written back to 'label' itself; assigning them to 'text' would overwrite the
# documents with label strings.
train_data['label'] = train_data['label'].astype(str)

# Initialize OneHotEncoder with dense output, dropping the first category.
# Newer scikit-learn releases name the dense/sparse switch `sparse_output`;
# older ones only accept `sparse`. An unknown keyword raises TypeError at
# construction, so fall back to the older spelling in that case.
try:
    encoder = OneHotEncoder(sparse_output=False, drop='first')
except TypeError:
    encoder = OneHotEncoder(sparse=False, drop='first')

# Fit and transform the 'label' column to one-hot encoded format
encoded_labels = encoder.fit_transform(train_data[['label']])

# Print the one-hot encoded array
print(encoded_labels)
