In [7]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score
from sklearn.preprocessing import LabelEncoder
from sklearn.multioutput import MultiOutputRegressor
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from imblearn.over_sampling import SMOTE
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.base import BaseEstimator, TransformerMixin
import re
from gensim.models import Word2Vec

In [8]:
# Download NLTK data
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

# Load dataset
# Assume your dataset is loaded into a pandas dataframe named `df`
# The dataframe should have two columns: 'text' and 'type'
# df = pd.read_csv('your_dataset.csv') 

file_path = "C:/Users/Ryo/OneDrive/Desktop/Master Thesis/study/study1/raw/sample_forFeeding.xlsx"
df = pd.read_excel(file_path, sheet_name='with_label')
df = df.rename(columns = {"type" : "type",
                          "posts_filtered" : "text"})

# Initialize Lemmatizer
lemmatizer = WordNetLemmatizer()

# Preprocess text
stop_words = set(stopwords.words('english'))

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Ryo\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Ryo\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Ryo\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [9]:
def preprocess_text(text):
    # Convert to lowercase
    text = text.lower()
    # Remove links
    text = re.sub(r"http\S+|www\S+|https\S+", '', text, flags=re.MULTILINE)
    # Remove punctuation
    text = re.sub(r'[^\w\s]', '', text)
    # Tokenize
    tokens = word_tokenize(text)
    # Remove stopwords and lemmatize
    tokens = [lemmatizer.lemmatize(word) for word in tokens if word not in stop_words]
    return " ".join(tokens)

In [10]:
df['text'] = df['text'].apply(preprocess_text)

# Encode labels
label_encoder = LabelEncoder()
df['type'] = label_encoder.fit_transform(df['type'])

# Split dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(df['text'], df['type'], test_size=0.3, random_state=42, stratify=df['type'])

# Vectorize text using Word2Vec
tokenized_train = [text.split() for text in X_train]
tokenized_test = [text.split() for text in X_test]

word2vec_model = Word2Vec(sentences=tokenized_train, vector_size=100, window=5, min_count=1, workers=4)
word2vec_model.train(tokenized_train, total_examples=len(tokenized_train), epochs=50)

(13216510, 14624000)

In [13]:
class Word2VecEstimator(BaseEstimator, TransformerMixin):
    def __init__(self, vector_size=100, window=5, min_count=1, workers=4, epochs=50):
        self.vector_size = vector_size
        self.window = window
        self.min_count = min_count
        self.workers = workers
        self.epochs = epochs
        self.model = None

    def fit(self, X, y=None):
        self.model = Word2Vec(sentences=X, vector_size=self.vector_size, window=self.window, 
                              min_count=self.min_count, workers=self.workers)
        self.model.train(X, total_examples=len(X), epochs=self.epochs)
        return self

    def transform(self, X):
        return np.array([self.get_average_word2vec(s) for s in X])

    def get_average_word2vec(self, tokens_list):
        if len(tokens_list) < 1:
            return np.zeros(self.vector_size)
        vec = []
        for token in tokens_list:
            try:
                vec.append(self.model.wv[token])
            except KeyError:
                continue
        return np.mean(vec, axis=0) if vec else np.zeros(self.vector_size)


# Define parameter grid
param_grid = {
    'vector_size': [50, 100, 200],
    'window': [3, 5, 7],
    'min_count': [1, 2, 3],
    'epochs': [30, 50, 70]
}

In [14]:
# Create a dummy target variable (not used, but required by GridSearchCV)
dummy_y = np.zeros(len(tokenized_train))

# Perform grid search
grid_search = GridSearchCV(Word2VecEstimator(), param_grid, cv=3, n_jobs=-1, verbose=2, scoring='neg_mean_squared_error')
grid_search.fit(tokenized_train, dummy_y)  # Use dummy_y instead of tokenized_train

# Get best parameters
best_params = grid_search.best_params_
print("Best parameters:", best_params)

# Train Word2Vec model with best parameters
word2vec_model = Word2Vec(sentences=tokenized_train, vector_size=best_params['vector_size'], 
                          window=best_params['window'], min_count=best_params['min_count'], workers=4)
word2vec_model.train(tokenized_train, total_examples=len(tokenized_train), epochs=best_params['epochs'])

Fitting 3 folds for each of 81 candidates, totalling 243 fits


 nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan
 nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan
 nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan
 nan nan nan nan nan nan nan nan nan]


Best parameters: {'epochs': 30, 'min_count': 1, 'vector_size': 50, 'window': 3}


(7928675, 8774400)

In [15]:
# Define function to get average word vectors
def get_average_word2vec(tokens_list, vector, k=best_params['vector_size']):
    if len(tokens_list) < 1:
        return np.zeros(k)
    vec = []
    for token in tokens_list:
        try:
            vec.append(vector[token])
        except KeyError:
            continue
    return np.mean(vec, axis=0) if vec else np.zeros(k)

In [16]:
# Create word vectors for train and test sets
X_train_word2vec = [get_average_word2vec(s, word2vec_model.wv) for s in tokenized_train]
X_test_word2vec = [get_average_word2vec(s, word2vec_model.wv) for s in tokenized_test]

In [17]:
#X_train_word2vec = [get_average_word2vec(s, word2vec_model.wv) for s in tokenized_train]
#X_test_word2vec = [get_average_word2vec(s, word2vec_model.wv) for s in tokenized_test]

# Handle imbalanced data with SMOTE
smote = SMOTE(random_state=42)
X_train_smote, y_train_smote = smote.fit_resample(X_train_word2vec, y_train)

# Define models
models = {
    'Logistic Regression': LogisticRegression(max_iter=1000),
    'SVM': SVC(kernel='linear'),
    'Random Forest': RandomForestClassifier(),
    'XGBoost': XGBClassifier(use_label_encoder=False, eval_metric='mlogloss'),
    'CatBoost': CatBoostClassifier(verbose=0)
}

In [18]:
# Train and evaluate models
results = {}
for model_name, model in models.items():
    model.fit(X_train_smote, y_train_smote)
    y_pred = model.predict(X_test_word2vec)
    f1 = f1_score(y_test, y_pred, average='weighted')
    results[model_name] = f1

# Print results
for model_name, f1 in results.items():
    print(f"{model_name}: F1 Score = {f1:.4f}")

Logistic Regression: F1 Score = 0.1030
SVM: F1 Score = 0.0967
Random Forest: F1 Score = 0.0755
XGBoost: F1 Score = 0.1132
CatBoost: F1 Score = 0.0757
