In [64]:
import os
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold, GridSearchCV
from sklearn.metrics import roc_auc_score, make_scorer
from sklearn.preprocessing import LabelEncoder
from sklearn.pipeline import Pipeline

In [65]:
# Folder containing the review CSV files and the saved BERT embedding arrays
data_dir = 'data_reviews'

# BERT embeddings for the training and test reviews (stored as .npy arrays)
x_train_BERT, x_test_BERT = (
    np.load(os.path.join(data_dir, fname))
    for fname in ('x_train_BERT_embeddings.npy', 'x_test_BERT_embeddings.npy')
)

# Tabular inputs: training reviews, training labels, and test reviews
x_train_df, y_train_df, x_test_df = (
    pd.read_csv(os.path.join(data_dir, fname))
    for fname in ('x_train.csv', 'y_train.csv', 'x_test.csv')
)

In [66]:
# Sentiment targets and the originating website of every training review
train_labels = y_train_df['is_positive_sentiment'].values
review_sources = x_train_df['website_name'].values  # 'website_name' is assumed to hold the review source

# Row positions of the training reviews for each source ('amazon', 'imdb', 'yelp')
train_amazon_indices = np.nonzero(review_sources == 'amazon')[0]
train_imdb_indices = np.nonzero(review_sources == 'imdb')[0]
train_yelp_indices = np.nonzero(review_sources == 'yelp')[0]

# Per-source embeddings and labels, selected with the same row positions
# (assumes x_train_BERT rows are aligned with x_train_df rows -- TODO confirm)
x_train_amazon, y_train_amazon = (
    x_train_BERT[train_amazon_indices],
    train_labels[train_amazon_indices],
)
x_train_imdb, y_train_imdb = (
    x_train_BERT[train_imdb_indices],
    train_labels[train_imdb_indices],
)
x_train_yelp, y_train_yelp = (
    x_train_BERT[train_yelp_indices],
    train_labels[train_yelp_indices],
)

In [67]:
# Hyperparameter grid searched for every per-source Logistic Regression
param_grid = {
    'C': [0.01, 0.1, 1, 10, 100],  # Inverse regularization strength (smaller = stronger penalty)
    'penalty': ['l1', 'l2']         # Penalty type
}

# Base estimator; the liblinear solver accepts both 'l1' and 'l2' penalties from the grid
log_reg = LogisticRegression(solver='liblinear')

# Use AUROC as the model-selection metric.
# The built-in 'roc_auc' scorer evaluates continuous scores
# (decision_function / predict_proba). make_scorer(roc_auc_score) with its
# default arguments would instead pass hard 0/1 labels from predict() to
# roc_auc_score, which collapses the ROC curve to a single threshold and
# ranks hyperparameters by the wrong quantity.
scorer = 'roc_auc'
skf = StratifiedKFold(n_splits=5)

In [68]:
# Grid-search helper shared by every review source
def run_grid_search(x_train, y_train):
    """Fit a cross-validated hyperparameter search on one training subset.

    Wraps the module-level ``log_reg`` estimator in a ``GridSearchCV``
    configured with ``param_grid``, ``scorer`` and ``skf``, using all
    available cores (``n_jobs=-1``).

    Parameters
    ----------
    x_train : features passed straight to ``GridSearchCV.fit``.
    y_train : targets passed straight to ``GridSearchCV.fit``.

    Returns
    -------
    The fitted ``GridSearchCV`` instance.
    """
    search = GridSearchCV(
        log_reg,
        param_grid=param_grid,
        scoring=scorer,
        cv=skf,
        n_jobs=-1,
    )
    # fit() returns the search object itself, so this hands back the fitted search
    return search.fit(x_train, y_train)

# One independent hyperparameter search per review source, run in the order
# amazon -> imdb -> yelp
grid_amazon, grid_imdb, grid_yelp = (
    run_grid_search(features, targets)
    for features, targets in (
        (x_train_amazon, y_train_amazon),
        (x_train_imdb, y_train_imdb),
        (x_train_yelp, y_train_yelp),
    )
)

In [69]:
# Originating website of every test review
# (x_test_BERT and x_test_df come from the data-loading cell)
test_sources = x_test_df['website_name'].values

# Row positions of the test reviews for each source
test_amazon_indices = np.nonzero(test_sources == 'amazon')[0]
test_imdb_indices = np.nonzero(test_sources == 'imdb')[0]
test_yelp_indices = np.nonzero(test_sources == 'yelp')[0]

# Embedding rows for each source, selected with those row positions
x_test_amazon, x_test_imdb, x_test_yelp = (
    x_test_BERT[rows]
    for rows in (test_amazon_indices, test_imdb_indices, test_yelp_indices)
)


In [70]:
# Hard class predictions from the best estimator of each per-source search
predictions_amazon, predictions_imdb, predictions_yelp = (
    search.best_estimator_.predict(features)
    for search, features in (
        (grid_amazon, x_test_amazon),
        (grid_imdb, x_test_imdb),
        (grid_yelp, x_test_yelp),
    )
)

# Probabilities rather than class labels: column 1 of predict_proba
# (presumably the positive-sentiment class -- confirm against classes_)
probabilities_amazon, probabilities_imdb, probabilities_yelp = (
    search.best_estimator_.predict_proba(features)[:, 1]
    for search, features in (
        (grid_amazon, x_test_amazon),
        (grid_imdb, x_test_imdb),
        (grid_yelp, x_test_yelp),
    )
)


In [71]:
# Reassemble the per-source probabilities in the original test-row order.
# Concatenating the amazon/imdb/yelp blocks only lines up with x_test.csv when
# that file already happens to be grouped in exactly that order; writing each
# block back through its saved row indices is correct for any ordering.
all_predictions = np.full(len(x_test_BERT), np.nan)
all_predictions[test_amazon_indices] = probabilities_amazon
all_predictions[test_imdb_indices] = probabilities_imdb
all_predictions[test_yelp_indices] = probabilities_yelp

# Rows whose website_name is none of the three handled sources never receive a
# prediction; fail loudly rather than write a file with missing entries.
n_predicted = len(test_amazon_indices) + len(test_imdb_indices) + len(test_yelp_indices)
if n_predicted != len(all_predictions):
    raise ValueError(
        f"{len(all_predictions) - n_predicted} test rows have a website_name other than "
        "'amazon', 'imdb' or 'yelp' and received no prediction"
    )

np.savetxt('yproba1_test(part2).txt', all_predictions, delimiter='\n', fmt='%.6f')  # One probability per line, in test-row order