In [114]:
import pandas as pd
import pickle
import dvc.api

from dvclive import Live
from tqdm import tqdm

In [115]:
def add_features(procd_df, tfidf_corpus, topics_dist, bert_embeddings=None):
    from textblob import TextBlob

     # Add review length
    procd_df['review_length'] = procd_df['review'].apply(lambda x: len(x.split()))

    # Add sentiment score
    procd_df['sentiment_score'] = procd_df['review'].apply(lambda x: TextBlob(x).sentiment.polarity)

    # Assert that the length of the features are equal to the length of the data
    assert (len(procd_df) == len(tfidf_corpus) == len(topics_dist)), \
        print(f"""procd_df: {len(procd_df)} \n
              tfidf_corpus: {len(tfidf_corpus)} \n  
              topics_dist: {len(topics_dist)}""")
    
    # Add tfidf of reviews
    procd_df['tfidf'] = tfidf_corpus

    # Add topic distribution of revies
    procd_df = pd.concat([procd_df, topics_dist], axis=1)

    # Add bert for reviews
    #if bert_embeddings:
    #    procd_df['bert'] = bert_embeddings
    
    return procd_df


In [123]:
def predict_rating(X_train, X_test, y_train, y_test, features, model, params):
    import numpy as np
    from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler
    from scipy.sparse import csr_matrix, vstack, hstack
    from sklearn.linear_model import LinearRegression
    from sklearn.ensemble import RandomForestRegressor
    from sklearn.svm import SVR
    from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
    from tqdm import tqdm

    # Scale the features
    final_features_csr_train = []
    final_features_csr_test = []
    for feature in features:
        scaler = StandardScaler(with_mean=False)
        if feature == 'tfidf':
            #scaler = MinMaxScaler()
            # Determine the maximum index across both training and test sets
            max_index = max(
                max(idx for vec in X_train[feature] for idx, _ in vec),
                max(idx for vec in X_test[feature] for idx, _ in vec)
            )

            batch_size = 1000

            # Function to convert a batch of Gensim-style TF-IDF vectors to a CSR matrix
            def gensim_to_csr(batch, num_terms):
                data, rows, cols = [], [], []
                for i, doc in enumerate(batch):
                    for term_id, value in doc:
                        rows.append(i)
                        cols.append(term_id)
                        data.append(value)
                return csr_matrix((data, (rows, cols)), shape=(len(batch), num_terms))
            

            # Scale and convert the TF-IDF vectors for train and test datasets
            for data in [X_train, X_test]:
                n_batches = int(np.ceil(len(data) / batch_size))
                scaled_csr_matrices = []

                for i in tqdm(range(n_batches), total=n_batches, desc='Scaling tfidf'):
                    start_idx = i * batch_size
                    end_idx = (i + 1) * batch_size
                    batch = data.iloc[start_idx:end_idx]
                    
                    # Convert the batch to a CSR matrix
                    batch_csr_matrix = gensim_to_csr(batch[feature], max_index + 1)

                    if data is X_train:
                        # Use toarray() as MinMaxScaler's partial_fit doesn't support sparse matrix
                        scaler.partial_fit(batch_csr_matrix.toarray())
                        scaled_batch = scaler.transform(batch_csr_matrix)
                    else:
                        scaled_batch = scaler.transform(batch_csr_matrix)

                    scaled_csr_matrices.append(scaled_batch)

                # Vertically stack the CSR matrices to get the final scaled matrix
                final_csr_matrix = vstack(scaled_csr_matrices)
                if data is X_train:
                    final_features_csr_train.append(final_csr_matrix)
                else:
                    final_features_csr_test.append(final_csr_matrix)
        else:
            final_features_csr_train.append(csr_matrix(scaler.fit_transform(X_train[feature].values.reshape(-1, 1))))
            final_features_csr_test.append(csr_matrix(scaler.transform(X_test[feature].values.reshape(-1, 1))))
            #X_test.loc[:, feature] = scaler.transform(X_test[feature].values.reshape(-1, 1))

    X_train = hstack(final_features_csr_train)
    X_test = hstack(final_features_csr_test)
    # Initialize the model
    if model == 'linear':
        reg = LinearRegression()
    elif model == 'randomforest':
        reg = RandomForestRegressor()
    elif model == 'svm':
        reg = SVR()
    
    # Train the model
    reg.fit(X_train, y_train)

    # Predict the ratings
    y_pred = reg.predict(X_test)

    # Evaluate the model
    mse = mean_squared_error(y_test, y_pred)
    mae = mean_absolute_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)

    # Log the metrics
    #print(f"""MSE: {mse} \n
    #        MAE: {mae} \n
    #        R2: {r2}""")

    return mse, mae, r2

In [124]:
def cross_validate(procd_df, params):
    import numpy as np
    from sklearn.model_selection import KFold
    from tqdm import tqdm

    # Set features
    topic_columns = [column for column in procd_df.columns if 'Topic' in column]
    features = ['review_length', 'sentiment_score', 'tfidf'] + topic_columns
    #features = ['tfidf']
    #features = topic_columns

    # Set target
    target = 'rating'

    # Cross validation
    kf = KFold(n_splits=5, shuffle=True, random_state=params['RANDOM_SEED'])

    # Initialize metrics
    mses = []
    maes = []
    r2s = []

    for train_index, test_index in tqdm(kf.split(procd_df), total=kf.get_n_splits(), desc='Cross validation'):
        # Split the data
        train, test = procd_df.iloc[train_index], procd_df.iloc[test_index]
        X_train, X_test = train[features], test[features]
        y_train, y_test = train[target], test[target]

        # Predict the ratings
        fold_mses = []
        fold_maes = []
        fold_r2s = []
        for model in tqdm(['linear', 'randomforest', 'svm']):
            mse, mae, r2 = predict_rating(X_train, X_test, y_train, y_test, features, model, params)

            fold_mses.append({model: mse})
            fold_maes.append({model: mae})
            fold_r2s.append({model: r2})

        # Append the metrics
        mses.append(fold_mses)
        maes.append(fold_maes)
        r2s.append(fold_r2s)
    
    return mses, maes, r2s

In [118]:
# Load parameters from DVC
params = dvc.api.params_show()

# Set paths
procd_data_path = '../data/preprocessed/procd_train.csv'
tfidf_corpus_path = f"../data/features/{params['procd_text']}/{params['topic_modeling']['feature']}_{params['feature_engineering']['ngram']}_corpus.pkl"
topics_dist_path = f"../data/evaluate/topics_dist_train.csv"

bert_embeddings_path = f"../data/features/bert_embeddings.pkl"

# Load preprocessed data
procd_df = pd.read_csv(procd_data_path)
procd_df[params['procd_text']] = procd_df[params['procd_text']].apply(lambda x: eval(x))
procd_df = procd_df[procd_df[params['procd_text']].apply(lambda row: len(row) > 0)]

# Load tfidf corpus
tfidf_corpus = pickle.load(open(tfidf_corpus_path, 'rb'))

# Load Bert embeddings
#bert_embeddings = pickle.load(open(bert_embeddings_path, 'rb'))

# Load topic distributions
topics_dist = pd.read_csv(topics_dist_path)

In [119]:
procd_df_with_features = add_features(procd_df, tfidf_corpus, topics_dist)

In [157]:
sample_df = procd_df_with_features.sample(frac=0.03, random_state=params['RANDOM_SEED'])

mses, maes, r2s = cross_validate(sample_df, params)

Cross validation:   0%|          | 0/5 [00:00<?, ?it/s]
  0%|          | 0/3 [00:00<?, ?it/s]

Scaling tfidf:   0%|          | 0/4 [00:00<?, ?it/s]



Scaling tfidf:  25%|██▌       | 1/4 [00:02<00:08,  2.85s/it]

Scaling tfidf:  50%|█████     | 2/4 [00:05<00:05,  2.79s/it]

Scaling tfidf:  75%|███████▌  | 3/4 [00:08<00:02,  2.72s/it]

Scaling tfidf: 100%|██████████| 4/4 [00:10<00:00,  2.64s/it]


Scaling tfidf: 100%|██████████| 1/1 [00:00<00:00, 123.38it/s]

 33%|███▎      | 1/3 [00:19<00:38, 19.31s/it]

Scaling tfidf:   0%|          | 0/4 [00:00<?, ?it/s]

Scaling tfidf:  25%|██▌       | 1/4 [00:02<00:07,  2.51s/it]

Scaling tfidf:  50%|█████     | 2/4 [00:05<00:05,  2.52s/it]

Scaling tfidf:  75%|███████▌  | 3/4 [00:07<00:02,  2.54s/it]

Scaling tfidf: 100%|██████████| 4/4 [00:09<00:00,  2.47s/it]


Scaling tfidf: 100%|██████████| 1/1 [00:00<00:00, 124.95it/s]

 67%|██████▋   | 2/3 [08:53<05:10, 310.51s/it]

Scaling tfidf:   0%|          | 0/4 [00:00<?, ?it/s]

Scaling tfidf:  25%|██▌       | 1/4 [00:02<00:07,  2.51s/it]

Scaling tfidf:  50%|█████     | 2/4 [00:05<00:05,  2.55s/it]

Scaling tfidf:  75%|███████▌  | 3/4 [00:07<00:0

In [195]:
dfs = []
for name,metric in zip(['mse', 'mae', 'r'], [mses, maes, r2s]):
    df = pd.DataFrame(metric)
    df.columns = ['linear', 'randomforest', 'svm']

    for col in df.columns:
        df[col] = df[col].apply(lambda x: x.get(col, None))
    df = pd.melt(df, var_name='model', value_name=name)
    
    dfs.append(df)

In [200]:
lr_metric_df = dfs[0].copy()
lr_metric_df['mae'] = dfs[1]['mae']
lr_metric_df['r2'] = dfs[2]['r']

In [203]:
#lr_metric_df.to_csv('../data/evaluate/lr_metrics.csv', index=False)
lr_metric_df

Unnamed: 0,model,mse,mae,r2
0,linear,2593.356946,40.72612,-240.989061
1,linear,4386.943,48.639193,-396.784429
2,linear,2671.050219,40.229343,-252.678234
3,linear,4926.532751,52.324096,-479.534075
4,linear,2854.608776,42.09057,-274.781428
5,randomforest,8.710707,2.287634,0.187194
6,randomforest,10.071522,2.454514,0.086768
7,randomforest,9.707454,2.402975,0.078052
8,randomforest,9.013452,2.351364,0.120828
9,randomforest,9.174936,2.34242,0.113617
