### Standardize data

In [9]:
import pickle
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
def get_tfidf_vectorizer(df_in, text_col, minperc=0.005, maxperc=0.995):
    '''
    Build a TF-IDF vectorizer and fit it on one text column of a dataframe.

    df_in    : dataframe that holds the documents
    text_col : name of the column in `df_in` whose values are the documents
    minperc  : forwarded to TfidfVectorizer as `min_df`
    maxperc  : forwarded to TfidfVectorizer as `max_df`

    Given as floats (as with the defaults), the two bounds are document-frequency
    proportions: only terms whose share of documents lies in [minperc, maxperc]
    are kept in the vocabulary.

    Returns the fitted vectorizer.
    '''

    # Document-frequency bounds, keyed by the vectorizer's own argument names
    df_bounds = {"min_df": minperc, "max_df": maxperc}

    fitted_vectorizer = TfidfVectorizer(**df_bounds)
    fitted_vectorizer.fit(df_in[text_col])

    return fitted_vectorizer



def get_tfidf_features(df_in, text_col, vectorizer):

    '''
    Returns a new dataframe: `df_in` plus one TF-IDF column per vocabulary
    term of the provided (already fitted) vectorizer.

    df_in      : dataframe holding the documents; it is not modified
    text_col   : name of the column in `df_in` whose values are the documents
    vectorizer : object exposing `transform(documents)` (result must support
                 `.toarray()`) and `get_feature_names_out()`

    The new columns are named "f_<term>". The output has exactly one row per
    input row, in the same order and with the same index as `df_in`, even
    when that index contains duplicate labels.
    '''

    # Use the passed vectorizer to transform the text column
    X = vectorizer.transform(df_in[text_col])

    # Convert the sparse matrix to a DataFrame. It is left on the default
    # positional index (0..n-1) on purpose, see the join below.
    col_names = ["f_" + wordcol for wordcol in vectorizer.get_feature_names_out()]
    tfidf_df = pd.DataFrame(X.toarray(), columns=col_names)

    # Join row-by-row on position rather than on df_in's own labels: a merge
    # on a non-unique index pairs every duplicate with every other duplicate
    # and silently multiplies rows.
    df_out = df_in.reset_index(drop=True).merge(
        tfidf_df, left_index=True, right_index=True
    )

    # Restore the caller's original index (labels and name)
    df_out.index = df_in.index

    return df_out

In [3]:
# Load the pre-processed review splits, in train / validation / test order
reviews_train, reviews_val, reviews_test = (
    pd.read_pickle(split_path)
    for split_path in (
        "./data/processed_reviews_train_v1.pkl",
        "./data/processed_reviews_val_v1.pkl",
        "./data/processed_reviews_test_v1.pkl",
    )
)

# Restore the TF-IDF vectorizer saved as an EDA artifact.
# NOTE: unpickling can execute arbitrary code, so only load artifacts from a trusted source.
with open('./artifacts/eda/tfidf_vectorizer.pkl', 'rb') as f:
    tfidf_vect = pickle.load(f)

In [4]:
# Append the TF-IDF columns to every split, always with the same loaded vectorizer
reviews_train_wt_tfidf, reviews_val_wt_tfidf, reviews_test_wt_tfidf = (
    get_tfidf_features(split_df, 'cleaned_text', tfidf_vect)
    for split_df in (reviews_train, reviews_val, reviews_test)
)

In [8]:
# TF-IDF column names, built with the same "f_" prefix that get_tfidf_features applies
tfidf_cols = ["f_" + term for term in tfidf_vect.get_feature_names_out()]

# All TF-IDF columns followed by the 'rating' column
feature_cols = [*tfidf_cols, 'rating']

In [48]:
# Toy dataset used to try out the standardization steps below.
# The generator is seeded so that a fresh "Restart & Run All" reproduces the
# same values (and therefore the same displayed tables). 72 is the seed value
# this notebook already uses for the row shuffle.
rng = np.random.default_rng(72)

# Ten random integers per column; the upper bound is exclusive
data = {
    'feature1': rng.integers(0, 100, 10),
    'feature2': rng.integers(0, 200, 10),
    'feature3': rng.integers(0, 300, 10)
}

df = pd.DataFrame(data)

In [57]:
# Shuffle the rows: frac=1 samples every row exactly once, and the fixed
# random_state makes the resulting order repeatable. The original index
# labels travel with their rows.
df_randomized = df.sample(random_state=72, frac=1)

df_randomized

Unnamed: 0,feature1,feature2,feature3
0,56,127,282
4,18,73,247
1,57,198,229
9,32,70,58
7,64,132,56
5,38,146,193
2,41,116,299
6,23,65,229
3,32,198,126
8,69,148,50


In [42]:
# Standardize a subset of the columns and leave the others untouched
scaler = StandardScaler()

# Columns to standardize; every remaining column is passed through as-is
stdize_cols = ['feature2', 'feature3']
other_cols = [col for col in df_randomized.columns if col not in stdize_cols]

# Fit the scaler on the selected columns and transform them in one step
scaled_data = scaler.fit_transform(df_randomized[stdize_cols])

# Wrap the scaled values back into a DataFrame, reusing the column names and
# the (shuffled) row index so the concat below aligns rows by label
scaled_df = pd.DataFrame(scaled_data, columns=stdize_cols, index=df_randomized.index)

# Combine original features with scaled features
# (untouched columns first, standardized columns after them)
final_df = pd.concat([df_randomized[other_cols], scaled_df], axis=1)

final_df

Unnamed: 0,feature1,feature2,feature3
4,53,0.877428,1.747336
7,18,0.166,-0.026609
5,3,0.189714,-0.662273
1,54,1.28057,0.697752
3,75,-2.371427,0.446443
9,62,0.355714,-1.312719
6,46,-0.142286,0.357746
0,39,0.829999,-1.076193
8,37,-1.019713,-1.268371
2,23,-0.166,1.096889
