In [None]:
import pandas as pd
import os
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler
import joblib

In [None]:
def run_notebook(notebook_path):
    """Execute another notebook (or script) in the current IPython session.

    The path is handed to IPython's ``%run`` line magic.

    Args:
        notebook_path: Path passed verbatim as the argument of ``%run``.

    Raises:
        RuntimeError: If ``get_ipython()`` returns ``None``, i.e. the code is
            not running inside an IPython shell.
    """
    # Local import: IPython is only needed when this helper is actually called.
    from IPython import get_ipython

    ipython = get_ipython()
    if ipython is None:
        raise RuntimeError("IPython is not available.")

    # run_line_magic() is the supported API; InteractiveShell.magic() is deprecated.
    ipython.run_line_magic('run', notebook_path)

Data loading: read the cleaned reviews, generating them with data_preparation.ipynb first if the CSV is missing.

In [None]:
# Path of the cleaned review CSV; regenerated via data_preparation.ipynb when absent.
cleaned_data_path = 'cleaned_reviews_nltk.csv'
if not os.path.exists(cleaned_data_path):
    print("Cleaned data not found. Running data_preparation.ipynb to generate cleaned data...")
    run_notebook('data_preparation.ipynb')
    # Fail here with an actionable message if the preparation notebook did not
    # write the file, rather than surfacing a bare read error in a later cell.
    if not os.path.exists(cleaned_data_path):
        raise FileNotFoundError(
            f"'{cleaned_data_path}' was not created by data_preparation.ipynb."
        )

In [None]:
# Read the cleaned reviews into a DataFrame and preview the first five rows.
df_cleaned = pd.read_csv(filepath_or_buffer=cleaned_data_path)
print("Cleaned Data Loaded:")
df_cleaned.head(n=5)

In [None]:
# Replace missing reviews with empty strings so every row holds text.
df_cleaned['Cleaned_Review'] = df_cleaned['Cleaned_Review'].fillna(value='')

In [None]:
# Upper bound on the TF-IDF vocabulary size.
MAX_TFIDF_FEATURES = 5000

# Fit the vectorizer on the cleaned reviews; X is the resulting TF-IDF matrix.
tfidf_vectorizer = TfidfVectorizer(max_features=MAX_TFIDF_FEATURES)
X = tfidf_vectorizer.fit_transform(raw_documents=df_cleaned['Cleaned_Review'])


Save the fitted TF-IDF vectorizer for later use.

In [None]:
# Persist the fitted vectorizer to disk.
joblib.dump(value=tfidf_vectorizer, filename='tfidf_vectorizer.pkl')
print("TF-IDF Vectorizer saved as 'tfidf_vectorizer.pkl'")

In [None]:
# Scale each feature without centering: with_mean=False is used because X is a
# sparse matrix.
scaler = StandardScaler(with_mean=False, with_std=True)
X_scaled = scaler.fit_transform(X)

Save the fitted scaler for later use.

In [None]:
# Persist the fitted scaler to disk.
joblib.dump(value=scaler, filename='scaler.pkl')
print("Scaler saved as 'scaler.pkl'")

In [None]:
# Convert the scaled matrix to a dense DataFrame with one column per TF-IDF term.
# Reusing df_cleaned's index keeps rows aligned in the column-wise concat below,
# which matches on index labels; a fresh 0..n-1 index would misalign rows (and
# introduce NaNs) if df_cleaned's index were ever not the default range.
# NOTE(review): .toarray() materialises a dense rows x terms array; watch memory
# on large corpora.
X_df_scaled = pd.DataFrame(
    X_scaled.toarray(),
    columns=tfidf_vectorizer.get_feature_names_out(),
    index=df_cleaned.index,
)

# Put the reviewer name and rating columns in front of the TF-IDF features.
df_features = pd.concat([df_cleaned[['Reviewer Name', 'Rating']], X_df_scaled], axis=1)
print("\nFeature Data Sample:")
df_features.head()

In [None]:
# Write the feature table to CSV (without the index column), then read it back
# and preview it.
df_features.to_csv(path_or_buf='feature_data.csv', index=False)
print("Feature data saved as 'feature_data.csv'")

df_saved = pd.read_csv(filepath_or_buffer='feature_data.csv')
print("\nFeatures Data Loaded from CSV:")
df_saved.head(n=5)