In [3]:
import pandas as pd
import os
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler
import joblib

In [4]:
def run_notebook(notebook_path):
    """Execute another notebook or script inside the current IPython session.

    The target is run through the ``%run`` line magic, so whatever it defines
    or writes to disk happens in (and is visible to) the calling session.

    Args:
        notebook_path: Path handed to ``%run`` (e.g. ``'data_preparation.ipynb'``).

    Raises:
        RuntimeError: If no interactive IPython shell is active, for example
            when this code is executed by a plain Python interpreter.
    """
    from IPython import get_ipython

    shell = get_ipython()
    if shell is None:
        raise RuntimeError("IPython is not available.")

    # `InteractiveShell.magic()` is deprecated; `run_line_magic` is the
    # supported API and takes the magic name and its argument separately.
    shell.run_line_magic('run', notebook_path)

In [5]:
# CSV of cleaned reviews; the message below indicates data_preparation.ipynb
# is expected to generate it when it is missing.
cleaned_data_path = 'cleaned_reviews_nltk.csv'
if not os.path.exists(cleaned_data_path):
    print("Cleaned data not found. Running data_preparation.ipynb to generate cleaned data...")
    run_notebook('data_preparation.ipynb')
    # Fail here with an explicit message instead of letting a later read
    # raise a less informative error if the notebook produced no file.
    if not os.path.exists(cleaned_data_path):
        raise FileNotFoundError(
            f"'{cleaned_data_path}' is still missing after running data_preparation.ipynb."
        )

In [6]:
# Read the cleaned reviews into memory and preview the first five rows.
df_cleaned = pd.read_csv(filepath_or_buffer=cleaned_data_path)
print("Cleaned Data Loaded:")
df_cleaned.head(n=5)

Cleaned Data Loaded:


Unnamed: 0,Reviewer Name,Rating,Review Text,Cleaned_Review
0,Eugene ath,1.0,"I registered on the website, tried to order a ...","I registered on the website, tried to order a ..."
1,Daniel ohalloran,1.0,Had multiple orders one turned up and driver h...,Had multiple orders one turned up and driver h...
2,p fisher,1.0,I informed these reprobates that I WOULD NOT B...,I informed these reprobates that I WOULD NOT B...
3,Greg Dunn,1.0,I have bought from Amazon before and no proble...,I have bought from Amazon before and no proble...
4,Sheila Hannah,1.0,If I could give a lower rate I would! I cancel...,If I could give a lower rate I would! I cancel...


In [7]:
# Replace missing cleaned reviews with empty strings so every row holds text.
df_cleaned = df_cleaned.assign(
    Cleaned_Review=lambda frame: frame['Cleaned_Review'].fillna('')
)

In [8]:
# Fit a TF-IDF vocabulary capped at 5000 terms and encode every cleaned
# review as one row of the resulting sparse matrix.
tfidf_vectorizer = TfidfVectorizer(max_features=5000)
X = tfidf_vectorizer.fit_transform(raw_documents=df_cleaned['Cleaned_Review'])


Save the fitted TF-IDF vectorizer for later use

In [9]:
# Persist the fitted vectorizer so the same vocabulary can be reloaded later.
joblib.dump(value=tfidf_vectorizer, filename='tfidf_vectorizer.pkl')
print("TF-IDF Vectorizer saved as 'tfidf_vectorizer.pkl'")

TF-IDF Vectorizer saved as 'tfidf_vectorizer.pkl'


In [10]:
# with_mean=False: centering would turn the sparse TF-IDF matrix dense, so
# the features are only scaled to unit variance.
scaler = StandardScaler(with_mean=False)
X_scaled = scaler.fit(X).transform(X)

Save the fitted scaler for later use

In [11]:
# Persist the fitted scaler so the same scaling can be reapplied later.
joblib.dump(value=scaler, filename='scaler.pkl')
print("Scaler saved as 'scaler.pkl'")

Scaler saved as 'scaler.pkl'


In [12]:
# Densify the scaled matrix into a frame with one column per vocabulary term.
# The index is taken from df_cleaned because pd.concat(axis=1) aligns rows by
# index label, not by position; a fresh RangeIndex would only line up while
# df_cleaned still carries its default index.
X_df_scaled = pd.DataFrame(
    X_scaled.toarray(),
    index=df_cleaned.index,
    columns=tfidf_vectorizer.get_feature_names_out(),
)
# Keep the reviewer name and rating next to the numeric features.
df_features = pd.concat([df_cleaned[['Reviewer Name', 'Rating']], X_df_scaled], axis=1)
print("\nFeature Data Sample:")
df_features.head()


Feature Data Sample:


Unnamed: 0,Reviewer Name,Rating,00,000,00pm,01,02,03,04,05,...,yourselves,youtube,yr,yrs,zero,zone,𝐒𝐮𝐩𝐩𝐨𝐫𝐭_,𝒻𝓇𝑒𝑒,𝕙𝕖𝕝𝕡,𝕟𝕦𝕞𝕓𝕖𝕣
0,Eugene ath,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,Daniel ohalloran,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,p fisher,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,Greg Dunn,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,Sheila Hannah,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [13]:
# Write the feature table to disk without the index column.
df_features.to_csv(path_or_buf='feature_data.csv', index=False)
print("Feature data saved as 'feature_data.csv'")

# Read the file straight back and preview it to confirm the round trip.
df_saved = pd.read_csv(filepath_or_buffer='feature_data.csv')
print("\nFeatures Data Loaded from CSV:")
df_saved.head(n=5)

Feature data saved as 'feature_data.csv'

Features Data Loaded from CSV:


Unnamed: 0,Reviewer Name,Rating,00,000,00pm,01,02,03,04,05,...,yourselves,youtube,yr,yrs,zero,zone,𝐒𝐮𝐩𝐩𝐨𝐫𝐭_,𝒻𝓇𝑒𝑒,𝕙𝕖𝕝𝕡,𝕟𝕦𝕞𝕓𝕖𝕣
0,Eugene ath,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,Daniel ohalloran,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,p fisher,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,Greg Dunn,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,Sheila Hannah,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
