In [34]:
import pandas as pd
import numpy as np
import joblib
import nltk
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import MinMaxScaler

# Fetch the NLTK resources this script asks for (a no-op when already cached locally).
for nltk_resource in ("punkt", "stopwords"):
    nltk.download(nltk_resource)

# Load the vendor review table from disk.
REVIEWS_CSV = "input/vendor_reviews.csv"
data = pd.read_csv(REVIEWS_CSV)

def _english_stopwords():
    """Return NLTK's English stopwords as a frozenset, loading them only once."""
    cached = getattr(_english_stopwords, "_cache", None)
    if cached is None:
        # stopwords.words() returns a list; a frozenset gives O(1) membership
        # tests and avoids re-reading the corpus on every call.
        cached = frozenset(stopwords.words("english"))
        _english_stopwords._cache = cached
    return cached


def preprocess_text(text):
    """Normalize a review into a space-joined string of content tokens.

    The text is lower-cased and split on whitespace. A token is kept only
    if it is purely alphanumeric and is not an English stopword.

    Note: a token that contains any non-alphanumeric character (for example
    "great!") is dropped entirely rather than having its punctuation stripped.

    Args:
        text: The raw review text. Non-string values (e.g. the NaN that pandas
            produces for an empty CSV cell) are treated as an empty review.

    Returns:
        The surviving tokens joined by single spaces; "" if none survive.
    """
    if not isinstance(text, str):
        # Missing values arrive as float NaN / None and have no .lower().
        return ""
    stop_words = _english_stopwords()
    tokens = [
        word
        for word in text.lower().split()
        if word.isalnum() and word not in stop_words
    ]
    return " ".join(tokens)

# Clean every review so the vectorizer works on lower-cased, stopword-free text.
data["clean_review"] = data["reviewText"].apply(preprocess_text)

# TF-IDF features; the vocabulary is capped at 100 terms by max_features.
vectorizer = TfidfVectorizer(max_features=100)
X_text = vectorizer.fit_transform(data["clean_review"]).toarray()

# Rescale the raw ratings with a min-max scaler fitted on this dataset.
scaler = MinMaxScaler()
data["normalized_rating"] = scaler.fit_transform(data[["rating"]])

# NOTE(review): the normalized rating is appended as the last feature column
# AND used as the target below, so the model can reproduce y directly from X
# (target leakage). Left unchanged here because removing the column would
# change the feature count expected by any code that loads the saved model;
# confirm with the inference side before dropping it.
X = np.hstack((X_text, data[["normalized_rating"]].values))
y = data["normalized_rating"]  # Target variable (normalized score)

model = RandomForestRegressor(n_estimators=100)
model.fit(X, y)

# Persist every fitted artifact needed to rebuild the feature matrix later.
joblib.dump(model, "vendor_ranking_model.pkl")
joblib.dump(vectorizer, "vectorizer.pkl")
# The scaler was previously discarded, which made it impossible to apply the
# same rating normalization outside this script.
joblib.dump(scaler, "rating_scaler.pkl")

print("Model trained and saved successfully!")


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\PC\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\PC\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Model trained and saved successfully!
