In [1]:
#import the necessary libraries
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from scipy.sparse import hstack
import os

# Load datasets
# Both CSV files are read straight from their raw GitHub URLs, train first.
train_path = "https://raw.githubusercontent.com/psabhay2003/Beauty-Products-Review-Rating/refs/heads/main/Train.csv"
test_path = "https://raw.githubusercontent.com/psabhay2003/Beauty-Products-Review-Rating/refs/heads/main/Test.csv"
train_df, test_df = (pd.read_csv(csv_url) for csv_url in (train_path, test_path))

# Handle missing values
# Blank out missing entries only in the columns that are later concatenated as
# text or label-encoded. A frame-wide fillna("") would also write empty strings
# into every other column, including the `rating` target, which would then have
# to be coerced back to NaN below.
fill_cols = ["title", "text", "asin", "parent_asin", "verified_purchase"]
blank_fill = {name: "" for name in fill_cols}
train_df = train_df.fillna(blank_fill)
test_df = test_df.fillna(blank_fill)

# Ensure 'rating' is numeric
# Values that cannot be parsed as numbers become NaN; those rows are dropped
# because they have no usable target.
train_df["rating"] = pd.to_numeric(train_df["rating"], errors="coerce")
# .copy() makes the filtered frame an independent object, so the column
# assignments performed later do not operate on a derived slice.
train_df = train_df.dropna(subset=["rating"]).copy()
y_train = train_df["rating"].astype(int)  # Target as integers

# Encode categorical features using Label Encoding
# Each column gets its own encoder fitted on the training values. Test values
# that never appeared in training are encoded as -1.
categorical_cols = ["asin", "parent_asin", "verified_purchase"]
for col in categorical_cols:
    le = LabelEncoder()
    train_df[col] = le.fit_transform(train_df[col])
    # A label's code is its position in `le.classes_`, so a dict built once per
    # column yields the same codes as calling le.transform() row by row, without
    # scanning the class array and invoking transform for every test row.
    code_by_label = {label: code for code, label in enumerate(le.classes_)}
    # map() leaves NaN for labels missing from the dict; those become -1.
    test_df[col] = test_df[col].map(code_by_label).fillna(-1).astype("int64")


# Text feature extraction using TF-IDF (small vocabulary to save memory)
def _review_text(frame):
    """Return each row's title and text joined by a single space."""
    return frame["title"] + " " + frame["text"]


# Unigrams only, at most 500 terms; the vocabulary is learned from the training
# rows and then reused unchanged for the test rows.
tfidf = TfidfVectorizer(ngram_range=(1, 1), max_features=500)
X_train_text = tfidf.fit_transform(_review_text(train_df))
X_test_text = tfidf.transform(_review_text(test_df))

# Append the encoded categorical columns to the right of the TF-IDF columns.
# scipy's hstack returns a sparse matrix, so nothing is densified here.
X_train = hstack([X_train_text, train_df[categorical_cols].to_numpy()])
X_test = hstack([X_test_text, test_df[categorical_cols].to_numpy()])

# Fit a linear regression on the stacked features (fit() returns the estimator)
lr_model = LinearRegression().fit(X_train, y_train)

# Predict the test ratings, round to the nearest whole number, then clamp the
# result into the valid 1-5 range.
test_predictions = np.clip(np.round(lr_model.predict(X_test)), 1, 5)

# Make sure the output folder exists; an already-existing folder is not an error
os.makedirs("/mnt/data", exist_ok=True)

# Build the submission table. "Id" is the row position of each prediction,
# taken from the frame's default 0..n-1 index.
submission = (
    pd.DataFrame({"Predicted_Rating": test_predictions})
    .rename_axis("Id")
    .reset_index()
)
submission.to_csv("/mnt/data/submission.csv", index=False)

print("Submission file created successfully!")

  test_df.fillna("", inplace=True)


Submission file created successfully!


In [3]:
# Colab-only: send the saved submission CSV to the browser as a download.
import google.colab.files as files

# The path must match the location the submission CSV was written to.
files.download("/mnt/data/submission.csv")


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [2]:
from sklearn.model_selection import KFold, cross_val_score

# 5-fold cross-validation with shuffling; the fixed seed keeps the folds
# identical between runs.
cv = KFold(n_splits=5, shuffle=True, random_state=42)

# The scorer returns negated MSE per fold, so flip the sign before taking the
# square root to get RMSE.
# Caveats when reading these numbers:
#   * X_train was built with a TF-IDF vocabulary and label encoders fitted on
#     the whole training set, so every validation fold contributed to the
#     features it is scored on.
#   * The raw regression output is scored here, not the rounded and clipped
#     values written to the submission file.
neg_mse_scores = cross_val_score(
    lr_model,
    X_train,
    y_train,
    cv=cv,
    scoring="neg_mean_squared_error",
)
rmse_scores = np.sqrt(np.negative(neg_mse_scores))

print("Cross-Validation RMSE Scores:", rmse_scores)
mean_rmse = rmse_scores.mean()
std_rmse = rmse_scores.std()
print("Mean RMSE: {:.4f}".format(mean_rmse))
print("Standard Deviation of RMSE: {:.4f}".format(std_rmse))


Cross-Validation RMSE Scores: [0.9530652  0.96254172 0.97441877 0.98056392 0.95354783]
Mean RMSE: 0.9648
Standard Deviation of RMSE: 0.0110
