#Steps

You need to download the data from https://www.kaggle.com/datasets/arun9872/amazon-customer-reviews-for-mobile-phones-in-uk

You can also apply the same logic to other, similar datasets for more practice.

1 - Download the data and save it to your local machine or to Google Drive.
2 - Run the model on Google Colab or in Jupyter on your local machine.

In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import PolynomialFeatures, FunctionTransformer, StandardScaler, MaxAbsScaler, Normalizer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, HashingVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score
import string
from sklearn.linear_model import LogisticRegression, LinearRegression

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier



from google.colab import drive

# Mount Google Drive at /content/drive so the CSV below can be read from it.
drive.mount("/content/drive")

# Load the reviews and discard the columns that are not used afterwards.
df = pd.read_csv("/content/drive/My Drive/ML Lessons/Amazon_Unlocked_Mobile.csv")
df = df.drop(columns=["Product Name", "Brand Name", "Price"])

# Drop rows whose review text is missing.
df = df.dropna(subset=["Reviews"])

# Plot the share of reviews per rating. The bin edges sit on half-integers,
# so each bin is centred on an integer from 1 to 5.
rating_bin_edges = [0.5, 1.5, 2.5, 3.5, 4.5, 5.5]
sns.histplot(df["Rating"], bins=rating_bin_edges, stat="probability")
plt.show()

# Translation table that deletes every ASCII punctuation character. It is built
# once here because clean_text is called separately for each review.
_PUNCTUATION_TABLE = str.maketrans("", "", string.punctuation)


def clean_text(text):
    """Lowercase *text* and remove ASCII punctuation from it.

    Args:
        text: The value to normalize.

    Returns:
        The normalized string when *text* is a ``str``; any other value
        is returned unchanged.
    """
    if not isinstance(text, str):
        return text
    lowered = text.lower()
    return lowered.translate(_PUNCTUATION_TABLE)

# Apply the clean_text function to the "Reviews" column
df["Reviews"] = df["Reviews"].apply(clean_text)

# Split the data into training and testing sets (80% / 20%, fixed seed)
X_train, X_test, y_train, y_test = train_test_split(
    df["Reviews"], df["Rating"], test_size=0.2, random_state=42
)


def clean_reviews(texts):
    """Return a list with clean_text applied to every document in *texts*.

    FunctionTransformer passes its function the whole input collection at
    once, not one document at a time. clean_text only transforms ``str``
    values, so using it directly as the transformer function leaves the
    collection untouched. This wrapper does the per-document mapping.

    It is a named module-level function rather than a lambda so that the
    fitted pipeline can still be serialized with joblib.

    Args:
        texts: Iterable of review documents.

    Returns:
        A list holding the cleaned form of each document, in order.
    """
    return [clean_text(text) for text in texts]


# Create a pipeline to preprocess the text and fit a classifier.
# You can try a different vectorizer or classifier; either one changes the
# prediction score. The accuracies noted below were recorded by the author.
pipeline = Pipeline([
    # Cleans each document, so raw text given to predict() is normalized the
    # same way as the training data.
    ("clean_text", FunctionTransformer(clean_reviews)),
    ("vectorizer", CountVectorizer()),  # Prediction accuracy: 0.7172531296824399
    # ("vectorizer", TfidfVectorizer()),  # Prediction accuracy: 0.7085528541737155
    # with_mean=False: centering is not supported on sparse matrices.
    ("scaler", StandardScaler(with_mean=False)),
    ("maxabs", MaxAbsScaler()),
    ("normalize", Normalizer()),
    # NOTE: with degree=1 this adds no interaction terms, only the bias column.
    ("poly", PolynomialFeatures(degree=1)),
    # ("classifier", MultinomialNB()),  # Prediction accuracy: 0.7172531296824399
    # ("classifier", LogisticRegression()),  # Prediction accuracy: 0.7880878727826381
    # ("classifier", DecisionTreeClassifier(random_state=42)),  # Prediction accuracy: 0.8754289719174441
    ("classifier", RandomForestClassifier(n_estimators=100, random_state=42)),  # Prediction accuracy: 0.8950529266760114
])

pipeline.fit(X_train, y_train)

# Make predictions on the test set
y_pred = pipeline.predict(X_test)

# Rating buckets used for both the actual and the predicted ratings
# (pd.cut intervals are right-closed):
#   (0, 1] -> Bad, (1, 2.9] -> Average, (2.9, 4] -> Good, (4, 5] -> Excellent
rating_bins = [0, 1, 2.9, 4, 5]
labels = ["Bad", "Average", "Good", "Excellent"]

# Convert the actual and the predicted ratings to labels
y_test_labels = pd.cut(y_test, bins=rating_bins, labels=labels)
y_pred_labels = pd.cut(y_pred, bins=rating_bins, labels=labels)

# Calculate the prediction accuracy. It is measured on the bucketed labels,
# not on the raw ratings.
accuracy = accuracy_score(y_test_labels, y_pred_labels)

# Plot the predicted vs actual label counts as side-by-side bars
test_counts = [int((y_test_labels == label).sum()) for label in labels]
pred_counts = [int((y_pred_labels == label).sum()) for label in labels]
bar_width = 0.4
positions = range(len(labels))
x1 = [pos - bar_width / 2 for pos in positions]
x2 = [pos + bar_width / 2 for pos in positions]
plt.bar(x1, test_counts, width=bar_width, label="Actual")
plt.bar(x2, pred_counts, width=bar_width, label="Predicted")
plt.xticks(positions, labels)
plt.xlabel("Rating")
plt.ylabel("Count")
plt.legend()
plt.show()

# Print the prediction accuracy
print("Prediction accuracy:", accuracy)


# Exporting the model
from joblib import dump

# Save the fitted pipeline as a file. Its clean_text step is stored as a
# reference to clean_reviews, so that function (and clean_text) must be
# defined in the session that loads the file.
dump(pipeline, "/content/model.joblib")

# Sending Request to the Model

In [None]:
from joblib import load

# Restore the fitted pipeline that was saved to disk
pipeline = load("/content/model.joblib")

# Example review to score
input_text = "This is a normal phone!"

# Normalize the review with clean_text before prediction
cleaned_text = clean_text(input_text)

# predict() takes a collection of documents, so the single review is wrapped in a list
prediction = pipeline.predict([cleaned_text])

# Bucket the predicted rating into a named label
# (pd.cut intervals are right-closed: (0, 1], (1, 2.9], (2.9, 4], (4, 5])
label_names = ["Bad", "Average", "Good", "Excellent"]
bin_edges = [0, 1, 2.9, 4, 5]
predicted_label = pd.cut(prediction, bins=bin_edges, labels=label_names)

# Report the input together with the raw and the bucketed prediction
for caption, value in (
    ("Input text:", input_text),
    ("Predicted rating:", prediction),
    ("Predicted label:", predicted_label),
):
    print(caption, value)
