In [5]:
import pandas as pd
from langdetect import detect
from textblob import TextBlob
from better_profanity import profanity
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import accuracy_score

# Initialize better_profanity
profanity.load_censor_words()

# Sample data (replace with actual dataset)
data = {
    "reviews": [
        "This product is amazing!",
        "Absolutely horrible, do not buy.",
        "Great quality, but delivery was late.",
        "Terrible customer service. Never again.",
        "This is the best purchase I've made!",
        "Product was okay, but nothing special.",
        "Awful experience. Waste of money."
    ]
}
df = pd.DataFrame(data)

# Step 1: Language Detection
def detect_language(text):
    try:
        return detect(text)
    except:
        return "unknown"

df["language"] = df["reviews"].apply(detect_language)

# Step 2: Spelling Correction
def correct_spelling(text):
    return str(TextBlob(text).correct())

df["corrected_reviews"] = df["reviews"].apply(correct_spelling)

# Step 3: Profanity Detection
def contains_profanity(text):
    return profanity.contains_profanity(text)

df["contains_profanity"] = df["corrected_reviews"].apply(contains_profanity)

# Step 4: Sentiment Analysis
def get_sentiment(text):
    analysis = TextBlob(text)
    return analysis.sentiment.polarity  # Polarity ranges from -1 (negative) to 1 (positive)

df["sentiment"] = df["corrected_reviews"].apply(get_sentiment)

# Step 5: Subjectivity Analysis
def get_subjectivity(text):
    analysis = TextBlob(text)
    return analysis.sentiment.subjectivity  # Subjectivity ranges from 0 (objective) to 1 (subjective)

df["subjectivity"] = df["corrected_reviews"].apply(get_subjectivity)

# Step 6: Preparing Data for Review Ranking
# Binary target: Positive sentiment >= 0.1, Negative < 0.1
df["label"] = df["sentiment"].apply(lambda x: 1 if x >= 0.1 else 0)

# Convert text to numerical features
vectorizer = CountVectorizer(max_features=500)
X = vectorizer.fit_transform(df["corrected_reviews"]).toarray()
y = df["label"]

# Split into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Step 7: Random Forest Model
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# Step 8: Evaluate the Model
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Model Accuracy: {accuracy * 100:.2f}%")

# Display processed DataFrame
print(df)


Model Accuracy: 100.00%
                                   reviews language  \
0                 This product is amazing!       en   
1         Absolutely horrible, do not buy.       en   
2    Great quality, but delivery was late.       en   
3  Terrible customer service. Never again.       en   
4     This is the best purchase I've made!       en   
5   Product was okay, but nothing special.       en   
6        Awful experience. Waste of money.       en   

                         corrected_reviews  contains_profanity  sentiment  \
0                  His product is amazing!               False   0.750000   
1         Absolutely horrible, do not buy.               False  -1.000000   
2    Great quality, but delivery was late.               False   0.250000   
3  Terrible customer service. Never again.               False  -1.000000   
4      His is the best purchase I've made!               False   1.000000   
5   Product was okay, but nothing special.               False   0.428571