In [5]:
import os
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import classification_report
import pickle
import re
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import nltk
nltk.download('wordnet')


# Load your dataset
df = pd.read_csv(r'C:\Amazon_Review_Sentimental_Analysis\data\modified_reviews.csv')

# Preprocessing
stemmer = PorterStemmer()
STOPWORDS = set(stopwords.words('english'))

def preprocess_text(text):
    text = re.sub('[^a-zA-Z]', ' ', text)  # Remove special characters
    text = text.lower().split()  # Convert to lowercase and split
    text = [stemmer.stem(word) for word in text if word not in STOPWORDS]  # Stemming and stopwords removal
    return ' '.join(text)

# Preprocess review descriptions
df['processed_review'] = df['review_discription'].apply(preprocess_text)

# Drop unnecessary columns
df.drop(['Product_name', 'review_title', 'review_discription'], axis=1, inplace=True)

# Convert ratings to sentiments (assumption: rating > 3 positive, 3 neutral, <3 negative)
def label_sentiment(rating):
    if rating > 3:
        return 1  # Positive
    elif rating == 3:
        return 0  # Neutral
    else:
        return -1  # Negative

df['sentiment'] = df['rating'].apply(label_sentiment)

# Prepare features and labels
X = df['processed_review']
y = df['sentiment']

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Vectorize the text data
cv = CountVectorizer(max_features=2500)
X_train_cv = cv.fit_transform(X_train).toarray()
X_test_cv = cv.transform(X_test).toarray()

# Scale the data
scaler = MinMaxScaler()
X_train_scaled = scaler.fit_transform(X_train_cv)
X_test_scaled = scaler.transform(X_test_cv)

# Train a Random Forest Classifier
model_rf = RandomForestClassifier()
model_rf.fit(X_train_scaled, y_train)

# Create the directory if it doesn't exist
os.makedirs('C:/Amazon_Review_Sentimental_Analysis/model_buildings/Models', exist_ok=True)

# Save the models
with open(r'C:/Amazon_Review_Sentimental_Analysis/model_buildings/Models/vectorizer.pkl', 'wb') as f:
    pickle.dump(cv, f)

with open(r'C:/Amazon_Review_Sentimental_Analysis/model_buildings/Models/scaler.pkl', 'wb') as f:
    pickle.dump(scaler, f)

with open(r'C:/Amazon_Review_Sentimental_Analysis/model_buildings/Models/model_rf.pkl', 'wb') as f:
    pickle.dump(model_rf, f)

# Evaluate the model
y_pred = model_rf.predict(X_test_scaled)
print(classification_report(y_test, y_pred))


[nltk_data] Downloading package wordnet to C:\Users\IQRA
[nltk_data]     SHAIKH/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


              precision    recall  f1-score   support

          -1       0.72      0.73      0.72       174
           0       0.42      0.09      0.15        53
           1       0.82      0.90      0.86       387

    accuracy                           0.79       614
   macro avg       0.65      0.58      0.58       614
weighted avg       0.76      0.79      0.76       614



In [1]:
import os
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import classification_report
import pickle
import re
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import nltk
nltk.download('stopwords')
nltk.download('wordnet')

# Load your dataset
df = pd.read_csv(r'C:\Amazon_Review_Sentimental_Analysis\data\modified_reviews.csv')

# Preprocessing
stemmer = PorterStemmer()
STOPWORDS = set(stopwords.words('english'))

def preprocess_text(text):
    text = re.sub('[^a-zA-Z]', ' ', text)  # Remove special characters
    text = text.lower().split()  # Convert to lowercase and split
    text = [stemmer.stem(word) for word in text if word not in STOPWORDS]  # Stemming and stopwords removal
    return ' '.join(text)

# Preprocess review descriptions (using only review text for sentiment analysis)
df['processed_review'] = df['review_discription'].apply(preprocess_text)

# Prepare features (X) and target labels (y), assuming sentiment has to be extracted from reviews
X = df['processed_review']

# Sentiment labeling using review text only (assuming rating is removed, using custom labels)
# Assuming reviews with specific keywords for sentiments (adjust if needed):
def label_sentiment(review):
    # Simplified rule-based sentiment extraction (can replace with more complex logic)
    if any(word in review for word in ['good', 'great', 'excellent', 'love']):
        return 1  # Positive
    elif any(word in review for word in ['bad', 'poor', 'worst', 'dislike']):
        return -1  # Negative
    else:
        return 0  # Neutral

df['sentiment'] = df['processed_review'].apply(label_sentiment)

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, df['sentiment'], test_size=0.2, random_state=42)

# Vectorize the text data
cv = CountVectorizer(max_features=2500)
X_train_cv = cv.fit_transform(X_train).toarray()
X_test_cv = cv.transform(X_test).toarray()

# Scale the data
scaler = MinMaxScaler()
X_train_scaled = scaler.fit_transform(X_train_cv)
X_test_scaled = scaler.transform(X_test_cv)

# Train a Random Forest Classifier
model_rf = RandomForestClassifier()
model_rf.fit(X_train_scaled, y_train)

# Create the directory if it doesn't exist
os.makedirs('C:/Amazon_Review_Sentimental_Analysis/model_buildings/Models', exist_ok=True)

# Save the models
with open(r'C:/Amazon_Review_Sentimental_Analysis/model_buildings/Models/vectorizer.pkl', 'wb') as f:
    pickle.dump(cv, f)

with open(r'C:/Amazon_Review_Sentimental_Analysis/model_buildings/Models/scaler.pkl', 'wb') as f:
    pickle.dump(scaler, f)

with open(r'C:/Amazon_Review_Sentimental_Analysis/model_buildings/Models/model_rf.pkl', 'wb') as f:
    pickle.dump(model_rf, f)

# Evaluate the model
y_pred = model_rf.predict(X_test_scaled)
print(classification_report(y_test, y_pred))


[nltk_data] Downloading package stopwords to C:\Users\IQRA
[nltk_data]     SHAIKH/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to C:\Users\IQRA
[nltk_data]     SHAIKH/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


              precision    recall  f1-score   support

          -1       1.00      0.94      0.97        36
           0       0.99      1.00      0.99       356
           1       1.00      0.99      0.99       222

    accuracy                           0.99       614
   macro avg       0.99      0.98      0.99       614
weighted avg       0.99      0.99      0.99       614

