In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix
from textblob import TextBlob

# Load the cleaned dataset
data = pd.read_csv('../data/cleaned-amazon-reviews.csv')

# Verify data loading
print(data.head())

# Verify required columns: show what was loaded, then fail fast with a clear
# message if any column this script reads later is absent (otherwise the
# problem only surfaces as a bare KeyError further down).
print(data.columns)
required_columns = ['Text', 'HelpfulnessNumerator', 'HelpfulnessDenominator', 'Score']
missing_columns = [col for col in required_columns if col not in data.columns]
if missing_columns:
    raise KeyError(f"Dataset is missing required columns: {missing_columns}")

# Sentiment Analysis
def get_sentiment(text):
    """Return the TextBlob polarity score for a piece of text.

    Args:
        text: The text to score. Expected to be a ``str``; any other value
            (for example ``None`` or the float ``NaN`` that pandas uses for
            an empty CSV cell) is treated as carrying no sentiment.

    Returns:
        ``TextBlob(text).sentiment.polarity`` for string input, or ``0.0``
        (neutral) for non-string input.
    """
    # TextBlob rejects non-string input with a TypeError, which would abort a
    # whole-column ``apply`` on the first missing value.
    if not isinstance(text, str):
        return 0.0
    return TextBlob(text).sentiment.polarity

# Smoke-test the sentiment scorer on the first review in the dataset:
# show the raw text, then the polarity TextBlob assigns to it.
sample_text = data['Text'].iloc[0]
print(sample_text)
analysis = TextBlob(sample_text)
print(analysis.sentiment.polarity)

# Score every review and store the result in a new 'Sentiment' column
data['Sentiment'] = data['Text'].map(get_sentiment)

# Preview the text alongside its computed score for the first few rows
print(data.head()[['Text', 'Sentiment']])

# Prepare data for modeling: the two helpfulness columns plus the computed
# sentiment score are the features; the 'Score' column is the target.
X = data[['HelpfulnessNumerator', 'HelpfulnessDenominator', 'Sentiment']]
y = data['Score']

# Check for any missing values
print(X.isnull().sum())
print(y.isnull().sum())

# Split the data (70% train / 30% test)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Train the model. The forest is seeded with the same value as the split so
# the reported metrics are reproducible from run to run; an unseeded forest
# gives different results on every execution.
clf = RandomForestClassifier(random_state=42)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

# Print the classification report and confusion matrix
print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))


   Id   ProductId          UserId                      ProfileName  \
0   1  B001E4KFG0  A3SGXH7AUHU8GW                       delmartian   
1   2  B00813GRG4  A1D87F6ZCVE5NK                           dll pa   
2   3  B000LQOCH0   ABXLMWJIXXAIN  Natalia Corres "Natalia Corres"   
3   4  B000UA0QIQ  A395BORC6FGVXV                             Karl   
4   5  B006K2ZZ7K  A1UQRSCLF8GW1T    Michael D. Bigham "M. Wassir"   

   HelpfulnessNumerator  HelpfulnessDenominator  Score        Time  \
0                     1                       1      5  1303862400   
1                     0                       0      1  1346976000   
2                     1                       1      4  1219017600   
3                     3                       3      2  1307923200   
4                     0                       0      5  1350777600   

                 Summary                                               Text  
0  Good Quality Dog Food  I have bought several of the Vitality canned d...  
1 