In [10]:
import boto3
import pandas as pd
import io
import os

# Configure S3 details
s3_bucket = 'pyverseai'  # Replace with your S3 bucket name
file_key = 'dataset/Reviews.csv'  # Replace with your file key in S3

# Initialize boto3 S3 client
s3 = boto3.client('s3')

# Function to load dataset from S3
def load_dataset_from_s3(bucket_name, file_key):
    obj = s3.get_object(Bucket=bucket_name, Key=file_key)
    data = pd.read_csv(io.BytesIO(obj['Body'].read()))
    return data

# Load dataset
dataset = load_dataset_from_s3(s3_bucket, file_key)

# Display the first few rows of the dataset
print(dataset.head())

   Id   ProductId          UserId                      ProfileName  \
0   1  B001E4KFG0  A3SGXH7AUHU8GW                       delmartian   
1   2  B00813GRG4  A1D87F6ZCVE5NK                           dll pa   
2   3  B000LQOCH0   ABXLMWJIXXAIN  Natalia Corres "Natalia Corres"   
3   4  B000UA0QIQ  A395BORC6FGVXV                             Karl   
4   5  B006K2ZZ7K  A1UQRSCLF8GW1T    Michael D. Bigham "M. Wassir"   

   HelpfulnessNumerator  HelpfulnessDenominator  Score        Time  \
0                     1                       1      5  1303862400   
1                     0                       0      1  1346976000   
2                     1                       1      4  1219017600   
3                     3                       3      2  1307923200   
4                     0                       0      5  1350777600   

                 Summary                                               Text  
0  Good Quality Dog Food  I have bought several of the Vitality canned d...  
1 

In [11]:
# Drop rows with missing values
dataset = dataset.dropna(subset=['Text', 'Score'])

# Keep only the relevant columns
dataset = dataset[['Text', 'Score']]

# Create a new column 'Sentiment' based on the rating
# For simplicity, we'll assume Score > 3 is positive, 3 is neutral, and < 3 is negative
def sentiment_label(score):
    if score > 3:
        return 'positive'
    elif score < 3:
        return 'negative'
    else:
        return 'neutral'

dataset['Sentiment'] = dataset['Score'].apply(sentiment_label)

# Display the updated dataset
print(dataset.head())

                                                Text  Score Sentiment
0  I have bought several of the Vitality canned d...      5  positive
1  Product arrived labeled as Jumbo Salted Peanut...      1  negative
2  This is a confection that has been around a fe...      4  positive
3  If you are looking for the secret ingredient i...      2  negative
4  Great taffy at a great price.  There was a wid...      5  positive


In [12]:
import re
import string
from nltk.corpus import stopwords

# Download stopwords if not already present
import nltk
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

# Function to clean text
def clean_text(text):
    # Lowercase the text
    text = text.lower()
    
    # Remove punctuation and special characters
    text = re.sub(f"[{string.punctuation}]", " ", text)
    
    # Remove stop words
    text = " ".join([word for word in text.split() if word not in stop_words])
    
    return text

# Apply text cleaning to the 'Text' column
dataset['Cleaned_Text'] = dataset['Text'].apply(clean_text)

# Display the cleaned dataset
print(dataset[['Text', 'Cleaned_Text', 'Sentiment']].head())

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/ec2-user/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


                                                Text  \
0  I have bought several of the Vitality canned d...   
1  Product arrived labeled as Jumbo Salted Peanut...   
2  This is a confection that has been around a fe...   
3  If you are looking for the secret ingredient i...   
4  Great taffy at a great price.  There was a wid...   

                                        Cleaned_Text Sentiment  
0  bought several vitality canned dog food produc...  positive  
1  product arrived labeled jumbo salted peanuts p...  negative  
2  confection around centuries light pillowy citr...  positive  
3  looking secret ingredient robitussin believe f...  negative  
4  great taffy great price wide assortment yummy ...  positive  


In [13]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Initialize the TfidfVectorizer
vectorizer = TfidfVectorizer(max_features=5000)  # You can adjust max_features as needed

# Fit and transform the cleaned text data
X = vectorizer.fit_transform(dataset['Cleaned_Text'])

# Display the shape of the transformed data
print("Shape of TF-IDF matrix:", X.shape)

Shape of TF-IDF matrix: (568454, 5000)


In [14]:
from sklearn.model_selection import train_test_split

# Define the target variable (Sentiment) and features (TF-IDF matrix)
y = dataset['Sentiment']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Display the shape of the training and testing sets
print("Training set shape:", X_train.shape)
print("Testing set shape:", X_test.shape)


Training set shape: (454763, 5000)
Testing set shape: (113691, 5000)


In [15]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score

# Initialize the Logistic Regression model
model = LogisticRegression(max_iter=1000)  # max_iter can be adjusted based on the dataset size

# Train the model
model.fit(X_train, y_train)

# Predict on the test set
y_pred = model.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print("Model Accuracy:", accuracy)

# Print a classification report
print(classification_report(y_test, y_pred))

Model Accuracy: 0.8677995619706045
              precision    recall  f1-score   support

    negative       0.74      0.68      0.71     16181
     neutral       0.51      0.18      0.27      8485
    positive       0.90      0.97      0.93     89025

    accuracy                           0.87    113691
   macro avg       0.71      0.61      0.64    113691
weighted avg       0.85      0.87      0.85    113691



In [16]:

import joblib
import tarfile

# Create a directory to save the model and vectorizer
os.makedirs('model_dir', exist_ok=True)

# Save the trained model
joblib.dump(model, 'model_dir/sentiment_model.pkl')

# Save the TF-IDF vectorizer
joblib.dump(vectorizer, 'model_dir/tfidf_vectorizer.pkl')

print("Model and vectorizer saved in 'model_dir'.")

Model and vectorizer saved in 'model_dir'.


In [17]:
# Create a tar.gz file for the model and vectorizer
with tarfile.open('sentiment_model.tar.gz', mode='w:gz') as archive:
    archive.add('model_dir/sentiment_model.pkl', arcname='sentiment_model.pkl')
    archive.add('model_dir/tfidf_vectorizer.pkl', arcname='tfidf_vectorizer.pkl')

# Define the S3 bucket and file paths
model_tar_path = 'sentiment_model.tar.gz'
s3_model_path = 'dynamic-ml-orchestration/models/sentiment_model.tar.gz'

# Upload the tar.gz file to S3
s3.upload_file(model_tar_path, s3_bucket, s3_model_path)

print(f"Model uploaded to: s3://{s3_bucket}/{s3_model_path}")

Model uploaded to: s3://pyverseai/dynamic-ml-orchestration/models/sentiment_model.tar.gz
