In [12]:
import boto3
import pandas as pd
import joblib
from io import StringIO
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

In [13]:
# Location of the raw plugin-engagement CSV in S3 (read by the cells below)
bucket_name = "mybixybucket"                # source bucket
object_key = "plugin_engagement_data.csv"   # CSV object key inside that bucket

In [14]:
# S3 client; no explicit credentials or region are passed here
s3 = boto3.client('s3')

# Fetch the CSV object, read its whole body into memory, and decode it as UTF-8 text
csv_obj = s3.get_object(Key=object_key, Bucket=bucket_name)
raw_bytes = csv_obj['Body'].read()
body = raw_bytes.decode('utf-8')

In [15]:
# Wrap the decoded text in a file-like buffer so pandas can parse it as CSV
csv_buffer = StringIO(body)
df = pd.read_csv(csv_buffer)

# Preview the first five rows (bare last expression -> rich notebook display)
df.head(5)

Unnamed: 0,plugin_type,hover_duration,scroll_depth,clicks,lightbox_triggered,conversion,engaged
0,poll_widget,1.41,26.66,3,0,0,0
1,feedback_form,9.03,58.77,4,1,0,1
2,video_embed,3.95,88.57,1,1,0,0
3,poll_widget,2.74,75.9,2,1,0,0
4,feedback_form,0.51,82.59,4,0,0,1


In [16]:
# Encode & Split
# Step 1: list the column names actually present before any encoding
print("Columns in df:", list(df.columns))

Columns in df: ['plugin_type', 'hover_duration', 'scroll_depth', 'clicks', 'lightbox_triggered', 'conversion', 'engaged']


In [17]:
# Step 2: one-hot encode plugin_type, unless the column is no longer there
# (this cell overwrites df, so the guard keeps a second run from raising)
if 'plugin_type' not in df.columns:
    print("Skipping encoding—'plugin_type' not found (already encoded or named differently).")
else:
    # drop_first=True leaves out the dummy column for the first category
    df = pd.get_dummies(df, columns=['plugin_type'], drop_first=True)
    dummy_cols = [name for name in df.columns if name.startswith('plugin_type_')]
    print("Encoded 'plugin_type' into:", dummy_cols)

Encoded 'plugin_type' into: ['plugin_type_feedback_form', 'plugin_type_poll_widget', 'plugin_type_quiz_popup', 'plugin_type_video_embed']


In [18]:
# Step 3: separate the target column ('engaged') from the feature columns
y = df['engaged']
X = df.drop(columns='engaged')

# Step 4: hold out 30% of the rows for testing; the fixed seed makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=42)

In [19]:
# Step 5: report the feature names, split shapes, and training label counts
print("\n🔧 AFTER ENCODING & SPLIT")
for label, value in (
    ("Features:", X.columns.tolist()),
    ("Training set shape:", X_train.shape),
    ("Test set shape:    ", X_test.shape),
    ("Training labels:\n", y_train.value_counts()),
):
    # print() inserts the single separating space, as in a two-argument call
    print(label, value)


🔧 AFTER ENCODING & SPLIT
Features: ['hover_duration', 'scroll_depth', 'clicks', 'lightbox_triggered', 'conversion', 'plugin_type_feedback_form', 'plugin_type_poll_widget', 'plugin_type_quiz_popup', 'plugin_type_video_embed']
Training set shape: (700, 9)
Test set shape:     (300, 9)
Training labels:
 engaged
0    388
1    312
Name: count, dtype: int64


In [20]:
# -----------------------------
# Section 3: Train, Save & Verify
# -----------------------------

# Step 1: fit a logistic-regression classifier on the training split
# (iteration cap for the solver is set to 200)
model = LogisticRegression(max_iter=200)
model.fit(X=X_train, y=y_train)

In [21]:
# Step 2: score the fitted model on the held-out test split
acc = model.score(X=X_test, y=y_test)
print(f"Test accuracy: {acc:.3f}")

Test accuracy: 0.890


In [22]:
# Step 3: serialize the fitted model to a file in the current working directory
local_path = 'model.joblib'
joblib.dump(value=model, filename=local_path)
print(f" Model serialized to {local_path}")

 Model serialized to model.joblib


In [23]:
# 4) Upload the serialized model to S3; report and re-raise on failure
s3 = boto3.client('s3')

BUCKET = 'mybixybucket'
KEY = 'model/model.joblib'

try:
    s3.upload_file(local_path, BUCKET, KEY)
except Exception as e:
    # Print a short summary, then re-raise so the notebook stops here.
    # If the error were swallowed, the verification cell that follows could
    # list an object left over from an earlier run and look like a success.
    print(" Upload failed:", e)
    raise
else:
    # Only reached when upload_file returned without raising
    print(f" Uploaded to s3://{BUCKET}/{KEY}")

 Uploaded to s3://mybixybucket/model/model.joblib


In [24]:
# Step 5: list whatever is stored under the 'model/' prefix as a sanity check
print("\nVerifying bucket contents under 'model/':")
resp = s3.list_objects_v2(Bucket=BUCKET, Prefix='model/')

# The code treats a response without a 'Contents' key as "nothing matched"
if 'Contents' not in resp:
    print(" • No objects found under 'model/'")
else:
    for entry in resp['Contents']:
        print(" •", entry['Key'], f"({entry['Size']} bytes)")


Verifying bucket contents under 'model/':
 • model/model.joblib (1375 bytes)
