<a href="https://colab.research.google.com/github/solomonmawira-dotcom/Diabetes_Patients_Prediction.ipynb/blob/main/digital_marketing_data_csv.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily enumerate every file beneath the Kaggle input directory as a full path.
kaggle_files = (
    os.path.join(dirname, filename)
    for dirname, _, filenames in os.walk('/kaggle/input')
    for filename in filenames
)
# Print one path per line; nothing is printed when the directory is absent or empty.
for path in kaggle_files:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All"
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Smart Ad Conversion Predictor
# AI Engineering Beginner Series: Final Project (Classification)
# Purpose: Predict whether a digital ad campaign will convert (binary classification).
# This notebook demonstrates the full pipeline: EDA, preprocessing, modeling (logistic regression + random forest), evaluation, export for deployment, and pointers for Streamlit deployment.

# Dataset: `digital_marketing_data.csv` (expected columns include clicks, impressions, spent, cpc, ctr, platform/ad_type, converted/Converted)


In [None]:
# Run this cell in Colab to ensure required libraries are present
!pip install -q scikit-learn pandas matplotlib seaborn joblib
print("Installed/checked basic dependencies.")


In [None]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import joblib

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (accuracy_score, confusion_matrix, classification_report, roc_auc_score, roc_curve)

# Select the seaborn grid style and colour palette for the plots, then confirm setup is done.
sns.set(palette="muted", style="whitegrid")
print("Imports ready.")


In [None]:
# Colab-friendly loader. Lookup order:
#   1. the Kaggle input path,
#   2. a file with the expected name in the working directory,
#   3. an interactive upload dialog (only available inside Google Colab).
DATA_KAGGLE_PATH = '/kaggle/input/predict-conversion-in-digital-marketing-dataset/digital_marketing_data.csv'
LOCAL_NAME = 'digital_marketing_data.csv'
MISSING_DATASET_MESSAGE = "Dataset not found. Place 'digital_marketing_data.csv' in working directory, or upload it in Colab."

if os.path.exists(DATA_KAGGLE_PATH):
    print("Loading dataset from Kaggle path.")
    df = pd.read_csv(DATA_KAGGLE_PATH)
elif os.path.exists(LOCAL_NAME):
    print("Loading dataset from local file:", LOCAL_NAME)
    df = pd.read_csv(LOCAL_NAME)
else:
    # Guard only the import: outside Colab the module does not exist, which really
    # does mean "no dataset available". Errors raised later (upload failures, CSV
    # parse errors) are left to propagate so they are not misreported as a missing file.
    try:
        from google.colab import files
    except ImportError as err:
        raise FileNotFoundError(MISSING_DATASET_MESSAGE) from err

    print("Please upload the dataset file (digital_marketing_data.csv) via the dialog.")
    uploaded = files.upload()
    if not uploaded:
        # The dialog was dismissed without choosing a file.
        raise FileNotFoundError(MISSING_DATASET_MESSAGE)
    # pick first uploaded file
    uploaded_filename = next(iter(uploaded))
    df = pd.read_csv(uploaded_filename)

print("Loaded dataframe with shape:", df.shape)
df.head()


In [None]:
# Basic EDA: structure, numeric summary, and missing-value counts.
print("=== Data Info ===")
# DataFrame.info() prints its report directly and returns None, so wrapping it in
# display() would render a stray "None" underneath the report.
df.info()

print("\n=== Numeric Summary ===")
display(df.describe())

print("\n=== Missing values per column ===")
# Only the 20 columns with the most missing values are shown.
display(df.isnull().sum().sort_values(ascending=False).head(20))


In [None]:
# Try to detect a target column automatically; common names: Converted, converted, target, is_converted
# NOTE(review): 'conversion' was added as a candidate because the Kaggle dataset path used
# when loading the data names a conversion-prediction dataset; confirm against the real columns.
possible_targets = ['converted','Converted','target','is_converted','Converted_flag','converted_flag','conversion']

# Lower-cased once so the comparison is case-insensitive without rebuilding the list per column.
known_target_names = {p.lower() for p in possible_targets}

# The first column (in file order) whose name matches wins. str() keeps non-string
# column labels (e.g. integers from a header-less CSV) from raising on .lower().
target_col = next((c for c in df.columns if str(c).lower() in known_target_names), None)

# fallback: search for binary-like columns
# Exactly two distinct non-null values are required: a constant or all-null column
# cannot serve as a classification target.
# NOTE(review): this takes the first binary column in file order, which may be an
# ordinary binary feature; check the printed name below.
if target_col is None:
    target_col = next((c for c in df.columns if df[c].dropna().nunique() == 2), None)

print("Detected target column:", target_col)
if target_col is not None:
    print(df[target_col].value_counts(dropna=False))
else:
    print("No target detected automatically. Edit the notebook to set target_col = 'YourColumnName'")


In [None]:
# --- USER ACTION: if target_col detected above is None, set it manually here:
# target_col = 'Converted'
# -----------------------------------------
if 'target_col' not in globals() or target_col is None:
    raise ValueError("Target column not set. Edit the notebook and set `target_col = 'Converted'` to your dataset's target column name.")


def _looks_like_id(col):
    """Return True when a column name reads as an identifier (id, user_id, id_user, CustomerID).

    A plain substring test for 'id' would also match ordinary feature names such as
    'paid', 'bid' or 'video_views', so only whole-token / suffix forms are accepted.
    """
    name = str(col)
    lowered = name.lower()
    return (
        lowered == 'id'
        or lowered.endswith('_id')
        or lowered.startswith('id_')
        or name.endswith(('ID', 'Id'))
    )


# Work on a copy so the raw `df` stays intact and this cell can be re-run safely.
data = df.copy()

# 1) Drop rows where target is missing
data = data.dropna(subset=[target_col])

# 2) Drop simple id-like columns (never the target itself).
#    This happens BEFORE the column lists are built, so the lists below can only
#    contain columns that still exist in `data`.
id_cols = [
    c for c in data.columns
    if str(c).lower() != str(target_col).lower() and _looks_like_id(c)
]
print("Dropping id-like columns:", id_cols)
data = data.drop(columns=id_cols, errors='ignore')

# 3) Identify numeric and categorical FEATURE columns.
#    The target is excluded from both lists so it is neither imputed nor
#    label-encoded here; it is normalised separately in step 6.
feature_frame = data.drop(columns=[target_col])
num_cols = feature_frame.select_dtypes(include=[np.number]).columns.tolist()
cat_cols = feature_frame.select_dtypes(include=['object', 'category']).columns.tolist()

print("Numerical columns (sample):", num_cols[:10])
print("Categorical columns (sample):", cat_cols[:10])

# 4) Fill missing numeric with median and categorical with 'Unknown'
for c in num_cols:
    if data[c].isnull().any():
        data[c] = data[c].fillna(data[c].median())
for c in cat_cols:
    if data[c].isnull().any():
        # Cast to object first: a pandas Categorical rejects a fill value that is
        # not already one of its categories.
        data[c] = data[c].astype(object).fillna('Unknown')

# 5) Encode categorical features quickly using LabelEncoder (fast for baseline).
#    One encoder per column is kept in `label_encoders` so the mapping can be
#    reused later; `le` still ends up bound to the most recently fitted encoder.
label_encoders = {}
le = LabelEncoder()
for c in cat_cols:
    try:
        le = LabelEncoder()
        data[c] = le.fit_transform(data[c].astype(str))
        label_encoders[c] = le
    except Exception as e:
        # Best effort: report the problem and leave the column as it was.
        print("Encode error:", c, e)

# 6) Normalize/convert target to 0/1 if needed
if set(data[target_col].dropna().unique()) - {0,1}:
    if pd.api.types.is_numeric_dtype(data[target_col]):
        # Numeric but not 0/1: values strictly above the median become the positive class.
        data[target_col] = (data[target_col] > data[target_col].median()).astype(int)
    else:
        # map common true-like values
        mapping = {}
        for val in data[target_col].unique():
            sval = str(val).strip().lower()
            if sval in ['1','true','yes','converted','y']:
                mapping[val] = 1
            elif sval in ['0','false','no','n']:
                mapping[val] = 0
        if mapping:
            # Labels that matched neither list map to NaN and are counted as 0.
            data[target_col] = data[target_col].map(mapping).fillna(0).astype(int)
        else:
            # No recognisable labels: fall back to alphabetical integer codes using a
            # fresh encoder, so the feature encoders stored above are not refitted.
            le = LabelEncoder()
            data[target_col] = le.fit_transform(data[target_col].astype(str))
            label_encoders[target_col] = le

print("Preprocessing complete. Shape:", data.shape)


In [None]:
# Separate the label from the predictors.
y = data[target_col].astype(int)
X = data.drop(columns=[target_col])

# Hold out 20% of the rows for testing. Stratification is requested only when the
# label has more than one class; otherwise the default (no stratification) applies.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y if y.nunique() > 1 else None,
)

# Scaling for Logistic Regression: the scaler is fitted on the training split only
# and then applied to both splits.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Logistic Regression baseline
lr = LogisticRegression(max_iter=1000).fit(X_train_scaled, y_train)

# Hard class predictions and the predicted probability of class 1 on the test split.
y_pred_lr = lr.predict(X_test_scaled)
y_proba_lr = lr.predict_proba(X_test_scaled)[:, 1]

lr_accuracy = accuracy_score(y_test, y_pred_lr)
print("Logistic Regression Accuracy:", lr_accuracy)
lr_roc_auc = roc_auc_score(y_test, y_proba_lr)
print("Logistic Regression ROC AUC:", lr_roc_auc)
lr_report = classification_report(y_test, y_pred_lr)
print("\nClassification Report:\n", lr_report)


In [None]:
# Confusion matrix of the logistic-regression predictions, drawn as an annotated heatmap.
cm = confusion_matrix(y_test, y_pred_lr)
cm_fig, cm_ax = plt.subplots(figsize=(5, 4))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', ax=cm_ax)
cm_ax.set_title("Confusion Matrix (Logistic Regression)")
cm_ax.set_xlabel("Predicted")
cm_ax.set_ylabel("Actual")
plt.show()

# ROC curve for the same model, with the diagonal drawn as a reference line.
fpr, tpr, _ = roc_curve(y_test, y_proba_lr)
lr_auc = roc_auc_score(y_test, y_proba_lr)
roc_fig, roc_ax = plt.subplots(figsize=(6, 4))
roc_ax.plot(fpr, tpr, label=f'LR (AUC = {lr_auc:.3f})')
roc_ax.plot([0, 1], [0, 1], '--', color='grey')
roc_ax.set_xlabel('False Positive Rate')
roc_ax.set_ylabel('True Positive Rate')
roc_ax.set_title('ROC Curve (Logistic Regression)')
roc_ax.legend()
plt.show()


In [None]:
# Random Forest baseline: 200 trees, fixed seed, all CPU cores.
# It is fitted on the unscaled feature split rather than the standardized one.
rf = RandomForestClassifier(
    n_estimators=200,
    random_state=42,
    n_jobs=-1,
).fit(X_train, y_train)

# Predicted probability of class 1 and hard class predictions on the test split.
y_proba_rf = rf.predict_proba(X_test)[:, 1]
y_pred_rf = rf.predict(X_test)

rf_accuracy = accuracy_score(y_test, y_pred_rf)
print("Random Forest Accuracy:", rf_accuracy)
rf_roc_auc = roc_auc_score(y_test, y_proba_rf)
print("Random Forest ROC AUC:", rf_roc_auc)
rf_report = classification_report(y_test, y_pred_rf)
print("\nClassification Report (RF):\n", rf_report)
