<a href="https://colab.research.google.com/github/sammatiphugate/sammatiphugate/blob/main/Email_Campaign_Effectiveness_Prediction_sammatiphugate.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

NAME : SAMMATI PHUGATE

ROLL NO : DS24MS28

PROJECT TITLE : **InspireDirect Email Campaign Effectiveness Prediction**



**Problem Statement**

Goal: Build a model to classify email interactions into categories like "Ignored," "Read," or "Acknowledged," based on email and recipient features.

Impact: Improve InspireDirect's campaign engagement and conversion rates by personalizing emails based on predicted behaviors.



In [None]:
#Import Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier

from sklearn.metrics import confusion_matrix, classification_report, accuracy_score, precision_score, recall_score, f1_score


**Load Dataset**

In [None]:
import os

# Location of the campaign CSV. The default is the Colab upload path this
# notebook has always used; set the EMAIL_CAMPAIGN_CSV environment variable to
# point at another copy (local path or URL) without editing the code.
DATA_PATH = os.environ.get('EMAIL_CAMPAIGN_CSV', '/content/data_email_campaign.csv')

try:
    df = pd.read_csv(DATA_PATH)
except FileNotFoundError as exc:
    # Same exception type as before, but with a message that says how to fix it.
    raise FileNotFoundError(
        f"Dataset not found at '{DATA_PATH}'. Upload data_email_campaign.csv to that "
        "location or set the EMAIL_CAMPAIGN_CSV environment variable to its path."
    ) from exc


FileNotFoundError: [Errno 2] No such file or directory: '/content/data_email_campaign.csv'

In [None]:
# Preview the first rows to sanity-check that the CSV was parsed as expected
df.head()


In [None]:
# List the column names available for cleaning and feature engineering
df.columns


In [None]:
# Summarise column dtypes and non-null counts
df.info()


In [None]:
# Remove the Email_ID column (presumably a per-row identifier with no
# predictive value -- confirm against the data dictionary).
# errors='ignore' keeps the cell safe to re-run once the column is already gone.
df = df.drop(columns=['Email_ID'], errors='ignore')


**Data Cleaning**


In [None]:
# Count missing values per column to decide which columns need filling or dropping
df.isnull().sum()


In [None]:
# Replace missing Customer_Location entries with an explicit 'Unknown' label;
# every other column is left untouched.
df = df.fillna({'Customer_Location': 'Unknown'})


In [None]:

# Rows missing any of these numeric fields are discarded rather than imputed.
required_numeric_columns = ['Total_Past_Communications', 'Total_Links', 'Total_Images']
df = df.dropna(subset=required_numeric_columns)


In [None]:
# Re-count missing values per column after the fill/drop steps above
df.isna().sum()


In [None]:
# (rows, columns) remaining after cleaning
df.shape


**EDA**


(i) **Univariate Analysis**



In [None]:
# Number of rows in each Email_Status class
status_ax = sns.countplot(data=df, x='Email_Status')
status_ax.set_title("Target Variable Distribution")
plt.show()

(ii) **Bivariate Analysis**



In [None]:
# Spread of Subject_Hotness_Score within each Email_Status class
hotness_ax = sns.boxplot(data=df, x='Email_Status', y='Subject_Hotness_Score')
hotness_ax.set_title("Subject Score vs Email_Status")
plt.show()

In [None]:
# Bar height is seaborn's default aggregate (the mean) of
# Total_Past_Communications for each Email_Status class.
plt.figure(figsize=(12,8))
# Pass the frame by keyword, like the other plots in this notebook; a
# positional DataFrame is only accepted by newer seaborn releases.
sns.barplot(data=df, x='Email_Status', y='Total_Past_Communications')
plt.title('Total Past Communications vs Email_Status')
plt.tight_layout()
plt.show()

In [None]:
# Subject_Hotness_Score spread per Email_Campaign_Type, split by Email_Status.
plt.figure(figsize=(12,8))
# Pass the frame by keyword, like the other plots in this notebook; a
# positional DataFrame is only accepted by newer seaborn releases.
sns.boxplot(data=df, x='Email_Campaign_Type', y='Subject_Hotness_Score', hue='Email_Status')
plt.title('Subject Hotness Score by Email_Campaign_Type and Email_Status')
plt.tight_layout()
plt.show()

In [None]:
# Pairwise correlations between the numeric columns only
numeric_corr = df.select_dtypes(include=[np.number]).corr()

plt.figure(figsize=(10, 6))
sns.heatmap(numeric_corr, annot=True, cmap='coolwarm')
plt.title('Correlation Matrix')
plt.show()

In [None]:
# Side-by-side boxplots of the count-style columns to eyeball outliers
outlier_columns = ['Word_Count', 'Total_Links', 'Total_Images']
outlier_ax = sns.boxplot(data=df[outlier_columns])
outlier_ax.set_title('Boxplots for Outlier Detection')
plt.show()

**Distributions and Transformations**

In [None]:
from scipy.stats import skew
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

# Add a square-root transformed copy of Word_Count; the original column is kept.
df['Word_Count_sqrt'] = np.sqrt(df['Word_Count'])

# Skewness of the raw and the transformed column
original_skew = skew(df['Word_Count'])
transformed_skew = skew(df['Word_Count_sqrt'])
print("Skewness before transformation:", original_skew)
print("Skewness after transformation (sqrt):", transformed_skew)

# One histogram (with KDE overlay) per column, side by side
fig, ax = plt.subplots(1, 2, figsize=(12, 5))
panels = [
    ('Word_Count', 'Original Word_Count Distribution'),
    ('Word_Count_sqrt', 'Transformed (sqrt) Word_Count Distribution'),
]
for panel_ax, (column, panel_title) in zip(ax, panels):
    sns.histplot(df[column], kde=True, ax=panel_ax)
    panel_ax.set_title(panel_title)

plt.tight_layout()
plt.show()


**Feature Engineering**

In [None]:

# List of categorical features to encode
categorical_columns = ['Email_Type', 'Email_Source_Type', 'Email_Campaign_Type', 'Customer_Location', 'Time_Email_sent_Category']

# Only encode columns that are still present, so re-running this cell after the
# columns have already been replaced by their dummies is a no-op instead of a
# KeyError.
columns_to_encode = [col for col in categorical_columns if col in df.columns]

# A single get_dummies call replaces each listed column with its indicator
# columns (prefixed with the column name, first level dropped) and appends them
# after the untouched columns -- the same layout the per-column loop produced.
if columns_to_encode:
    df = pd.get_dummies(df, columns=columns_to_encode, drop_first=True)


**Split Data**

In [None]:
from sklearn.model_selection import train_test_split

# Target vector and feature matrix, copied so later edits cannot alias df
target = df['Email_Status'].copy()
features = df.drop(columns=['Email_Status']).copy()

# 80/20 split, stratified on the target so both parts keep the class balance;
# the fixed seed makes the split reproducible.
split_options = dict(test_size=0.2, stratify=target, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(features, target, **split_options)


**Logistic Regression**

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

# Standardise the features, then fit a logistic regression.
# The explicit multi_class='multinomial' argument has been removed: it is
# deprecated in recent scikit-learn releases, and with the lbfgs solver a
# target with three or more classes is already fitted as a multinomial model.
logistic_pipeline = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('log_reg', LogisticRegression(
        solver='lbfgs',
        max_iter=500,
        random_state=42  # Ensures reproducibility
    ))
])

# Fit the pipeline to the training data (the scaler is fitted on X_train only)
logistic_pipeline.fit(X_train, y_train)


**Prediction & Evaluation**

In [None]:
# Per-class probabilities for every test row (optional, kept for inspection)
y_proba = logistic_pipeline.predict_proba(X_test)

# Hard class labels used by the evaluation cell
y_pred = logistic_pipeline.predict(X_test)


In [None]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# Compute the three summaries first, then report them in the original order
log_reg_confusion = confusion_matrix(y_test, y_pred)
log_reg_report = classification_report(y_test, y_pred)  # precision, recall, F1-score
acc = accuracy_score(y_test, y_pred)

print("Confusion Matrix:")
print(log_reg_confusion)

print("\nClassification Report:")
print(log_reg_report)

print(f"Accuracy Score: {acc:.4f}")


**Decision Tree Classifier**

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix

# Default-parameter decision tree with a fixed seed; fit() returns the
# estimator, so construction and training are chained.
decision_tree_model = DecisionTreeClassifier(random_state=42).fit(X_train, y_train)

# Predictions on the held-out test set
dt_predictions = decision_tree_model.predict(X_test)

# Evaluation summaries
dt_accuracy = accuracy_score(y_test, dt_predictions)
dt_confusion = confusion_matrix(y_test, dt_predictions)
dt_report = classification_report(y_test, dt_predictions)

print("\n📊 Decision Tree Classifier Evaluation:")
print("Accuracy Score:", dt_accuracy)
print("\nConfusion Matrix:\n", dt_confusion)
print("\nClassification Report:\n", dt_report)


**Random Forest Classifier**

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# 100-tree random forest with a fixed seed; fit() returns the estimator, so
# construction and training are chained.
rf_model = RandomForestClassifier(n_estimators=100, random_state=42).fit(X_train, y_train)

# Predictions on the held-out test set
rf_predictions = rf_model.predict(X_test)

# Evaluation summaries
rf_accuracy = accuracy_score(y_test, rf_predictions)
rf_confusion = confusion_matrix(y_test, rf_predictions)
rf_report = classification_report(y_test, rf_predictions)

print("\n🌲 Random Forest Classifier Evaluation:")
print("Accuracy Score:", rf_accuracy)
print("\nConfusion Matrix:\n", rf_confusion)
print("\nClassification Report:\n", rf_report)


**K-Nearest Neighbors Classifier**

In [None]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# Step 1: Build the KNN model behind a StandardScaler.
# KNN is distance based, so unscaled features with large ranges would dominate
# the neighbour search. The pipeline fits the scaler on the training data and
# applies the same transform at prediction time.
knn_model = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('knn', KNeighborsClassifier(n_neighbors=5)),  # You can tune this later
])

# Step 2: Train the KNN model (scaling happens inside the pipeline)
knn_model.fit(X_train, y_train)

# Step 3: Predict on the test data (scaled with the training statistics)
knn_predictions = knn_model.predict(X_test)

# Step 4: Evaluate performance
print("\n🤝 K-Nearest Neighbors Classifier Evaluation:")
print("Accuracy Score:", accuracy_score(y_test, knn_predictions))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, knn_predictions))
print("\nClassification Report:\n", classification_report(y_test, knn_predictions))


**Support Vector Machine Classifier**

In [None]:
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# Step 1: Build the RBF-kernel SVM behind a StandardScaler
# (can try 'linear' or 'poly' for different kernels).
# The RBF kernel is computed from distances between samples, so features must
# be on comparable scales. The pipeline fits the scaler on the training data
# and applies the same transform at prediction time.
svm_model = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('svm', SVC(kernel='rbf', random_state=42)),
])

# Step 2: Train the SVM model (scaling happens inside the pipeline)
svm_model.fit(X_train, y_train)

# Step 3: Predict on the test data (scaled with the training statistics)
svm_predictions = svm_model.predict(X_test)

# Step 4: Evaluate model performance
print("\n🔍 Support Vector Machine (SVM) Classifier Evaluation:")
print("Accuracy Score:", accuracy_score(y_test, svm_predictions))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, svm_predictions))
print("\nClassification Report:\n", classification_report(y_test, svm_predictions))


**Cross Validation for all models**

In [None]:
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import Pipeline
import numpy as np

# Define the cross-validation function
def cross_val(model, X, y, cv=5):
    """
    Perform cross-validation on the given model and return the mean accuracy score.

    :param model: The machine learning model (or pipeline) to evaluate
    :param X: Feature matrix
    :param y: Target variable
    :param cv: Number of folds in cross-validation (default is 5)
    :return: Mean accuracy score across the folds
    """
    scores = cross_val_score(model, X, y, cv=cv, scoring='accuracy')
    return np.mean(scores)

# Dictionary of models to evaluate.
# The first three are the estimators built in the earlier cells;
# cross_val_score clones each estimator per fold, so those fitted models are
# not modified. KNN and SVM are wrapped in a StandardScaler pipeline here so
# the scaler is re-fitted on each training fold and no validation data leaks
# into the scaling statistics.
models = {
    'Logistic Regression': logistic_pipeline,
    'Decision Tree': decision_tree_model,
    'Random Forest': rf_model,
    'KNN': Pipeline(steps=[
        ('scaler', StandardScaler()),
        ('knn', KNeighborsClassifier(n_neighbors=5)),
    ]),
    'SVM': Pipeline(steps=[
        ('scaler', StandardScaler()),
        ('svm', SVC(kernel='rbf', random_state=42)),
    ]),
}

# Loop over models and calculate cross-validation score.
# Scaling is handled inside the pipelines, so every model receives the same
# raw training data.
for name, model in models.items():
    score = cross_val(model, X_train, y_train)

    # Print the result for each model
    print(f"Cross-Validation Accuracy for {name}: {score:.4f}")


**bold text**