Load and Prepare the Data

In [1]:
import sklearn
# Report which scikit-learn version is installed in this environment.
print(sklearn.__version__)

1.6.1


In [2]:
import matplotlib

In [3]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
from matplotlib import pyplot as plt
import seaborn as sns
from sklearn.metrics import roc_curve, auc
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import GradientBoostingClassifier
import xgboost as xgb   
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report, confusion_matrix



ImportError: cannot import name 'pyplot' from 'matplotlib' (unknown location)

In [None]:

# Read the CRM sales-opportunity export into a DataFrame.
# The CSV is expected to sit in the current working directory under this exact name.
df = pd.read_csv(filepath_or_buffer='crm_sales_opportunities.csv')

NameError: name 'pd' is not defined

In [None]:

# Quick sanity check: show the first five rows of the loaded data
print(df.head(n=5))

In [None]:
# Discard every row that has a missing value in any column.
# The surviving rows keep their original index labels (the index is not renumbered).
df = df.dropna(axis=0, how='any')

In [None]:

# Columns used as model inputs (example set: demographic, firmographic and
# behavioural attributes of a lead). Adjust to match the actual dataset.
features = [
    'age',
    'industry',
    'company_size',
    'website_visits',
    'email_interactions',
    'previous_purchases',
]

In [None]:

# Label column to predict: 1 = converted, 0 = not converted (assumed encoding)
target = 'converted'
y = df[target]

# Model inputs: the selected feature columns, with any categorical column expanded
# into one-hot indicator columns. drop_first=True omits the first level of each
# categorical so the indicators are not perfectly collinear.
X = pd.get_dummies(df[features], drop_first=True)

Split Data into Training and Testing Sets

In [None]:
# Hold out 20% of the rows for testing; the fixed random_state keeps the split repeatable.
split_parts = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts

# Report the (rows, columns) shape of each partition
print(f"Training data shape: {X_train.shape}")
print(f"Testing data shape: {X_test.shape}")

In [None]:
# Derived features, added as new columns on df.
#
# NOTE(review): X / X_train / X_test were already built from `features` before this
# cell runs, so none of these columns reach the models as written. To use them, add
# the new column names to `features` and rebuild X and the train/test split.
#
# A zero denominator would yield +/-inf, which scikit-learn estimators reject.
# Zeros are mapped to NaN first so such rows surface as ordinary missing values.
# Assumes 'age' and 'company_size' are numeric columns - TODO confirm against the dataset.
df['age_to_company_size'] = df['age'] / df['company_size'].replace(0, np.nan)  # ratio of age to company size
df['total_interactions'] = df['website_visits'] + df['email_interactions']  # sum of interactions
df['previous_purchase_ratio'] = (
    df['previous_purchases'] / df['total_interactions'].replace(0, np.nan)
)  # ratio of previous purchases to interactions

Train the Model (Random Forest)

In [None]:

# Build a 100-tree random forest and fit it on the training split
# (fit() returns the estimator itself, so construction and training can be chained).
rf_model = RandomForestClassifier(random_state=42, n_estimators=100).fit(X_train, y_train)

# Predicted class labels for the held-out rows
y_pred = rf_model.predict(X_test)

# Report overall accuracy, per-class metrics, and the confusion matrix
test_accuracy = accuracy_score(y_test, y_pred)
print("Model Accuracy:", test_accuracy)
print("Classification Report:\n", classification_report(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))

Grid Search for Random Forest

In [None]:
# Candidate hyperparameter values; the grid search evaluates every combination.
param_grid = {
    'n_estimators': [50, 100, 200],   # number of trees in the forest
    'max_depth': [None, 10, 20],      # depth cap per tree (None = no cap)
    'min_samples_split': [2, 5],      # samples needed before a node may split
    'min_samples_leaf': [1, 2],       # samples that must remain in each leaf
}

# Base estimator whose hyperparameters are being tuned
rf = RandomForestClassifier(random_state=42)

# Exhaustive search scored by accuracy under 5-fold cross-validation on the
# training split; fit() returns the search object, so it is chained here.
grid_search = GridSearchCV(rf, param_grid, scoring='accuracy', cv=5).fit(X_train, y_train)

# Show the winning combination
print("Best parameters:", grid_search.best_params_)


Random Search for Random Forest

In [None]:
# Define parameter distribution for Random Search
param_dist = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5],
    'min_samples_leaf': [1, 2],
}

# Initialize Random Forest model
rf = RandomForestClassifier(random_state=42)

# Initialize RandomizedSearchCV.
# The lists above define only 3 * 3 * 2 * 2 = 36 distinct combinations. Requesting
# more iterations than that (previously n_iter=100) makes scikit-learn warn and fall
# back to evaluating all 36, i.e. a repeat of the exhaustive grid search. Sampling a
# strict subset keeps this a genuine (and cheaper) randomized search.
random_search = RandomizedSearchCV(
    estimator=rf,
    param_distributions=param_dist,
    n_iter=20,
    cv=5,
    random_state=42,
    scoring='accuracy',
)

# Fit RandomizedSearchCV
random_search.fit(X_train, y_train)

# Print best parameters
print("Best parameters from Random Search:", random_search.best_params_)

Cross-Validation

In [None]:
# Score `rf` with 5-fold cross-validation over the full X / y, one accuracy value per fold.
# NOTE(review): `rf` is whatever estimator that name was last bound to; in the cells
# above that is a forest with default hyperparameters, not a tuned one - confirm intent.
cv_scores = cross_val_score(rf, X, y, scoring='accuracy', cv=5)

# Per-fold scores followed by their average
print(f"Cross-validation scores: {cv_scores}")
print(f"Mean accuracy from cross-validation: {cv_scores.mean()}")

Lead Scoring (Assign Lead Scores)

In [None]:
# predict_proba returns one column per class for every test row.
class_probabilities = rf_model.predict_proba(X_test)

# Keep the second column as the lead score: presumably the probability of class 1
# (converted) - this holds when rf_model.classes_ is [0, 1].
y_prob = class_probabilities[:, 1]

In [None]:
# Build the lead-score table: one row per test-set lead, pairing its identifier with
# the model's predicted conversion probability.
#
# X_test.index holds index *labels* inherited from df, so the lookup must be
# label-based (.loc). Positional .iloc would pick the wrong rows - or raise
# IndexError - once dropna() has removed rows and left gaps in df's index.
# y_prob is a plain array in X_test row order, so it lines up with the selected IDs.
lead_scores = pd.DataFrame({
    'Lead_ID': df.loc[X_test.index, 'lead_id'],  # Assuming there's a 'lead_id' column for identification
    'Predicted_Probability': y_prob
})

# Sort the leads by predicted probability (score), higher probability means higher priority
lead_scores = lead_scores.sort_values(by='Predicted_Probability', ascending=False)

# Display the top leads with highest probability of conversion
print(lead_scores.head())

Visualization

In [None]:
# ROC curve for the random-forest scores on the test split, plus its area under the curve
fpr, tpr, thresholds = roc_curve(y_test, y_prob)
roc_auc = auc(fpr, tpr)
curve_label = 'ROC curve (area = %0.2f)' % roc_auc

plt.figure(figsize=(8, 6))

# Model curve, then the diagonal that a random classifier would trace
plt.plot(fpr, tpr, color='darkorange', lw=2, label=curve_label)
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')

# Axis ranges; the y-axis runs slightly past 1.0
plt.xlim(0.0, 1.0)
plt.ylim(0.0, 1.05)

# Labels, title and legend
plt.title('Receiver Operating Characteristic')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.legend(loc='lower right')

plt.show()

Scalability with Gradient Boosting and XGBoost

In [None]:
# Gradient-boosted trees with default hyperparameters, fitted on the training split
# (fit() returns the estimator, so creation and training are chained).
gb = GradientBoostingClassifier(random_state=42).fit(X_train, y_train)

# Predicted class labels for the held-out rows
y_pred_gb = gb.predict(X_test)

# Report test-set accuracy
gb_accuracy = accuracy_score(y_test, y_pred_gb)
print("Gradient Boosting Accuracy:", gb_accuracy)


In [None]:
# Convert to DMatrix (the container XGBoost's native training API works on)
dtrain = xgb.DMatrix(X_train, label=y_train)
dtest = xgb.DMatrix(X_test, label=y_test)

# Specify parameters for XGBoost
params = {
    'objective': 'binary:logistic',  # binary classification; predictions come back as probabilities
    'max_depth': 6,
    'eta': 0.3,                      # learning rate
    'eval_metric': 'logloss'
}

# Train the model for 100 boosting rounds
xg_model = xgb.train(params, dtrain, num_boost_round=100)

# Make predictions. Probabilities and hard labels are kept under separate names
# instead of rebinding one variable, and the 0.5 cut-off is applied as a single
# vectorized NumPy operation rather than a per-element Python loop.
# y_pred_xgb is now an integer array (previously a list); the metric functions
# used below accept either.
y_prob_xgb = xg_model.predict(dtest)
y_pred_xgb = (y_prob_xgb > 0.5).astype(int)

# Evaluate the model
print("XGBoost Accuracy:", accuracy_score(y_test, y_pred_xgb))


In [None]:
# Print the same two diagnostics (per-class report, confusion matrix) for each
# model's test-set predictions, in the order: random forest, gradient boosting, XGBoost.
for heading, predictions in (
    ("Random Forest Evaluation:", y_pred),
    ("Gradient Boosting Evaluation:", y_pred_gb),
    ("XGBoost Evaluation:", y_pred_xgb),
):
    print(heading)
    print(classification_report(y_test, predictions))
    print(confusion_matrix(y_test, predictions))
