# In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from xgboost import XGBClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import shap

# Read the raw sales pipeline export into a DataFrame.
sales_pipeline = pd.read_csv('sales_pipeline.csv')

# Replace each categorical column with integer codes. The fitted encoder for
# every column is kept so codes can be translated to/from labels later.
label_encoders = {}
for col in ('sales_agent', 'product', 'account', 'deal_stage'):
    le = LabelEncoder()
    sales_pipeline[col] = le.fit_transform(sales_pipeline[col])
    label_encoders[col] = le

# Derived feature: whole days elapsed between engagement and close.
close_dates = pd.to_datetime(sales_pipeline['close_date'])
engage_dates = pd.to_datetime(sales_pipeline['engage_date'])
sales_pipeline['engage_to_close_days'] = (close_dates - engage_dates).dt.days

# Binary target: 1 where the encoded stage equals the code assigned to 'Won'
# (the column was label-encoded above, so the comparison is on codes).
won_code = label_encoders['deal_stage'].transform(['Won'])[0]
sales_pipeline['is_won'] = (sales_pipeline['deal_stage'] == won_code).astype(int)

# Fill missing durations with the median duration of the column.
median_days = sales_pipeline['engage_to_close_days'].median()
sales_pipeline.fillna({'engage_to_close_days': median_days}, inplace=True)

# Define Features and Target Variable
# NOTE(review): 'close_value' is presumably only known once a deal has closed;
# if so, it leaks the outcome into the features — confirm before trusting the
# reported accuracies.
features = ['sales_agent', 'product', 'account', 'engage_to_close_days', 'close_value']
X = sales_pipeline[features]
y = sales_pipeline['is_won']

# Split Data into Training and Test Sets BEFORE scaling, so that statistics of
# the held-out rows never influence the preprocessing. The row partition only
# depends on the number of rows and random_state, so it is the same partition
# as splitting the scaled matrix would give.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Scale numerical features for models like Logistic Regression & Neural Network.
# The scaler learns mean/variance from the training rows only and is then
# applied unchanged to the test rows.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

# Scaled copy of the full feature matrix, kept for any later code that still
# refers to X_scaled.
X_scaled = scaler.transform(X)

# Train Multiple AI Models
# Candidate classifiers, keyed by the display name used in the printed report.
# NOTE(review): `use_label_encoder` is reported as deprecated by recent XGBoost
# releases — confirm against the installed version.
models = {
    "Random Forest": RandomForestClassifier(n_estimators=200, random_state=42),
    "Gradient Boosting": GradientBoostingClassifier(n_estimators=200, learning_rate=0.1, random_state=42),
    "XGBoost": XGBClassifier(n_estimators=200, learning_rate=0.1, random_state=42, use_label_encoder=False, eval_metric="logloss"),
    "Logistic Regression": LogisticRegression(),
    "Neural Network (MLP)": MLPClassifier(hidden_layer_sizes=(64, 32), activation='relu', solver='adam', max_iter=500, random_state=42)
}

# Train and Evaluate Models
# Each model's test accuracy is recorded once here so that model selection
# below does not have to run predict() on every model a second time.
accuracies = {}
for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    acc = accuracy_score(y_test, y_pred)
    accuracies[name] = acc
    print(f"\n{name} Accuracy: {acc:.4f}")
    print(classification_report(y_test, y_pred))

# Select Best Model (Based on Accuracy)
# max() returns the first entry with the highest score, so ties are resolved
# in the insertion order of `models`.
# NOTE(review): the winner is chosen on the same test split used for reporting,
# so its reported accuracy is an optimistic estimate.
best_model = max(accuracies, key=accuracies.get)
final_model = models[best_model]
print(f"\nBest Model Selected: {best_model}")

# SHAP Analysis for Feature Importance
# X_train is passed as the background data for the explainer; SHAP values are
# then computed for every row of the test set.
# NOTE(review): shap.Explainer may not accept every estimator type directly
# (e.g. the MLP) — confirm it works for whichever model gets selected.
explainer = shap.Explainer(final_model, X_train)
shap_values = explainer(X_test)

# X_test is the scaler's output (an array without column labels), so the
# column names are passed explicitly; otherwise the plot axes show generic
# "Feature N" labels.
shap.summary_plot(shap_values, X_test, feature_names=features)

# Function to Predict Closure Probability for Each Opportunity
def predict_closure(account_name):
    """Return a message with the mean predicted win probability for an account.

    Args:
        account_name: Account label as it appeared in the data before label
            encoding.

    Returns:
        A formatted string with the average of the selected model's
        positive-class probabilities over all of the account's rows, or
        "Account not found in pipeline." when the label is unknown to the
        account encoder.
    """
    account_encoder = label_encoders['account']

    # The 'account' column holds integer codes after label encoding, so the
    # raw name must be checked against the encoder's known labels; comparing
    # it with the encoded column would never match.
    if account_name not in account_encoder.classes_.tolist():
        return "Account not found in pipeline."

    account_code = account_encoder.transform([account_name])[0]
    account_data = sales_pipeline[sales_pipeline['account'] == account_code]

    # Apply the same scaling the models were trained with.
    account_features = scaler.transform(account_data[features])

    # Column 1 of predict_proba is the probability of the positive class.
    prob = final_model.predict_proba(account_features)[:, 1]
    return f"Predicted deal closure probability for {account_name}: {np.mean(prob):.2f}"

# Identifying Bottlenecks in the Sales Pipeline
def identify_bottlenecks():
    """Draw a bar chart of how many opportunities sit in each deal stage.

    Counts non-null ``opportunity_id`` values per encoded ``deal_stage``,
    maps the stage codes back to their original labels, and shows the plot.
    """
    per_stage = (
        sales_pipeline.groupby('deal_stage')['opportunity_id']
        .count()
        .reset_index(name='num_opportunities')
    )

    # Swap the integer stage codes for their readable labels on the x-axis.
    stage_encoder = label_encoders['deal_stage']
    per_stage['deal_stage'] = stage_encoder.inverse_transform(per_stage['deal_stage'])

    plt.figure(figsize=(10, 5))
    sns.barplot(data=per_stage, x='deal_stage', y='num_opportunities', palette='Blues_r')
    plt.title("Bottlenecks in the Sales Pipeline")
    plt.xlabel("Sales Pipeline Stage")
    plt.ylabel("Number of Opportunities")
    plt.xticks(rotation=45)
    plt.show()


identify_bottlenecks()

# Recommend Action Plans for Sales Reps
def recommend_action(account_name, slow_deal_days=30):
    """Return a rule-based recommendation string for an account.

    Args:
        account_name: Account label as it appeared in the data before label
            encoding.
        slow_deal_days: Average engage-to-close duration (in days) above which
            the account is treated as slow-moving. Defaults to 30.

    Returns:
        One of three recommendation messages, or
        "Account not found in pipeline." when the label is unknown to the
        account encoder.
    """
    account_encoder = label_encoders['account']

    # The 'account' column holds integer codes after label encoding, so the
    # raw name must be checked against the encoder's known labels; comparing
    # it with the encoded column would never match.
    if account_name not in account_encoder.classes_.tolist():
        return "Account not found in pipeline."

    account_code = account_encoder.transform([account_name])[0]
    account_data = sales_pipeline[sales_pipeline['account'] == account_code]

    avg_close_days = account_data['engage_to_close_days'].mean()
    avg_close_value = account_data['close_value'].mean()

    # Rules are checked in priority order: slow deals first, then deal value
    # below the pipeline-wide median, otherwise the default advice.
    if avg_close_days > slow_deal_days:
        return f"Recommend: Prioritize engagement with {account_name} as deals take longer than usual ({avg_close_days:.0f} days)."
    if avg_close_value < sales_pipeline['close_value'].median():
        return f"Recommend: Offer discounts or upsells to increase deal value for {account_name}."
    return f"Recommend: Maintain regular follow-ups with {account_name}."
