In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, classification_report

# Load Datasets
# Read the four source tables from the working directory, in a fixed order.
accounts, products, sales_teams, sales_pipeline = map(
    pd.read_csv,
    ("accounts.csv", "products.csv", "sales_teams.csv", "sales_pipeline.csv"),
)

# Merge Datasets on Relevant Keys
# Left joins keep every pipeline row; pipeline rows whose key has no match in
# a lookup table get NaN in that table's columns.
sales_data = (
    sales_pipeline
    .merge(accounts, how="left", on="account")
    .merge(products, how="left", on="product")
    .merge(sales_teams, how="left", on="sales_agent")
)

# Feature Engineering: Convert Deal Stage to Binary Outcome
# deal_won is 1 when deal_stage is exactly "Won" and 0 for every other value
# (any other stage, and missing values, are all treated as not won).
sales_data["deal_won"] = sales_data["deal_stage"].eq("Won").astype("int64")

# Encode Categorical Features and Select Features for Training
categorical_cols = ["sector", "office_location", "subsidiary_of", "series", "sales_agent", "manager", "regional_office"]

# close_value is deliberately NOT a feature: judging by its name it is the
# amount recorded when a deal closes, so it is only known after the outcome
# and would leak the label into the model (presumably it is zero/empty unless
# the deal was won -- confirm against sales_pipeline.csv).
features = ["sector", "year_established", "revenue", "employees", "office_location", "subsidiary_of",
            "series", "sales_price", "sales_agent", "manager", "regional_office"]

# Encode on a copy of the feature columns rather than overwriting sales_data,
# so sales_data keeps the human-readable labels (e.g. the sales_agent names
# that are later written to the scored-leads CSV).
X = sales_data[features].copy()

# One fitted encoder per column is kept in `encoders` so integer codes can be
# mapped back to labels with inverse_transform. `encoder` is still left bound
# to the encoder of the last categorical column after the loop.
encoders = {}
for col in categorical_cols:
    encoder = LabelEncoder()
    # astype(str) turns missing values into the literal string "nan", which is
    # then encoded as its own category.
    X[col] = encoder.fit_transform(X[col].astype(str))
    encoders[col] = encoder

# NOTE(review): the left merges can leave NaN in the numeric columns for
# pipeline rows without a matching account/product; LogisticRegression and SVC
# reject NaN input -- confirm the data is complete or add imputation.
y = sales_data["deal_won"]

# Train-Test Split
# Hold out 20% of the rows for evaluation; the fixed seed makes the split
# reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, test_size=0.2
)

# Normalize Data
# Learn mean/variance from the training rows only, then apply the same
# transformation to both splits. By default the scaler returns plain NumPy
# arrays, so X_train / X_test no longer carry a pandas index after this point
# (y_train / y_test are untouched).
scaler = StandardScaler().fit(X_train)
X_train, X_test = scaler.transform(X_train), scaler.transform(X_test)

# Candidate classifiers, keyed by the display name used in the printed report.
# Insertion order is the training order and also decides ties (first wins).
models = {}
models["Logistic Regression"] = LogisticRegression()
models["Random Forest"] = RandomForestClassifier(n_estimators=100, random_state=42)
models["XGBoost"] = XGBClassifier(use_label_encoder=False, eval_metric='logloss')
models["SVM"] = SVC(probability=True)  # probability=True is required for predict_proba

# Train and Evaluate Each Model
# best_model holds the NAME (dictionary key) of the top scorer, not the
# estimator object; best_accuracy holds its test-set accuracy.
best_model, best_accuracy = None, 0

for model_name in models:
    model = models[model_name]
    print(f"\nTraining {model_name}...")

    # Fit on the training split and predict the held-out rows.
    y_pred = model.fit(X_train, y_train).predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)

    print(f"{model_name} Accuracy: {accuracy:.2f}")
    print(classification_report(y_test, y_pred))

    # Strictly-greater comparison: an equal score never replaces the leader.
    if accuracy > best_accuracy:
        best_model, best_accuracy = model_name, accuracy

print(f"\nBest Model: {best_model} with Accuracy: {best_accuracy:.2f}")

# Assign Lead Scores from the Best Model
# Column 1 of predict_proba is the probability of class 1 (deal won); it is
# scaled to a 0-100 score.
lead_scores = models[best_model].predict_proba(X_test)[:, 1] * 100

# Address the rows through y_test.index: y_test is still a pandas Series that
# carries the original row labels of the test split, whereas X_test was
# replaced by the scaler's output, which by default is a NumPy array with no
# .index attribute. Rows outside the test split keep NaN as their lead_score.
sales_data.loc[y_test.index, "lead_score"] = lead_scores

# Save Scored Leads
sales_data[["account", "sales_agent", "product", "lead_score"]].to_csv("lead_scores.csv", index=False)
print("Lead scores saved successfully!")
