In [16]:
from google.colab import drive
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [17]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import roc_auc_score

In [18]:
# Load the data
data = pd.read_csv("/content/gdrive/MyDrive/Colab Notebooks/Customer Conversion Prediction.csv")

In [19]:
# Convert categorical variables into numerical variables
le = LabelEncoder()
data["job"] = le.fit_transform(data["job"])
data["marital"] = le.fit_transform(data["marital"])
data["education_qual"] = le.fit_transform(data["education_qual"])
data["call_type"] = le.fit_transform(data["call_type"])
data["prev_outcome"] = le.fit_transform(data["prev_outcome"])
data["mon"] = le.fit_transform(data["mon"])
data["y"] = le.fit_transform(data["y"])


In [20]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
import numpy as np


# Split the dataset into train and test sets
X_train, X_test, y_train, y_test = train_test_split(data.drop('y', axis=1), data['y'], test_size=0.3, random_state=42)

# Train a logistic regression model on the training set
clf = LogisticRegression(random_state=42, max_iter=1000).fit(X_train, y_train)

# Evaluate the performance of the model using AUROC
y_pred_proba = clf.predict_proba(X_test)[:,1]  # probability of positive class
auroc = roc_auc_score(y_test, y_pred_proba)

# Print AUROC
print("AUROC: {:.2f}".format(auroc))

# Analyze the model to identify important features
if hasattr(clf, 'coef_'):
    # If the model has coefficients, use them to compute feature importance
    feature_importance = np.abs(clf.coef_).ravel()
else:
    # Otherwise, use the feature importance attribute of the model
    feature_importance = clf.feature_importances_

# Sort features in descending order of importance
feature_order = np.argsort(feature_importance)[::-1]

# Print top 10 features and their importance
print("Top 10 features:")
for i in range(10):
    print("{}: {:.2f}".format(X_test.columns[feature_order[i]], feature_importance[feature_order[i]]))


AUROC: 0.85
Top 10 features:
call_type: 0.74
marital: 0.32
education_qual: 0.27
prev_outcome: 0.13
num_calls: 0.13
age: 0.02
job: 0.02
mon: 0.02
day: 0.01
dur: 0.00


In [21]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score
import numpy as np

# Split the dataset into train and test sets
X_train, X_test, y_train, y_test = train_test_split(data.drop('y', axis=1), data['y'], test_size=0.3, random_state=42)

# Train a Random Forest classifier on the training set
clf = RandomForestClassifier(n_estimators=100, random_state=42).fit(X_train, y_train)

# Evaluate the performance of the model using AUROC
y_pred_proba = clf.predict_proba(X_test)[:,1]  # probability of positive class
auroc = roc_auc_score(y_test, y_pred_proba)

# Print AUROC
print("AUROC: {:.2f}".format(auroc))

# Analyze the model to identify important features
feature_importance = clf.feature_importances_

# Sort features in descending order of importance
feature_order = np.argsort(feature_importance)[::-1]

# Print top 10 features and their importance
print("Top 10 features:")
for i in range(10):
    print("{}: {:.2f}".format(X_test.columns[feature_order[i]], feature_importance[feature_order[i]]))


AUROC: 0.92
Top 10 features:
dur: 0.34
age: 0.14
day: 0.13
mon: 0.11
prev_outcome: 0.08
job: 0.06
num_calls: 0.05
education_qual: 0.03
marital: 0.03
call_type: 0.03


In [22]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import roc_auc_score
import numpy as np

# Split the dataset into train and test sets
X_train, X_test, y_train, y_test = train_test_split(data.drop('y', axis=1), data['y'], test_size=0.3, random_state=42)

# Train a Gradient Boosting classifier on the training set
clf = GradientBoostingClassifier(n_estimators=100, random_state=42).fit(X_train, y_train)

# Evaluate the performance of the model using AUROC
y_pred_proba = clf.predict_proba(X_test)[:,1]  # probability of positive class
auroc = roc_auc_score(y_test, y_pred_proba)

# Print AUROC
print("AUROC: {:.2f}".format(auroc))

# Analyze the model to identify important features
feature_importance = clf.feature_importances_

# Sort features in descending order of importance
feature_order = np.argsort(feature_importance)[::-1]

# Print top 10 features and their importance
print("Top 10 features:")
for i in range(10):
    print("{}: {:.2f}".format(X_test.columns[feature_order[i]], feature_importance[feature_order[i]]))


AUROC: 0.92
Top 10 features:
dur: 0.48
prev_outcome: 0.19
mon: 0.16
age: 0.07
call_type: 0.06
day: 0.02
education_qual: 0.01
marital: 0.01
num_calls: 0.01
job: 0.00


In [24]:
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import roc_auc_score
from sklearn.feature_selection import RFE

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train an SVM model with RBF kernel
clf = SVC(kernel='rbf', probability=True)
clf.fit(X_train, y_train)

# Use RFE for feature selection
selector = RFE(clf, n_features_to_select=10, step=1)
selector.fit(X_train, y_train)

# Make predictions on the test set
y_pred = selector.predict(X_test)
y_prob = selector.predict_proba(X_test)[:, 1]

# Calculate AUROC
auroc = roc_auc_score(y_test, y_prob)
print('AUROC score:', auroc)

# Identify top 10 important features using RFE
important_features = X.columns[selector.get_support()].tolist()
print('Top 10 important features:', important_features)


AUROC score: 0.7168085276092854
Top 10 important features: ['age', 'job', 'marital', 'education_qual', 'call_type', 'day', 'mon', 'dur', 'num_calls', 'prev_outcome']
