In [44]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns

1. Load and Explore Data:

In [45]:
import pandas as pd
import numpy as np

# Load the main dataset
data = pd.read_csv("CRM-Contacts_clean.csv")

# Function to clean 'Geburtsjahr' column
def cleanGeburtsjahr(x):
    if (len(str(x))) == 2 and x > 24:
        return 1900 + x
    if (len(str(x))) != 4:
        return 0
    return x

# Function to transform a column to int
def transformColumnToInt(name):
    data[name] = pd.to_numeric(data[name], errors='coerce').fillna(0).astype('int')

# Replace spaces in column values with underscores
data = data.replace(' ', '_', regex=True)

# Specify columns to convert to integers
columnsToInt = ['Rentenbeitraege', 'Einreisejahr', 'Number_Of_Chats', 'Kinder', 'phone_pension',
                'temporary_right_of_residence_since', 'Anzahl_Kinder_unter_25_Jahre', 'acquired_right_of_residence']

# Convert specified columns to integers
for column in columnsToInt:
    transformColumnToInt(column)

# Clean the 'Geburtsjahr' column
data['Geburtsjahr'] = data['Geburtsjahr'].apply(cleanGeburtsjahr)

# Continue to the next steps in your project.


  data = pd.read_csv("CRM-Contacts_clean.csv")


In [46]:
# Create the feature matrix (X) and target variable (y)
selected_features = [
    'Geburtsjahr', 'Einreisejahr', 'phone_net_income', 'receives_support_from_job_center',
    'basis_for_naturalization_check', 'Familienstand', 'Rentenbeitraege', 'Number_Of_Chats',
    'completed_job_training', 'Kinder', 'Rente', 'phone_pension', 'utm_campaign', 'Visitor_Score',
    'Other_State', 'Anzahl_Kinder_unter_25_Jahre', 'acquired_right_of_residence', 'language_certificate',
    'Minijob', 'Email_Opt_Out', 'Integrationsnachweis', 'application_permanent_right_of_residence',
    'Was_wollen_Sie', 'basis_for_naturalization', 'graduation', 'asylum_status'
]

X = data[selected_features]


# Preprocess the 'phone_net_income' column to extract numeric values
X.loc[:, 'phone_net_income'] = X['phone_net_income'].str.extract('(\d+)').astype(float)


# Identify categorical columns
categorical_columns = ['Familienstand', 'basis_for_naturalization_check', 'Minijob', 'asylum_status']

# One-hot encode categorical columns
X = pd.get_dummies(X, columns=categorical_columns, drop_first=True)

y = data['sales']



In [61]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier  # You can choose another classifier if needed

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Choose a machine learning algorithm (e.g., Random Forest)
model = RandomForestClassifier(random_state=42)

# Train the model on the training data
model.fit(X_train, y_train)

# Make predictions on the test data
y_pred = model.predict(X_test)

# Evaluate the model's performance
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted')
recall = recall_score(y_test, y_pred, average='weighted')
f1 = f1_score(y_test, y_pred, average='weighted')
confusion = confusion_matrix(y_test, y_pred)

# Display the evaluation results
print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)
print("\nConfusion Matrix:\n", confusion)
print("\nClassification Report:\n", classification_report(y_test, y_pred))


Accuracy: 0.9143421664342166
Precision: 0.8958307851037359
Recall: 0.9143421664342166
F1 Score: 0.9040858025143292

Confusion Matrix:
 [[7664  202   11    0    1    0]
 [ 407  202    6    2    0    0]
 [  50   43    1    0    0    0]
 [   5    8    0    0    0    0]
 [   1    0    0    0    0    0]
 [   0    1    0    0    0    0]]

Classification Report:
               precision    recall  f1-score   support

           0       0.94      0.97      0.96      7878
           1       0.44      0.33      0.38       617
           2       0.06      0.01      0.02        94
           3       0.00      0.00      0.00        13
           4       0.00      0.00      0.00         1
           5       0.00      0.00      0.00         1

    accuracy                           0.91      8604
   macro avg       0.24      0.22      0.23      8604
weighted avg       0.90      0.91      0.90      8604



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [62]:
from sklearn.model_selection import GridSearchCV

# Define hyperparameter grid
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5, 10],
    # Add other hyperparameters as needed
}

# Create a grid search object
grid_search = GridSearchCV(model, param_grid, cv=5, n_jobs=-1)

# Fit the grid search to your data
grid_search.fit(X_train, y_train)

# Get the best parameters and best estimator
best_params = grid_search.best_params_
best_model = grid_search.best_estimator_

# Use the best model for predictions and evaluation
y_pred = best_model.predict(X_test)
# Continue with evaluation as in the previous step


In [63]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report, roc_auc_score

# Evaluate the model's performance on the test set
y_pred = model.predict(X_test)

# Accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy:.2f}')

# Precision, Recall, F1-score
precision = precision_score(y_test, y_pred, average='weighted')
recall = recall_score(y_test, y_pred, average='weighted')
f1 = f1_score(y_test, y_pred, average='weighted')
print(f'Precision: {precision:.2f}')
print(f'Recall: {recall:.2f}')
print(f'F1-score: {f1:.2f}')

# Confusion Matrix
confusion = confusion_matrix(y_test, y_pred)
print('Confusion Matrix:')
print(confusion)

# Classification Report
class_report = classification_report(y_test, y_pred)
print('Classification Report:')
print(class_report)

# ROC-AUC Score (for binary classification)
if len(np.unique(y_test)) == 2:
    y_prob = model.predict_proba(X_test)[:, 1]
    roc_auc = roc_auc_score(y_test, y_prob)
    print(f'ROC-AUC Score: {roc_auc:.2f}')


Accuracy: 0.91
Precision: 0.90
Recall: 0.91
F1-score: 0.90
Confusion Matrix:
[[7664  202   11    0    1    0]
 [ 407  202    6    2    0    0]
 [  50   43    1    0    0    0]
 [   5    8    0    0    0    0]
 [   1    0    0    0    0    0]
 [   0    1    0    0    0    0]]
Classification Report:
              precision    recall  f1-score   support

           0       0.94      0.97      0.96      7878
           1       0.44      0.33      0.38       617
           2       0.06      0.01      0.02        94
           3       0.00      0.00      0.00        13
           4       0.00      0.00      0.00         1
           5       0.00      0.00      0.00         1

    accuracy                           0.91      8604
   macro avg       0.24      0.22      0.23      8604
weighted avg       0.90      0.91      0.90      8604



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [64]:
# Feature Importance Analysis (for RandomForestClassifier)
if isinstance(model, RandomForestClassifier):
    feature_importance = model.feature_importances_
    feature_names = X.columns
    importance_dict = dict(zip(feature_names, feature_importance))
    sorted_importance = sorted(importance_dict.items(), key=lambda x: x[1], reverse=True)

    # Print feature importance
    print('Feature Importance:')
    for feature, importance in sorted_importance:
        print(f'{feature}: {importance:.2f}')


Feature Importance:
Visitor_Score: 0.51
phone_pension: 0.11
Einreisejahr: 0.07
acquired_right_of_residence: 0.06
Anzahl_Kinder_unter_25_Jahre: 0.04
Minijob_kein_Minijob_vorhanden: 0.04
Familienstand_ledig: 0.04
Kinder: 0.04
Rentenbeitraege: 0.03
Familienstand_verheiratet_(anerkannt): 0.02
asylum_status_Flüchtlingseigenschaft: 0.01
Minijob_unbefristet: 0.01
asylum_status_Subsidiärer_Schutz: 0.01
Number_Of_Chats: 0.01
Email_Opt_Out: 0.00
basis_for_naturalization_check_9_Abs._2: 0.00
basis_for_naturalization_check_26_Abs._4: 0.00
basis_for_naturalization_check_26_Abs._3_Satz_1: 0.00
Geburtsjahr: 0.00
basis_for_naturalization_check_26_Abs._3_Satz_3: 0.00
basis_for_naturalization_check_18c_Abs._1_Satz_1_iVm_Satz_2: 0.00
basis_for_naturalization_check_18c_Abs._2: 0.00
basis_for_naturalization_check_35_Abs._1_Satz_1: 0.00
basis_for_naturalization_check_18c_Abs._3: 0.00
basis_for_naturalization_check_35_Abs._1_Satz_2: 0.00
basis_for_naturalization_check_28_Abs._2_Satz_1: 0.00
basis_for_natural

In [65]:
# Train the final model on the entire dataset (assuming you have already tuned hyperparameters)
final_model = RandomForestClassifier(random_state=42)  # Replace with your final model
final_model.fit(X, y)  # Use the entire dataset X and y


In [67]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier  # You can choose another classifier if needed
import joblib  # To save the trained model

# Assuming you have already prepared your dataset as X (features) and y (target labels)

# Split the data into training and testing sets (at least 20% for testing)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Choose a machine learning algorithm (e.g., Random Forest)
model = RandomForestClassifier(random_state=42)

# Train the model on the training data
model.fit(X_train, y_train)

# Evaluate the model's performance on the test data
accuracy = model.score(X_test, y_test)
print(f'Accuracy: {accuracy:.2f}')

# Save the trained model for future predictions
joblib.dump(model, 'sales_prediction_model.pkl')


Accuracy: 0.91


['sales_prediction_model.pkl']