In [None]:
!pip install pandas scikit-learn matplotlib seaborn

In [None]:
import os
import pandas as pd

# Make ../data the working directory. Every later relative path (including the
# re-read of 'data_set.csv' in the Predict section) resolves against it, so this
# must run before any file is opened.
os.chdir("../data")
# Main data set that is cleaned, modelled and scored in this notebook.
df_1 = pd.read_csv('data_set.csv')
# Companion file; in this notebook it is only printed for reference.
df_2 = pd.read_csv('column_info.csv')

In [None]:
# Preview the first rows; a bare expression uses the notebook's rich table
# rendering instead of a plain-text dump of the frame.
df_1.head()

In [None]:
# Show the contents of column_info.csv as a rich table instead of plain text.
df_2

In [None]:
# Drop index column ('Unnamed: 0'); it is not used as a model feature.
# errors='ignore' keeps the cell re-runnable: without it a second execution
# raises KeyError because the column is already gone.
df_1 = df_1.drop(columns=['Unnamed: 0'], errors='ignore')

In [None]:
# Check datatypes: list the dtype pandas inferred for every column of df_1
df_1.dtypes

In [None]:
# Check for missings: number of NaN values per column
df_1.isna().sum()

In [None]:
# See column datatypes
# NOTE(review): same expression as the earlier "Check datatypes" cell; df_1 is
# not modified in between, so this cell shows the same output again.
df_1.dtypes

# 2. Preprocessing

In [None]:
# Numeric columns whose missing values are replaced by the column median.
cols = [
    'BOUWJAAR_PAND',
    'VLOEROPPERVLAK_VERBLIJFSOBJECT',
    'age',
    'electricity_annual_consumption_estimated_offpeak',
    'electricity_annual_consumption_estimated_peak',
    'electricity_annual_consumption_estimated_total',
    'electricity_last_contract_annual_consumption_estimated_offpeak',
    'electricity_last_contract_annual_consumption_estimated_peak',
    'electricity_last_contract_annual_consumption_estimated_total',
    'gas_annual_consumption_estimated',
    'gas_last_contract_annual_consumption_estimated'
]

# Fill each listed column that exists in df_1 with its own median; columns
# missing from the frame are skipped, as before.
# NOTE(review): the medians are computed on the full data set, i.e. before the
# train/test split further down, so test rows influence the imputed values.
for col in cols:
    if col in df_1.columns:
        df_1[col] = df_1[col].fillna(df_1[col].median())

In [None]:
# Check if numericals don't have missings: NaN count per numeric column, which
# is expected to be 0 after the median fill.
# NOTE(review): relies on `cols` still holding the numeric column list; `cols`
# is reassigned further down for the boolean columns.
df_1[cols].isna().sum()

In [None]:
from matplotlib.pyplot import figure, title, xlabel, ylabel, show
import seaborn as sns

def plot_cols(data, column):
    """Describe and plot one column of a DataFrame.

    For a numeric column this prints its summary statistics and shows two
    figures: a 20-bin histogram with a KDE overlay and a box plot. For any
    other dtype it only prints a message saying the column is not numeric.

    Args:
        data: DataFrame holding the column.
        column: Name of the column to inspect.

    Returns:
        None.
    """
    values = data[column]

    # Guard clause: nothing to plot for non-numeric data.
    if not pd.api.types.is_numeric_dtype(values):
        print(f"{column} is not a numeric column.")
        return

    # Summary statistics
    print(values.describe())

    # Distribution as a histogram
    figure(figsize=(10, 6))
    sns.histplot(values, kde=True, bins=20)
    title(f'Histogram of {column}')
    xlabel(column)
    ylabel('Frequency')
    show()

    # Spread and outliers as a box plot
    figure(figsize=(10, 6))
    sns.boxplot(x=values)
    title(f'Box Plot of {column}')
    xlabel(column)
    show()

In [None]:
# Print summary statistics and draw a histogram and a box plot for every column
# in `cols` (the numeric column list at this point in the notebook).
for col in cols:
    plot_cols(df_1, col)

In [None]:
# Missing categorical values get the explicit placeholder "onbekend"
# (Dutch for "unknown") so they end up as a category of their own.
categorical_cols = ['electricity_last_contract_term', 'province']
df_1[categorical_cols] = df_1[categorical_cols].fillna("onbekend")

In [None]:
# Check if categoricals don't have missings (both counts are expected to be 0
# after filling with "onbekend")
df_1[['electricity_last_contract_term','province']].isna().sum()

In [None]:
# Boolean columns whose missing values are replaced by the most frequent value.
# NOTE(review): 'bought_toon' is the prediction target (it is split off as `y`
# in the Train section); imputing it invents labels. Consider dropping rows
# with a missing target instead.
cols = [
    'bought_toon',
    'has_active_boiler_rent_contract',
    'has_active_electricity_contract',
    'has_phone_number'
]

for col in cols:
    if col in df_1.columns:
        modes = df_1[col].mode()
        # mode() is empty when the column holds only NaN; leave such a column
        # untouched instead of failing on the lookup of the first mode.
        if not modes.empty:
            df_1[col] = df_1[col].fillna(modes.iloc[0])

In [None]:
# Check if booleans don't have missings
# `cols` is the boolean column list defined in the previous cell.
df_1[cols].isna().sum()

In [None]:
# Check if there still is any missing data: NaN count per column over the
# whole frame
df_1.isna().sum()

# 3. Train

In [None]:
# Import libraries
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, ConfusionMatrixDisplay

In [None]:
# One-hot encode the two categorical columns; drop_first=True leaves out the
# first level of each so the dummy columns are not perfectly collinear.
dummy_source_cols = ['electricity_last_contract_term', 'province']
data = pd.get_dummies(df_1, columns=dummy_source_cols, drop_first=True)

In [None]:
# Separate the target variable (y) from the feature matrix (X)
y = data['bought_toon']
X = data.drop(columns=['bought_toon'])

In [None]:
# Split the data into training and testing sets.
# random_state pins the shuffle so the reported metrics are reproducible on
# Restart & Run All; stratify=y keeps the bought/not-bought ratio the same in
# both parts.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Train a logistic regression model, set iterations high, because data is unscaled.
# class_weight='balanced' reweights the classes inversely to their frequency.
model = LogisticRegression(max_iter=10000, class_weight='balanced')
model.fit(X_train, y_train)

# Make predictions on the test set
y_pred = model.predict(X_test)

# 4. Evaluate

In [None]:
# Imported here because this cell calls plt.show(), while the notebook's own
# `import matplotlib.pyplot as plt` only appears in a later cell; without this
# line a fresh Restart & Run All fails with NameError.
import matplotlib.pyplot as plt

# Evaluate the model performance on the held-out test set
accuracy = accuracy_score(y_test, y_pred)
conf_matrix_var = confusion_matrix(y_test, y_pred, labels=[False,  True])
# zero_division=1 reports undefined precision/recall as 1 instead of warning.
classification_rep = classification_report(y_test, y_pred, zero_division=1)

print(f'Accuracy: {accuracy}')
print('Classification Report:')
print(classification_rep)
print('Confusion Matrix:')
disp = ConfusionMatrixDisplay(confusion_matrix=conf_matrix_var, display_labels=[False,  True])
disp.plot()
plt.show()

In [None]:
import matplotlib.pyplot as plt

In [None]:
# Optional, disabled by default: persist the trained model with pickle.
# NOTE(review): this would write 'logistic_regression_model_v3.pkl', while the
# Predict section loads 'logistic_regression_model_v4_FINAL_FINAL.pkl' - confirm
# which file name is intended before re-enabling.
# import pickle

# Save the model to disk
# os.chdir("../models")
# with open('logistic_regression_model_v3.pkl', 'wb') as file:
#     pickle.dump(model, file)

# 5. Predict

In [None]:
# os.chdir("../models") # UNCOMMENT THIS TO LOAD IN A SAVED MODEL (IF IT DOES NOT WORK IS SCIKIT SET TO RIGHT VERSION?)
import os
import pickle  # not imported anywhere else: the only other import is commented out

MODEL_FILE = 'logistic_regression_model_v4_FINAL_FINAL.pkl'

# SECURITY: pickle.load can execute arbitrary code contained in the file; only
# load model files that come from a trusted source.
if os.path.isfile(MODEL_FILE):
    with open(MODEL_FILE, 'rb') as file:
        model = pickle.load(file)
else:
    # Keep the notebook runnable top to bottom when no saved model is present
    # in the working directory: continue with the model trained above.
    print(f"{MODEL_FILE} not found in {os.getcwd()}; keeping the model trained above.")

In [None]:
# os.chdir("../data") # UNCOMMENT THIS TO LOAD IN RAW DATA
# Re-read the untouched CSV: df_1 had its 'Unnamed: 0' column dropped earlier,
# and that column is needed below as the customer id.
# The relative path resolves against the current working directory.
df_1_raw = pd.read_csv('data_set.csv')

In [None]:
# Get list of customer ID's
# Pick the IDs via the processed frame `data`, not the raw target column: the
# predictions below are made for data[data['bought_toon'] == False], and in
# `data` missing targets were imputed. Filtering the raw column would skip
# those rows, so IDs and predictions would no longer line up row for row.
# Assumes `data` still carries the row index of the CSV (no rows were dropped
# or reordered during preprocessing).
prospect_index = data.index[data['bought_toon'] == False]
customers = df_1_raw.loc[prospect_index, ['Unnamed: 0']]

In [None]:
# Get model predictions
# Feature rows of all customers with bought_toon == False (target column removed),
# built once and used for both the class labels and the class probabilities.
not_bought = data['bought_toon'] == False
prospect_features = data.loc[not_bought].drop(columns='bought_toon')
prediction_prospect = model.predict(prospect_features)
prediction_probability = model.predict_proba(prospect_features)

In [None]:
# Putting results together
from pandas import DataFrame

# Wrap the raw prediction arrays in labelled frames.
# NOTE(review): the probability column names assume model.classes_ is ordered
# [False, True] - confirm.
prospects = DataFrame(prediction_prospect, columns=['possible_prospect'])
chances = DataFrame(prediction_probability, columns=['reject_toon_chance','buy_toon_chance'])

# Give each frame a fresh 0..n-1 index so they are joined side by side by
# position, then rename the ID column and rank by purchase probability.
aligned_parts = [part.reset_index(drop=True) for part in (customers, prospects, chances)]
list_for_marketing = (
    pd.concat(aligned_parts, axis=1)
    .rename(columns={"Unnamed: 0": "customer_id"})
    .sort_values(['buy_toon_chance', 'reject_toon_chance'], ascending=[False, True])
)

In [None]:
# Peek at list for marketing (sorted by buy_toon_chance, highest first)
list_for_marketing