In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load
!pip install pycaret
!pip install pycaret
from pycaret import classification
from pycaret.classification import * 
import numpy as np 
import pandas as pd
pd.set_option("display.max_columns", 80)
pd.set_option("display.max_rows", 20)
import matplotlib.pyplot as plt
pd.plotting.register_matplotlib_converters()
%matplotlib inline
import seaborn as sns
sns.set(rc={'figure.figsize':(10,7)})
from numpy import mean
from numpy import std
from sklearn.model_selection import train_test_split, cross_val_score, KFold, StratifiedKFold, GridSearchCV
from sklearn.feature_selection import VarianceThreshold, SelectKBest,f_classif,mutual_info_classif
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, MinMaxScaler
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.impute import SimpleImputer
from sklearn.linear_model import Ridge, Lasso
from sklearn import tree
from sklearn.tree import DecisionTreeClassifier
from itertools import chain, combinations
from xgboost import XGBClassifier
from xgboost import XGBRFClassifier
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory
import os
# Walk the Kaggle input directory and print the full path of every file found there.
input_paths = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# Marker showing that the setup cell ran to the end.
print('ok')

In [None]:
# Loading data
dane = pd.read_csv('../input/predicting-profitable-customer-segments/customerTargeting.csv', delimiter=',')
#dane.dataframeName = 'customerTargeting.csv'
# Drop the variables that were recorded after the campaign was run
dane = dane.drop(['g1_21', 'g2_21', 'c_28'], axis = 1)
# Separate the label column from the feature columns
Y = dane['target']
X = dane.drop(['target'], axis=1)
# Hold out 25% of the rows; the fixed random_state makes the split identical on every
# Restart & Run All instead of changing from run to run
X_train, X_valid, Y_train, Y_valid = train_test_split(X, Y, test_size = 0.25, random_state = 7)
dane.head()

In [None]:
# Print the column names, dtypes and non-null counts of the loaded frame
dane.info()

In [None]:
# Summary statistics of the numeric columns
dane.describe()

In [None]:
# Visualization functions
# Distribution graphs (histogram/bar graph) of column data
def plotPerColumnDistribution(df, nGraphShown, nGraphPerRow):
    """Plot a grid of per-column distributions for the columns of `df`.

    Only columns with more than 1 and fewer than 50 unique values are drawn.
    Columns whose first value is numeric are shown as histograms; all other
    columns are shown as bar charts of their value counts.

    Parameters
    ----------
    df : pandas.DataFrame
        Data to visualise.
    nGraphShown : int
        Maximum number of columns to plot.
    nGraphPerRow : int
        Number of plots placed side by side in each row of the grid.

    Returns
    -------
    None
        The figure is displayed with ``plt.show()``. If there is nothing to
        plot, a message is printed and no figure is created.
    """
    nunique = df.nunique()
    # For displaying purposes, keep only columns with more than 1 and fewer than 50 unique values
    df = df[[col for col in df if 1 < nunique[col] < 50]]
    nRow, nCol = df.shape
    columnNames = list(df)
    nShown = min(nCol, nGraphShown)
    if nShown <= 0:
        # Avoids building a zero-height figure when no column qualifies
        print('No distribution plots shown: there are no columns to plot')
        return
    # Ceiling division with // keeps the row count an integer, as plt.subplot requires;
    # it is based on the number of plots actually drawn so the grid has no empty rows
    nGraphRow = (nShown + nGraphPerRow - 1) // nGraphPerRow
    plt.figure(num = None, figsize = (6 * nGraphPerRow, 8 * nGraphRow), dpi = 80, facecolor = 'w', edgecolor = 'k')
    for i in range(nShown):
        plt.subplot(nGraphRow, nGraphPerRow, i + 1)
        columnDf = df.iloc[:, i]
        # The type of the first value decides between a bar chart and a histogram
        if (not np.issubdtype(type(columnDf.iloc[0]), np.number)):
            valueCounts = columnDf.value_counts()
            valueCounts.plot.bar()
        else:
            columnDf.hist()
        plt.ylabel('counts')
        plt.xticks(rotation = 90)
        plt.title(f'{columnNames[i]} (column {i})')
    plt.tight_layout(pad = 1.0, w_pad = 1.0, h_pad = 1.0)
    plt.show()
    
# Correlation matrix
def plotCorrelationMatrix(df, graphWidth):
    """Display the correlation matrix of `df` as a colour-coded square image.

    Columns containing NaN and columns with a single unique value are removed
    first. If fewer than two columns remain, a message is printed and nothing
    is plotted.

    Parameters
    ----------
    df : pandas.DataFrame
        Data whose pairwise column correlations are shown. If it carries a
        string attribute ``dataframeName``, that name is used in the title.
    graphWidth : int or float
        Width and height of the square figure, in inches.

    Returns
    -------
    None
    """
    # Read the optional name before df is rebound; a frame without the attribute
    # gets a generic title instead of raising a NameError
    filename = getattr(df, 'dataframeName', None)
    df = df.dropna(axis='columns') # drop columns with NaN (axis is keyword-only in recent pandas)
    df = df[[col for col in df if df[col].nunique() > 1]] # keep columns where there are more than 1 unique values
    if df.shape[1] < 2:
        print(f'No correlation plots shown: The number of non-NaN or constant columns ({df.shape[1]}) is less than 2')
        return
    corr = df.corr()
    plt.figure(num=None, figsize=(graphWidth, graphWidth), dpi=80, facecolor='w', edgecolor='k')
    corrMat = plt.matshow(corr, fignum = 1)
    plt.xticks(range(len(corr.columns)), corr.columns, rotation=90)
    plt.yticks(range(len(corr.columns)), corr.columns)
    plt.gca().xaxis.tick_bottom()
    plt.colorbar(corrMat)
    if isinstance(filename, str):
        title = f'Correlation Matrix for {filename}'
    else:
        title = 'Correlation Matrix'
    plt.title(title, fontsize=15)
    plt.show()
    
    
def plot_mi_scores(scores):
    """Draw a horizontal bar chart of mutual-information scores.

    `scores` is a pandas Series indexed by feature name. Bars are drawn in
    ascending score order on the current matplotlib figure, each labelled
    with its index entry.
    """
    ordered = scores.sort_values(ascending=True)
    labels = list(ordered.index)
    positions = np.arange(len(ordered))
    plt.barh(positions, ordered)
    plt.yticks(positions, labels)
    plt.title("Mutual Information Scores")


# <h1 style="background-color:blue;color:black;text-align:center;">Distribution graphs (histogram/bar graph) of sampled columns</h1>


In [None]:
# Plot the distributions of at most 30 eligible columns, 3 plots per row
plotPerColumnDistribution(dane, 30, 3)

# <h1 style="background-color:blue;color:black;text-align:center;">Mutual Information</h1>

In [None]:
# Estimate the mutual information between every feature and the target.
# random_state is fixed because the estimator adds random noise to continuous features;
# without it the scores (and the median-based selection that follows) change between runs.
mutual_info = pd.Series(mutual_info_classif(X, Y, random_state=7), index=X.columns)

# plot_mi_scores sorts the scores itself, so no separate sort is needed here
plt.figure(dpi=100, figsize=(17, 17))
plot_mi_scores(mutual_info)

In [None]:
# Keep only the features whose mutual-information score is strictly above the median score
mi_median = mutual_info.median()
above_median = mutual_info > mi_median
mutual_info_best = mutual_info[above_median]
plot_mi_scores(mutual_info_best)
# Names of the retained features, used for the later selection steps
cols_MI = list(mutual_info_best.index)

# <h1 style="background-color:blue;color:black;text-align:center;">Correlation matrix</h1>


In [None]:
# Correlation matrix restricted to the features selected by mutual information (17-inch square figure)
plotCorrelationMatrix(dane[cols_MI], 17)

Checking for constant columns 

In [None]:
# Fit a variance threshold of 0 on the features so that constant columns can be identified
var_thres=VarianceThreshold(threshold=0)
var_thres.fit(X)
sum(var_thres.get_support()) # Counts the columns kept by the threshold via get_support(); the value is neither stored nor displayed
constant_columns = [column for column in X.columns # Columns not kept by the threshold, i.e. the constant columns
                    if column not in X.columns[var_thres.get_support()]]
print('constant_columns: \n' + str(constant_columns))

Selecting K-Best Features based on Target

In [None]:
# Automatic column selection with scikit-learn: keep the 10 best of the MI-selected
# columns according to the ANOVA F-test (f_classif)
sel_cols = SelectKBest(f_classif, k=10).fit(X[cols_MI],Y)
Best_cols = list(X[cols_MI].columns[sel_cols.get_support()])
# Every column except the selected ones and the target is to be ignored
cols_ignore = list(set(dane.columns) - set(Best_cols) - set(['target']))
# Split the columns into groups by the substring found in their names
g1_cols = [col for col in X.columns if 'g1_' in col]
g2_cols = [col for col in X.columns if 'g2_' in col]
c_cols = [col for col in X.columns if 'c_' in col]
# Manual column selection
J_best_cols = ['g1_1', 'g2_1', 'g1_12', 'g2_12', 'g1_13', 'g2_13'] + c_cols
J_cols_ignore = list(set(dane.columns) - set(J_best_cols) - set(['target']))
# Summary
print('Best_cols '+str(len(Best_cols))+' \n' + str(Best_cols) )
      #+'\n\nJ_best_cols '+str(len(J_best_cols))+' \n' + str(J_best_cols))

# <h1 style="background-color:blue;color:black;text-align:center;">Analysing Target</h1>

In [None]:
# Bar chart of the number of rows per target class, with the count written above each bar
plt.title("Distribution of target variable in numbers")
graph = sns.countplot(x='target', data = dane)
for p in graph.patches:
    # Anchor the label at the horizontal centre of its bar: ha='center' centres the text on
    # this x position, so a fixed offset would place it off-centre
    graph.annotate('{:.2f}'.format(p.get_height()), (p.get_x() + p.get_width() / 2, p.get_height()),
            ha='center', va='bottom',
            color= 'black')

In [None]:
# Bar chart of the share of rows per target class, with the percentage written above each bar
plt.title("Distribution of target variable in percent")
# The estimator turns each class's group of rows into its fraction of the whole data set
graph = sns.barplot(x="target", y="target", data=dane, estimator=lambda x: len(x) / len(dane) )
graph.set(ylabel="Percent")
for p in graph.patches:
    # Anchor the label at the horizontal centre of its bar: ha='center' centres the text on
    # this x position, so a fixed offset would place it off-centre
    graph.annotate("{:.1%}".format(p.get_height()), (p.get_x() + p.get_width() / 2, p.get_height()),
            ha='center', va='bottom',
            color= 'black')

In [None]:
#Pycaret
# Set up the PyCaret classification experiment on the full frame: the selected columns
# (Best_cols) are declared as numeric features and every other non-target column
# (cols_ignore) is excluded from modelling.
# NOTE(review): `silent=True` may not be accepted by every PyCaret release — confirm
# against the installed version.
classification_setup=setup(data = dane ,target='target', numeric_features=Best_cols,
                           ignore_features = cols_ignore ,silent = True)

#compare_models()

In [None]:
# Train and compare the candidate models in the experiment configured by setup() above
compare_models()

In [None]:
#Random Forest Classifier
# random_state fixes the bootstrap samples and feature subsets, so the reported score is
# the same on every run
my_model = RandomForestClassifier(n_estimators = 200, max_depth = 3, max_features  = 10,
                                 oob_score = True, criterion = 'entropy', random_state = 7)
my_model.fit(X[Best_cols], Y)
# The printed figure is the out-of-bag accuracy estimate (oob_score_), not a hold-out score
print('Accuracy: %.2f%%' % (my_model.oob_score_*100))

In [None]:
# XGBoost random-forest classifier (XGBRFClassifier) on the selected columns
my_model = XGBRFClassifier(n_estimators = 100,  max_depth = 2, 
                           eval_metric = 'mlogloss', learning_rate = 0.01, 
                           booster = 'dart', use_label_encoder=False)
# 5-fold stratified cross-validation with shuffling and a fixed seed
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=7)
results = cross_val_score(my_model, X[Best_cols], Y, cv=kfold, scoring = 'accuracy')
# Mean accuracy over the folds, as a percentage
print('Accuracy: %.2f%%' % (results.mean()*100))

In [None]:
#Ada Boost Classifier
# Boosted decision stumps (depth-1 trees)
model = AdaBoostClassifier(DecisionTreeClassifier(max_depth = 1),
                          n_estimators = 20, learning_rate = 0.5)
# evaluate the model
# shuffle=True is required for random_state to have any effect; scikit-learn raises a
# ValueError when random_state is set while shuffle is left at False
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=1)
# NOTE(review): this model is scored on all columns of X, whereas the other models use
# X[Best_cols] — confirm that this is intended
n_scores = cross_val_score(model, X, Y, scoring='accuracy', cv=cv, n_jobs=-1, error_score='raise')
# report performance
print('Accuracy: %.2f%%' % (mean(n_scores)*100))

In [None]:

# Deep learning - a simple neural network
from tensorflow import keras
from tensorflow.keras import layers
# NOTE(review): the two imports below are not used in this cell; confirm they are still needed
from keras.utils import np_utils
from keras.wrappers.scikit_learn import KerasClassifier

seed = 10
np.random.seed(seed)
# Normalize features within range 0 (minimum) and 1 (maximum)
# NOTE(review): the scaler is created but never applied (the fit_transform lines are commented out)
scaler = MinMaxScaler(feature_range=(0, 1))
#X = scaler.fit_transform(X)
#X = pd.DataFrame(X)
# Convert target Y to one hot encoded Y for Neural Network
Y_nn = pd.get_dummies(Y)
# For Keras, convert the dataframes to plain arrays
X_nn = X[Best_cols].values
# Cast to float32 because newer pandas versions return boolean dummy columns
Y_nn = Y_nn.values.astype('float32')

# The fixed random_state makes the train/validation split reproducible
X_train, X_valid, Y_train, Y_valid = train_test_split(X_nn, Y_nn, test_size = 0.25, random_state = seed)

# Layer sizes are derived from the data instead of hard-coding 10 features and 3 classes
n_features = X_nn.shape[1]
n_classes = Y_nn.shape[1]
model = keras.Sequential([
    layers.Dropout(0.05, input_shape=[n_features]),
    layers.Dense(8, activation='relu'),
    layers.Dense(4, activation='relu'),
    # softmax produces a single probability distribution over the classes, which is what
    # categorical_crossentropy expects for one-hot targets (sigmoid scores each class independently)
    layers.Dense(n_classes, activation='softmax'),
])

model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

# Defined for convenience; it is only used if the `callbacks` line in fit() is re-enabled
early_stopping = keras.callbacks.EarlyStopping(
    patience=10,
    min_delta=0.001,
    restore_best_weights=True,
)

history = model.fit(
    X_train, Y_train,
    validation_data=(X_valid, Y_valid),
    batch_size=200,
    epochs=200,
    #callbacks=[early_stopping],
    verbose=0, # hide the output because we have so many epochs
)

history_df = pd.DataFrame(history.history)
# Start the plots at epoch 5
history_df.loc[5:, ['loss', 'val_loss']].plot()
history_df.loc[5:, ['accuracy', 'val_accuracy']].plot()

# Best values over all epochs: lowest validation loss and highest validation accuracy
print("Best Validation Loss: {:0.4f}\nBest Validation Accuracy: {:0.4f}".format(
    history_df['val_loss'].min(),
    history_df['val_accuracy'].max(),
))

**This is the solution I was able to reach in the available time. The best models have an accuracy very close to 60%, which is certainly a big improvement over random guessing. The model can still be improved significantly. Not all of my work is shown here: I also checked models with the complete set of variables, but they gave worse results. The key factor seems to be a detailed analysis of the variables and their appropriate selection. Jerzy Szocik. Greetings.**
