In [1]:
import pandas as pd

df = pd.read_csv('term-deposit-marketing-2020.csv')
df.head()

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,y
0,58,management,married,tertiary,no,2143,yes,no,unknown,5,may,261,1,no
1,44,technician,single,secondary,no,29,yes,no,unknown,5,may,151,1,no
2,33,entrepreneur,married,secondary,no,2,yes,yes,unknown,5,may,76,1,no
3,47,blue-collar,married,unknown,no,1506,yes,no,unknown,5,may,92,1,no
4,33,unknown,single,unknown,no,1,no,no,unknown,5,may,198,1,no


In [2]:
df.dtypes

age           int64
job          object
marital      object
education    object
default      object
balance       int64
housing      object
loan         object
contact      object
day           int64
month        object
duration      int64
campaign      int64
y            object
dtype: object

In [3]:
for col in df.columns:
    print(df[col].value_counts())

32    1909
31    1826
33    1770
34    1730
35    1721
      ... 
94       1
86       1
90       1
81       1
95       1
Name: age, Length: 70, dtype: int64
blue-collar      9383
management       8166
technician       6852
admin            4483
services         3910
retired          1437
self-employed    1414
entrepreneur     1405
unemployed       1104
housemaid        1087
student           524
unknown           235
Name: job, dtype: int64
married     24386
single      10889
divorced     4725
Name: marital, dtype: int64
secondary    20993
tertiary     11206
primary       6270
unknown       1531
Name: education, dtype: int64
no     39191
yes      809
Name: default, dtype: int64
0       3209
1        179
2        141
4        126
3        124
        ... 
4720       1
4313       1
2675       1
4752       1
4667       1
Name: balance, Length: 6849, dtype: int64
yes    24031
no     15969
Name: housing, dtype: int64
no     33070
yes     6930
Name: loan, dtype: int64
cellular     24914
unkn

In [4]:
numerics = ['age','balance','day','duration','campaign']
binaries = ['default','housing','loan','y']
categorial = set(df.columns) - set(numerics) - set(binaries)
categorial

{'contact', 'education', 'job', 'marital', 'month'}

#### Categorial verileri one hot encodingle sayısal veriye çevirme

In [5]:
for colName in categorial:
    cols = pd.get_dummies(df[colName])
    cols.rename(columns={'unknown': colName + 'Unknown'}, inplace=True)
    df = df.drop(colName,axis = 1)
    df = df.join(cols)
    
df

Unnamed: 0,age,default,balance,housing,loan,day,duration,campaign,y,divorced,...,aug,dec,feb,jan,jul,jun,mar,may,nov,oct
0,58,no,2143,yes,no,5,261,1,no,0,...,0,0,0,0,0,0,0,1,0,0
1,44,no,29,yes,no,5,151,1,no,0,...,0,0,0,0,0,0,0,1,0,0
2,33,no,2,yes,yes,5,76,1,no,0,...,0,0,0,0,0,0,0,1,0,0
3,47,no,1506,yes,no,5,92,1,no,0,...,0,0,0,0,0,0,0,1,0,0
4,33,no,1,no,no,5,198,1,no,0,...,0,0,0,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
39995,53,no,395,no,no,3,107,1,no,0,...,0,0,0,0,0,1,0,0,0,0
39996,30,no,3340,no,no,3,238,3,yes,0,...,0,0,0,0,0,1,0,0,0,0
39997,54,no,200,no,no,3,170,1,yes,1,...,0,0,0,0,0,1,0,0,0,0
39998,34,no,1047,no,no,3,342,1,no,0,...,0,0,0,0,0,1,0,0,0,0


#### Binary verileri sayısal veriye çevirme

In [6]:
for colName in binaries:
    df [colName] = df[colName].map({'yes': 1, 'no': 0})
df

Unnamed: 0,age,default,balance,housing,loan,day,duration,campaign,y,divorced,...,aug,dec,feb,jan,jul,jun,mar,may,nov,oct
0,58,0,2143,1,0,5,261,1,0,0,...,0,0,0,0,0,0,0,1,0,0
1,44,0,29,1,0,5,151,1,0,0,...,0,0,0,0,0,0,0,1,0,0
2,33,0,2,1,1,5,76,1,0,0,...,0,0,0,0,0,0,0,1,0,0
3,47,0,1506,1,0,5,92,1,0,0,...,0,0,0,0,0,0,0,1,0,0
4,33,0,1,0,0,5,198,1,0,0,...,0,0,0,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
39995,53,0,395,0,0,3,107,1,0,0,...,0,0,0,0,0,1,0,0,0,0
39996,30,0,3340,0,0,3,238,3,1,0,...,0,0,0,0,0,1,0,0,0,0
39997,54,0,200,0,0,3,170,1,1,1,...,0,0,0,0,0,1,0,0,0,0
39998,34,0,1047,0,0,3,342,1,0,0,...,0,0,0,0,0,1,0,0,0,0


#### Verinin hazırlanıp Logistic regression ile bir model oluşturulması

In [9]:
import numpy as np
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold

Y_all = np.asarray(df['y'])
X_all = np.asarray(df.drop('y', axis = 1))
X_all = preprocessing.StandardScaler().fit(X_all).transform(X_all)
kf = KFold(n_splits=5,shuffle=True, random_state=1)

def fitModelAndEvaluate(df):
    Y = np.asarray(df['y'])
    X = np.asarray(df.drop('y', axis = 1))
    X = preprocessing.StandardScaler().fit(X).transform(X)
    X_train, X_test, Y_train, Y_test = train_test_split( X, Y, test_size=0.2, random_state=1)
    LR = LogisticRegression(C=0.01, solver='liblinear').fit(X_train,Y_train)
    yhat = LR.predict(X_test)
    print("Accuracy score: " + str(accuracy_score(Y_test, yhat)))
    print (classification_report(Y_test, yhat))
    return LR
    

In [10]:
model = fitModelAndEvaluate(df)
cvScores = cross_val_score(model, X_all, Y_all, cv=kf)
print("Cross validation scores: " + str(cvScores))
print("Average cross validation performance: " + str(cvScores.mean()*100))
    

Accuracy score: 0.93675
              precision    recall  f1-score   support

           0       0.95      0.99      0.97      7435
           1       0.62      0.27      0.38       565

    accuracy                           0.94      8000
   macro avg       0.78      0.63      0.67      8000
weighted avg       0.92      0.94      0.93      8000

Cross validation scores: [0.93675  0.936125 0.934    0.932375 0.93275 ]
Average cross validation performance: 93.44000000000001


#### İstenen accuracy %81 olmasına rağmen 5'e bölünmüş cross validation skoru %93 çıkıyor. Aslında elimizde imbalanced (dengesiz) bir veri olduğu için accuracy bir metrik olarak kullanılmamalı. 1'ler için Recall %27 çıkıyor, yani vadeli hesabı kabul edecek olanların %27'si doğru tahmin edilmiş. Zaten az müşteri vadeli hesap kabul ettiği için onların hiçbirini kaçırmak istemeyiz o yüzden burada önemli olan metrik recalldur.

## Veri dengesiz olduğu için undersampling yöntemini deniyorum. Yani tüm verinin %92'sini oluşturan 0'ları azaltıp yeniden deneyeceğim

In [11]:
zero_indices = df[df['y'] == 0].index
zero_indices = np.random.choice(zero_indices, 2896, replace=False) #%50-50
one_indices = df[df['y'] == 1].index
undersampled = pd.concat([df.loc[one_indices],df.loc[zero_indices]])
undersampled

Unnamed: 0,age,default,balance,housing,loan,day,duration,campaign,y,divorced,...,aug,dec,feb,jan,jul,jun,mar,may,nov,oct
83,59,0,2343,1,0,5,1042,1,1,0,...,0,0,0,0,0,0,0,1,0,0
86,56,0,45,0,0,5,1467,1,1,0,...,0,0,0,0,0,0,0,1,0,0
87,41,0,1270,1,0,5,1389,1,1,0,...,0,0,0,0,0,0,0,1,0,0
129,55,0,2476,1,0,5,579,1,1,0,...,0,0,0,0,0,0,0,1,0,0
168,54,0,184,0,0,5,673,2,1,0,...,0,0,0,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20162,30,0,44,0,0,11,187,2,0,0,...,1,0,0,0,0,0,0,0,0,0
24001,36,0,861,0,0,29,140,2,0,1,...,1,0,0,0,0,0,0,0,0,0
24929,37,0,1205,1,0,18,99,1,0,0,...,0,0,0,0,0,0,0,0,1,0
491,29,0,1,1,0,6,215,1,0,0,...,0,0,0,0,0,0,0,1,0,0


In [12]:
model2 = fitModelAndEvaluate(undersampled)

Accuracy score: 0.8576358930112166
              precision    recall  f1-score   support

           0       0.85      0.86      0.86       574
           1       0.86      0.85      0.86       585

    accuracy                           0.86      1159
   macro avg       0.86      0.86      0.86      1159
weighted avg       0.86      0.86      0.86      1159



In [13]:
kf = KFold(n_splits=5,shuffle=True, random_state=2)
cvScores = cross_val_score(model2, X_all, Y_all, cv=kf)
print("Cross validation scores: " + str(cvScores))
print("Average cross validation performance: " + str(cvScores.mean()*100))

Cross validation scores: [0.935    0.93625  0.9345   0.93175  0.934875]
Average cross validation performance: 93.44750000000002


In [14]:
yhat = model2.predict(X_all)
print(classification_report(Y_all, yhat))

              precision    recall  f1-score   support

           0       1.00      0.64      0.78     37104
           1       0.18      0.99      0.30      2896

    accuracy                           0.67     40000
   macro avg       0.59      0.81      0.54     40000
weighted avg       0.94      0.67      0.75     40000



#### Recall değeri ikinci lojistik regresyon modeliyle %99'a çıktı, bu tarz bir model daha etkili olacaktır.

In [15]:
corr = df.corr()
corr.style.background_gradient(cmap='coolwarm').set_precision(2)

Unnamed: 0,age,default,balance,housing,loan,day,duration,campaign,y,divorced,married,single,primary,secondary,tertiary,educationUnknown,admin,blue-collar,entrepreneur,housemaid,management,retired,self-employed,services,student,technician,unemployed,jobUnknown,cellular,telephone,contactUnknown,apr,aug,dec,feb,jan,jul,jun,mar,may,nov,oct
age,1.0,-0.01,0.08,-0.18,0.0,-0.01,-0.04,0.02,-0.02,0.16,0.27,-0.41,0.18,-0.1,-0.07,0.07,-0.05,-0.03,0.03,0.08,-0.01,0.33,0.0,-0.06,-0.16,-0.06,0.01,0.04,-0.07,0.12,0.01,-0.03,0.09,-0.01,-0.01,-0.02,0.02,0.09,0.0,-0.13,0.04,0.0
default,-0.01,1.0,-0.07,-0.02,0.08,0.01,-0.01,0.01,-0.01,0.02,-0.02,0.01,-0.0,0.01,-0.01,0.0,-0.01,0.0,0.03,0.0,-0.0,-0.0,0.0,-0.0,-0.01,-0.0,0.01,-0.01,-0.0,-0.02,0.01,-0.03,-0.01,-0.0,-0.01,-0.01,0.04,0.01,-0.01,-0.01,0.01,-0.01
balance,0.08,-0.07,1.0,-0.05,-0.08,0.01,0.01,-0.01,0.03,-0.03,0.02,-0.01,-0.02,-0.06,0.08,0.01,-0.02,-0.04,0.01,0.0,0.07,0.02,0.02,-0.03,0.0,-0.01,0.01,0.01,0.0,0.03,-0.02,0.02,0.01,-0.0,-0.01,-0.03,-0.07,0.04,0.02,-0.06,0.14,0.03
housing,-0.18,-0.02,-0.05,1.0,0.0,-0.05,0.02,-0.05,-0.05,-0.0,0.0,0.0,0.01,0.09,-0.1,-0.04,0.04,0.16,0.0,-0.09,-0.07,-0.11,-0.03,0.06,-0.03,-0.03,-0.05,-0.08,-0.14,-0.06,0.18,0.08,-0.34,-0.01,-0.07,-0.07,-0.08,-0.12,-0.06,0.42,-0.01,-0.01
loan,0.0,0.08,-0.08,0.0,1.0,0.01,-0.0,0.0,-0.03,0.02,0.03,-0.05,-0.01,0.07,-0.04,-0.05,0.03,0.01,0.04,-0.02,-0.04,0.02,-0.01,0.03,-0.05,0.01,-0.04,-0.03,0.03,-0.0,-0.03,-0.03,-0.07,-0.0,-0.01,-0.0,0.17,-0.03,-0.02,-0.05,0.02,0.0
day,-0.01,0.01,0.01,-0.05,0.01,1.0,-0.03,0.17,-0.01,-0.0,0.01,-0.0,-0.03,-0.01,0.03,0.0,-0.01,-0.04,-0.0,0.01,0.03,-0.0,0.01,-0.01,-0.01,0.03,-0.01,-0.01,0.03,0.04,-0.05,0.04,0.04,-0.01,-0.32,0.27,0.16,-0.21,-0.01,-0.05,0.11,0.04
duration,-0.04,-0.01,0.01,0.02,-0.0,-0.03,1.0,-0.09,0.46,0.01,-0.03,0.03,-0.01,0.01,-0.0,-0.0,-0.01,0.02,0.0,-0.01,-0.01,0.0,0.01,0.0,-0.01,-0.01,0.02,-0.01,0.02,-0.04,0.0,0.04,-0.04,-0.01,-0.01,0.01,0.02,-0.02,-0.01,0.01,-0.0,0.01
campaign,0.02,0.01,-0.01,-0.05,0.0,0.17,-0.09,1.0,-0.04,-0.02,0.03,-0.02,0.0,-0.03,0.02,0.01,-0.02,-0.0,-0.0,0.0,0.02,-0.01,0.0,-0.01,-0.01,0.02,-0.02,0.02,-0.02,0.06,-0.01,-0.08,0.18,-0.01,-0.03,-0.07,0.1,0.05,-0.01,-0.09,-0.09,-0.03
y,-0.02,-0.01,0.03,-0.05,-0.03,-0.01,0.46,-0.04,1.0,0.01,-0.06,0.05,-0.03,-0.02,0.05,-0.01,0.01,-0.03,-0.01,-0.02,0.02,0.02,0.01,-0.01,0.04,0.0,0.01,-0.0,0.09,-0.0,-0.09,0.1,-0.03,0.0,0.04,-0.03,-0.02,-0.01,0.13,-0.04,-0.01,0.09
divorced,0.16,0.02,-0.03,-0.0,0.02,-0.0,0.01,-0.02,0.01,1.0,-0.46,-0.22,-0.02,0.02,-0.0,-0.02,0.04,-0.07,-0.0,0.02,0.01,0.04,-0.02,0.02,-0.04,0.01,0.01,-0.01,-0.0,-0.02,0.01,-0.02,-0.03,0.01,0.0,0.0,0.01,0.01,-0.01,0.0,0.01,0.01


#### Bonuslar için: Çağrı merkezi hangi müşteriye öncelik vermelidir? Yukarıda <i>duration</i> ile y arasında bir corelasyon olduğu gözüküyor.  Yani arandığı zaman karşı tarafı dinleyen müşteriler kampanyayla daha fazla ilgileniyormuş. Bunun yanında lojistik regresyon modeli sadece 1 ve 0'ları değil, aşağıdaki gibi 1 ve 0 olma ihtimallerini de hesaplıyor. İhtimali yüksek olana da öncelik verilebilir. 

In [16]:
yhat_prob = model2.predict_proba(X_all)
yhat_prob

array([[0.6214096 , 0.3785904 ],
       [0.78272404, 0.21727596],
       [0.92375439, 0.07624561],
       ...,
       [0.29813673, 0.70186327],
       [0.10741114, 0.89258886],
       [0.55027031, 0.44972969]])

## Müşterilerin segmentasyonu

In [19]:
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_samples, silhouette_score

X_cluster = np.asarray(df)
X_cluster = preprocessing.StandardScaler().fit(X_cluster).transform(X_cluster)

range_n_clusters = [2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20]

for n_clusters in range_n_clusters:
    
    clusterer = KMeans(n_clusters=n_clusters, random_state=1)
    cluster_labels = clusterer.fit_predict(X_cluster)

    silhouette_avg = silhouette_score(X_cluster, cluster_labels)
    print("For n_clusters =", n_clusters,
          "The average silhouette_score is :", silhouette_avg)


For n_clusters = 2 The average silhouette_score is : 0.06444603717464525
For n_clusters = 3 The average silhouette_score is : 0.07470254092331448
For n_clusters = 4 The average silhouette_score is : 0.08245813316392156
For n_clusters = 5 The average silhouette_score is : 0.0866504463975552
For n_clusters = 6 The average silhouette_score is : 0.08572664230047386
For n_clusters = 7 The average silhouette_score is : 0.09693475397136177
For n_clusters = 8 The average silhouette_score is : 0.11673452096814062
For n_clusters = 9 The average silhouette_score is : 0.11957712353415047
For n_clusters = 10 The average silhouette_score is : 0.12251261901830791
For n_clusters = 11 The average silhouette_score is : 0.12518506681540367
For n_clusters = 12 The average silhouette_score is : 0.12451581714958447
For n_clusters = 13 The average silhouette_score is : 0.13053024224233406
For n_clusters = 14 The average silhouette_score is : 0.11862345853712045
For n_clusters = 15 The average silhouette_scor

#### Silhouette skorları çok düşük çıkıyor yani elimizdeki veriyle bir kümeleme yapmak bence pek mantıklı değil