* [Libraries](#Libraries)
* [Data](#Data)
* [Exploratory Data Analysis](#Exploratory-Data-Analysis)
* [Data Manipulation](#Data-Manipulation)
* [Churn Analysis](#Churn-Analysis)
* [Customer Segmentation](#Customer-Segmentation)
* [Segmentation Report](#Segmentation-Report)

In [None]:
#!pip install --user scikit-learn==0.23.1
!pip install chart_studio
# Undersampling appears to be broken with the old scikit-learn library.
# It is worth upgrading it.

# Libraries

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from matplotlib import pyplot as plt
plt.style.use("fivethirtyeight")
import seaborn as sns
import chart_studio.plotly as py
import plotly.offline as pyoff
import plotly.graph_objs as go
pyoff.init_notebook_mode()

from collections import Counter
from sklearn.preprocessing import LabelEncoder
from sklearn.neighbors import LocalOutlierFactor
from sklearn.impute import KNNImputer
from imblearn.over_sampling import BorderlineSMOTE

from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from xgboost.sklearn import XGBClassifier
from sklearn.cluster import KMeans

import sklearn.metrics as metrics
from statistics import *
from sklearn.metrics import classification_report
from sklearn.metrics import auc, f1_score, roc_auc_score, roc_curve, confusion_matrix, accuracy_score, precision_score
from sklearn.metrics import plot_confusion_matrix
from sklearn.model_selection import cross_val_score

import warnings
warnings.filterwarnings("ignore")
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found below the Kaggle input directory
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

**Visualize tools**

In [None]:
from matplotlib import pyplot as plt
plt.style.use("fivethirtyeight")

def pie(slices,labels,explode,shadow,autopct,title):
    """
    Draw a pie chart and show it.

    Params:
    slices :- list of data values to plot in the pie chart
    labels :- list of labels, one per data value
    explode :- list of offsets, one per wedge, by which each wedge is
               moved away from the centre
    shadow :- boolean indicating whether a shadow is drawn
    autopct :- format string used to display the percentage of each wedge
    title :- title of the graph
    """
    # The caller-supplied shadow/autopct/title are honoured here instead of
    # the hard-coded values that were previously used for every chart.
    plt.pie(slices,labels=labels,explode=explode,wedgeprops={'edgecolor':'black'},shadow=shadow,autopct=autopct)
    plt.title(title)
    plt.tight_layout()
    plt.show()


def hist(list_of_dataset, list_of_label,ylabel,xlabel,title,bins=None,alpha=0.25,axvline=None,axvlie_label=None,axvlinewidth=None):
    """
    Plot one or more histograms on the same axes and show the figure.

    Params:
    list_of_dataset = datasets to plot; several datasets are drawn on top of each other
    list_of_label = legend label of each dataset (same order as list_of_dataset)
    ylabel = y axis label
    xlabel = x axis label
    title = title of the graph
    bins = bins to use; None lets matplotlib pick its default bins
    alpha = transparency of the bars, default is 0.25
    axvline = x position of an optional vertical line (None draws no line)
    axvlie_label = legend label of the vertical line
    axvlinewidth = width of the vertical line
    """
    for ind,dataset in enumerate(list_of_dataset):
        # bins/alpha come from the caller; they used to be hard-coded here
        plt.hist(dataset,bins=bins,alpha=alpha, label=list_of_label[ind])

    # Draw the vertical line before the legend so that its label is listed;
    # "is not None" keeps a line at x == 0 from being skipped.
    if axvline is not None:
        plt.axvline(axvline,label=axvlie_label,color='#91ee9a',linewidth=axvlinewidth)

    plt.ylabel(ylabel)
    plt.xlabel(xlabel)
    plt.legend()
    plt.tight_layout()
    plt.title(title)
    plt.show()
    

# Data

In [None]:
df=pd.read_csv('../input/datasets-for-churn-telecom/cell2celltrain.csv')

In [None]:
# Quick overview of the training set: size, missing values and class balance
n_rows = df.shape[0]
has_missing = df.isnull().values.any()
churn_counts = df['Churn'].value_counts()
print("Shape:", df.shape, "\n")
print("Any missing sample in training set:", has_missing)
print("Counts:\n", churn_counts, "\n")
print("Ratio:\n", churn_counts / n_rows)

In [None]:
df.head()

In [None]:
# Split customers by churn label. Explicit copies are taken because later
# cells fill missing values in these frames; working on copies keeps those
# changes away from df and avoids chained-assignment ambiguity.
churn_df = df[df['Churn'] == 'Yes'].copy()
non_churn_df = df[df['Churn'] == 'No'].copy()
churned = churn_df.shape[0]
notchurned = non_churn_df.shape[0]
print("Total Churned customer {} ".format(churned))

**Hedef degiskende eksik deger var mi?**

In [None]:
df['Churn'].isna().sum()

**Korelasyonlara bakalim ama daha sonra ilgilenecegiz**

In [None]:
# Correlate numeric columns only: the raw frame still holds text columns,
# and DataFrame.corr() raises on them in recent pandas versions.
correlation_matrix = df.select_dtypes(include=['number', 'bool']).corr()
plt.figure(figsize=(25, 25))
ax = sns.heatmap(
    correlation_matrix,
    vmax=1,
    square=True,
    annot=True,
    fmt='.2f',
    cmap='GnBu',
    cbar_kws={"shrink": .5},
    robust=True)
plt.title('Correlation Matrix of features', fontsize=8)
plt.show()

# Exploratory Data Analysis

# Sorular

**Data imbalance mi ?**

In [None]:
# Share of churned vs. retained customers, drawn with the pie() helper
slices = [churned, notchurned]
labels = ['Churned', 'Not Churned']
explode = [0.1, 0]
pie(slices, labels, explode, True, '%1.1f%%', "Percentage of customer churned")

**Evet imbalance model asamasinda sonuca etkisini gorebilmek icin simdilik boyle kalsin** 

1. Gelir dagilimi nasil? 
2. Yuksek gelirli musterileri kaybediyor muyuz?

**Eksik degerleri gorsellestirmek icin kabaca ortalama ile doldurduk. Model icin KNN impute kullanilacak**

In [None]:
df['MonthlyRevenue'].isnull().sum()

In [None]:
# Total monthly revenue overall, from churned and from retained customers
revenue_column = 'MonthlyRevenue'
total_monthly_rev = df[revenue_column].sum()
total_churned_rev = churn_df[revenue_column].sum()
total_nonchrun_rev = non_churn_df[revenue_column].sum()
print(total_monthly_rev, total_churned_rev, total_nonchrun_rev)

In [None]:
# Fill missing revenue with the group mean (for this plot only). The result
# is assigned back: calling fillna(inplace=True) on a selected column is a
# chained assignment and does not update the frame under copy-on-write.
non_churn_df['MonthlyRevenue'] = non_churn_df['MonthlyRevenue'].fillna(non_churn_df['MonthlyRevenue'].mean())
churn_df['MonthlyRevenue'] = churn_df['MonthlyRevenue'].fillna(churn_df['MonthlyRevenue'].mean())

revenue_bins = list(range(0, 501, 50))  # 0, 50, ..., 500
plt.hist(non_churn_df['MonthlyRevenue'], bins=revenue_bins, alpha=0.5, label='Non Churn customer')
plt.hist(churn_df['MonthlyRevenue'], bins=revenue_bins, alpha=0.5, label='Churn customer')
mean_rev = df['MonthlyRevenue'].mean()
plt.axvline(mean_rev, label='Mean Monthly Revenue', color='#91ee9a', linewidth=2)
plt.ylabel("No of customers")
plt.xlabel("Monthly revenue")
plt.legend()
plt.tight_layout()
plt.title('Revenue Distribution')
plt.show()

**Yeni musteri mi kaybediyoruz yoksa eski musteri mi?**
**Eksik degerleri gorsellestirmek icin kabaca ortalama ile doldurduk. Model icin KNN impute kullanilacak**

In [None]:
df['MonthsInService'].isnull().sum()

In [None]:
non_churn_df['MonthsInService'].describe()

In [None]:
churn_df['MonthsInService'].describe()

In [None]:
# Fill missing tenure with the group mean (for this plot only). Assigning the
# result back avoids the chained fillna(inplace=True), which does not update
# the frame under copy-on-write.
non_churn_df['MonthsInService'] = non_churn_df['MonthsInService'].fillna(non_churn_df['MonthsInService'].mean())
churn_df['MonthsInService'] = churn_df['MonthsInService'].fillna(churn_df['MonthsInService'].mean())
mean_rev = df['MonthsInService'].mean()  # overall mean tenure, drawn as a vertical line
list_df = [non_churn_df['MonthsInService'], churn_df['MonthsInService']]
list_label = ['Non Churn customer', 'Churn customer']
hist(list_df, list_label, ylabel="No of customers", xlabel="Months in service", title="MonthsInService Distribution",
     bins=[0,5,10,12,20,25,30,35,40,45,50,60], alpha=0.5, axvline=mean_rev, axvlie_label='Mean MonthsInService', axvlinewidth=2)

**Peki aboneler, abone kaybını tahmin etmek için bunu kullanabilir miyiz?**
**Bireysel müşteri birden fazla aboneye sahip olabilir ve abonelerden birini kapatabilir. 
Bu verileri abone kaybını tahmin etmek için kullanabilir miyiz?**
1. **Veri kümesinde abone kaybı olmayan herhangi bir sütun bulunmadığından, 
müşterinin sahip olduğu unique abonelerin ve kaçının aktif olduğunun çıkarılmasından hesaplanmalıdır.**

In [None]:
df['ChurnSubs']=df['UniqueSubs']-df['ActiveSubs']

In [None]:
# Active vs. churned subscribers, summed over all customers
active_total = df['ActiveSubs'].sum()
churned_total = df['ChurnSubs'].sum()
slices = [active_total, churned_total]
labels = ['Not Churned', 'Churned']
explode = [0, 0.1]
pie_style = dict(wedgeprops={'edgecolor': 'black'}, shadow=True, autopct='%1.1f%%')
plt.pie(slices, labels=labels, explode=explode, **pie_style)
plt.title("Percentage of subscribers churned")
plt.tight_layout()
plt.show()

In [None]:
# Report subscriber totals
active_subs_total = df['ActiveSubs'].sum()
churn_subs_total = df['ChurnSubs'].sum()
print("No of active subscribers {}".format(active_subs_total))
print("No of churn subscribers {}".format(churn_subs_total))

**Cagri merkezi ne kadar etkili?**
1. Çağrı merkezine yapılan bekletme çağrıları ne kadar etkilidir?
2. Kaç müşteri tutma çağrısı yaptı?
3. Gerçekte kaç tanesi müşteriyi tuttu?

In [None]:
df['MadeCallToRetentionTeam'].isna().sum()

In [None]:
# Customers who called the retention team, and how many of them stayed
retention_df = df[df['MadeCallToRetentionTeam'] == 'Yes']
callers_total = retention_df.shape[0]
callers_churned = retention_df[retention_df['Churn'] == 'Yes'].shape[0]
callers_retained = retention_df[retention_df['Churn'] == 'No'].shape[0]
print("Total No of customer made call to CSR for retention purpose {}".format(callers_total))
print("No of customer churn after making call {}".format(callers_churned))
print("No of customer retain after making call {}".format(callers_retained))
print("Sucess rate of rention call  {}".format(callers_retained / callers_total))

In [None]:
# Outcome of retention calls: retained vs. churned callers
stayed = retention_df[retention_df['Churn'] == 'No']
left = retention_df[retention_df['Churn'] == 'Yes']
slices = [stayed.shape[0], left.shape[0]]
labels = ['Not Churned', 'Churned']
explode = [0, 0.1]
plt.pie(
    slices,
    labels=labels,
    explode=explode,
    wedgeprops={'edgecolor': 'black'},
    shadow=True,
    autopct='%1.1f%%',
)
plt.title("success rate of retention calls")
plt.show()

**Elde tutma tekliflerinin başarı oranı?**

1. Elde tutma ne kadar etkili sunuyor?
2. Bekletme aramaları yapan kaç müşteri saklama teklifini kabul etti?
3. Elde tutma teklifine sahip kaç müşterinin gerçekten alıkonulduğu?

In [None]:
df['RetentionOffersAccepted'].isna().sum()

In [None]:
# Customers who accepted at least one retention offer, and their outcome
retention_offer_df = df[df['RetentionOffersAccepted'] > 0]
no_cust_with_ret_offer = retention_offer_df.shape[0]
offer_churned = retention_offer_df[retention_offer_df['Churn'] == 'Yes'].shape[0]
offer_retained = retention_offer_df[retention_offer_df['Churn'] == 'No'].shape[0]

print("Total customer accepted the retention offer {}".format(no_cust_with_ret_offer))
conversion_rate = no_cust_with_ret_offer / retention_df.shape[0]
print("conversion rate customer making call for retention offer then accepting it {}".format(conversion_rate))
print("No of customer churn after accepting retention offer {}".format(offer_churned))
print("No of customer retain after accepting retention offer {}".format(offer_retained))
print("Success rate of  retention offer {}".format(offer_retained / no_cust_with_ret_offer))

In [None]:
# Outcome for customers who accepted a retention offer. This section is about
# offers, so the chart is built from retention_offer_df; the previous version
# repeated the retention-call chart (retention_df) by copy-paste.
slices = [retention_offer_df[retention_offer_df['Churn'] == 'No'].shape[0],
          retention_offer_df[retention_offer_df['Churn'] == 'Yes'].shape[0]]
labels = ['Not Churned', 'Churned']
explode = [0, 0.1]
pie(slices, labels=labels, explode=explode, shadow=True, autopct='%1.1f%%', title="success rate of retention offers")

**Belirli bir gelir grubu müşterisini mi kaybediyoruz?**

In [None]:
df['IncomeGroup'].isna().sum()

In [None]:
income_groups=df['IncomeGroup'].unique()
income_groups.sort()

In [None]:
income_groups#print all income groups

In [None]:
#to create bar ghraph we need list containing no of customer for each income group
# For the bar graph: share of churned / retained customers in each income
# group, normalised by the size of the respective customer set.
churn_customer_per_group = [
    churn_df[churn_df['IncomeGroup'] == income].shape[0] / churn_df.shape[0]
    for income in income_groups
]
non_churn_customer_per_group = [
    non_churn_df[non_churn_df['IncomeGroup'] == income].shape[0] / non_churn_df.shape[0]
    for income in income_groups
]

In [None]:
width = 0.35  # the width of the bars
x = income_groups  # the label locations (income group codes)
half_width = width / 2

fig, ax = plt.subplots()
# Two bars per income group, placed side by side around the group position
rects1 = ax.bar(x - half_width, churn_customer_per_group, width, label='Churn')
rects2 = ax.bar(x + half_width, non_churn_customer_per_group, width, label='Non Churn')

# Axis labels, title and tick labels
ax.set_title('Customer per income group')
ax.set_ylabel('No Of customers')
ax.set_xticks(x)
ax.set_xticklabels(income_groups)
plt.xlabel("Income Groups")
ax.legend()

**Hangi kredi sinifinda musteri var?**
* **Eksik degerleri gorsellestirmek icin kabaca mod ile doldurduk. Model icin KNN impute kullanilacak**

In [None]:
# Missing credit ratings are replaced by the most frequent rating, for this
# chart only. The previous fillna call discarded its result, filled with a
# Series instead of a scalar, and ran after the groups had been collected.
credit_mode = df['CreditRating'].mode().iloc[0]
credit_groups = np.sort(df['CreditRating'].fillna(credit_mode).unique())
churn_credit = churn_df['CreditRating'].fillna(credit_mode)
non_churn_credit = non_churn_df['CreditRating'].fillna(credit_mode)

# Share of churned / retained customers in each credit group
churn_customer_per_group = [(churn_credit == credit).mean() for credit in credit_groups]
non_churn_customer_per_group = [(non_churn_credit == credit).mean() for credit in credit_groups]

x =  np.arange(len(credit_groups))  # the label locations
width = 0.35  # the width of the bars

fig, ax = plt.subplots()
rects1 = ax.bar(x - width/2, churn_customer_per_group, width, label='Churn')
rects2 = ax.bar(x + width/2, non_churn_customer_per_group, width, label='Non Churn')

# Add some text for labels, title and custom x-axis tick labels, etc.
ax.set_ylabel('No Of customers')
ax.set_title('Customer per credit group')
ax.set_xticks(x)
ax.set_xticklabels(credit_groups)
plt.xlabel("Credit Group")
ax.legend()

fig.tight_layout()
plt.show()

# Data Manipulation

**Alakasiz datalari silmek?**

1. Katki saglamazlar
2. Onları tutarsak, aslında hiçbir anlam ifade etmediklerinde bu kolonlarla ilişki bulmaya yol açabilir
3. Veri boyutunu artıracak ve bu da daha yüksek bellek ve işlemci hesaplamasına yol açacaktır. 

In [None]:
df.head()

**Sehirleri modele icin hazirladik. Lokasyon onemli olabilir**

In [None]:
# Derive two location features from ServiceArea (its first three and its last
# six characters), label-encode each of them, then drop the source column.
lb_make_States = LabelEncoder()
lb_make_City_Neighborhood = LabelEncoder()
service_area = df['ServiceArea']
df['States'] = lb_make_States.fit_transform(service_area.str[0:3].astype(str))
df['City_Neighborhood'] = lb_make_City_Neighborhood.fit_transform(service_area.str[-6:].astype(str))
df = df.drop(["ServiceArea"], axis=1)

**Binary ve Multi value sutunlar ayrildi**

In [None]:
# Sort text (object) columns into two buckets: columns with exactly two
# distinct values, and all other text columns.
binary_cols, multi_Value = [], []
for col in df.columns:
    if df[col].dtype != 'object':
        continue
    bucket = binary_cols if df[col].unique().shape[0] == 2 else multi_Value
    bucket.append(col)

In [None]:
binary_cols

In [None]:
multi_Value

In [None]:
for col in multi_Value:
    print(col , df[col].unique())

In [None]:
for col in binary_cols:
    print(col , df[col].unique())

%30'dan fazla bilinmeyen deger iceren sutunlarin modele pek bir etkisi olacagini dusunmuyorum


In [None]:
df[df['HandsetPrice']=='Unknown'].shape[0]/df.shape[0]

In [None]:
df[df['HandsetPrice']=='Unknown'].shape

In [None]:
df.drop('HandsetPrice',axis=1, inplace=True)
multi_Value.remove('HandsetPrice')

In [None]:
df[df['Homeownership']=='Unknown'].shape[0]/df.shape[0]

In [None]:
df.drop('Homeownership',axis=1, inplace=True)
binary_cols.remove('Homeownership')

In [None]:
df[df['MaritalStatus']=='Unknown'].shape[0]/df.shape[0]

In [None]:
df.drop('MaritalStatus',axis=1, inplace=True)
multi_Value.remove('MaritalStatus')

**Correlation Check**

1. Artik yuksek korelasyonlara bakabiliriz.Korelatif sutunlarda biri model icin cikartildi.
2. Fakat onemli olacagini dusundugum 'n', 'MonthsInService','MonthlyRevenue', 'MonthlyMinutes' sutunlari kaldi. 

In [None]:
def high_corr_and_check(X):
    """
    Print every pair of columns of X whose absolute correlation exceeds 0.7.

    Only numeric/boolean columns are correlated; text columns are ignored.
    Each pair is printed once as "(col_a, col_b) value", strongest first.

    Params:
    X :- pandas DataFrame to inspect
    """
    corr_matrix = X.select_dtypes(include=['number', 'bool']).corr().abs()
    # Keep only the strict upper triangle so that every pair appears once and
    # the diagonal is excluded. Plain bool is used because the np.bool alias
    # no longer exists in current NumPy releases.
    upper_mask = np.triu(np.ones(corr_matrix.shape, dtype=bool), k=1)
    sol = corr_matrix.where(upper_mask).stack().sort_values(ascending=False)
    for index, value in sol.items():
        if value > 0.7:
            print(index,value)
high_corr_and_check(df)

In [None]:
df = df.drop(['HandsetModels', 'UniqueSubs','PeakCallsInOut','OffPeakCallsInOut','DroppedBlockedCalls',
             'RetentionCalls','InboundCalls','OverageMinutes','ReceivedCalls','CustomerID'],axis=1)

In [None]:
high_corr_and_check(df)

**Cat to Numeric (LabelEncoder)**

Multi cevaplari icin Label encoder kullanildi.

In [None]:
# Label-encode every multi-valued text column (values are cast to str first).
# The single encoder object is re-fitted for each column.
LE_cat = LabelEncoder()
for column in multi_Value:
    as_text = df[column].astype(str)
    df[column] = LE_cat.fit_transform(as_text)
df

1. Yes no cevaplari icin getdummy metodu kullanildi.
2. Fakat Churn sutununu hedef degiskeni oldugu icin replace metodu kullanildi

In [None]:
# All binary columns except the target. The target is excluded by name rather
# than by position, so this no longer relies on 'Churn' being the first entry.
Binary_cols_expect_churn = [col for col in binary_cols if col != 'Churn']
df_yes_no = df[Binary_cols_expect_churn]
df_yes_no

In [None]:
dfDummies = pd.get_dummies(df_yes_no, prefix = df_yes_no.columns)
dfDummies.shape

In [None]:
df.shape

In [None]:
removed_binary_cols=df.drop(binary_cols,axis=1)
removed_binary_cols.head()

In [None]:
# Assemble the model frame: non-binary columns, the dummy-encoded binary
# columns and the target.
clean_dataframe = pd.concat([removed_binary_cols, dfDummies, df['Churn']], axis=1)
# Encode the target as 1/0. The mapped column is assigned back instead of
# calling replace(inplace=True) on a selected column (chained assignment).
# NOTE: any label other than 'Yes'/'No' becomes NaN.
clean_dataframe['Churn'] = clean_dataframe['Churn'].map({'Yes': 1, 'No': 0})

In [None]:
clean_dataframe.head()

**Missing Values (KNN Imputer)**

**KNN imputation metodu ile benzer ozellikteki degerler kullanilarak eksik veriler dolduruldu**

In [None]:
bar_df=df.isna().sum()
bar_df[bar_df>0]

In [None]:
plt.barh(bar_df[bar_df>0].index,bar_df[bar_df>0].values)
plt.tight_layout()
plt.show()

In [None]:
# Fill the remaining missing values with a 5-nearest-neighbour imputer and
# rebuild a DataFrame with the original column names.
imputer = KNNImputer(n_neighbors=5)
imputed_values = imputer.fit_transform(clean_dataframe)
df = pd.DataFrame(imputed_values, columns=clean_dataframe.columns)

In [None]:
bar_df=df.isna().sum()
bar_df[bar_df>0]

In [None]:
df

**Outlier Analysis (Local Outlier Factor)**

**LOF metodu ile secilen esik degerine gore outlier degerler model icin baskilandi.**

In [None]:
# Score every row with Local Outlier Factor and look at the 50 lowest
# (most outlying) negative outlier factors.
df_num = df.select_dtypes(include=['float64', 'int64'])
clf = LocalOutlierFactor(n_neighbors=20, contamination=0.1)
clf.fit(df_num)
df_scores = clf.negative_outlier_factor_
negative_outlier_factor = np.sort(df_scores)[:50]
negative_outlier_factor

In [None]:
plt.plot(negative_outlier_factor)
plt.show()

In [None]:
esik_deger = np.sort(df_scores)[26]
esik_deger

In [None]:
aykiri_tf = df_scores > esik_deger
aykiri_tf

In [None]:
yeni_df = df_num[df_scores > esik_deger]
yeni_df

In [None]:
baski_degeri = df_num[df_scores == esik_deger]
baski_degeri

In [None]:
ayrikilar = df_num[df_scores < esik_deger]

In [None]:
aykirilar = df_num[~aykiri_tf]
aykirilar

In [None]:
print(aykirilar.index)
len(aykirilar.index)

In [None]:
# Drop the rows flagged as outliers. Their index is taken from aykirilar
# rather than from a hard-coded list, which only matched one particular run.
df = df.drop(index=aykirilar.index)

In [None]:
# Replace each removed outlier by a copy of the threshold row. pd.concat is
# used because DataFrame.append no longer exists in current pandas releases.
df = pd.concat([df] + [baski_degeri] * len(aykirilar.index))

In [None]:
df

# Churn Analysis

**Once base model sonra hyperparameter optimizasyonu**

*Neden XGBoost?*

1. XGBoost her değişkene göre kazanç skorunu en yüksek yapacak şekilde olası tüm senaryolarda karar ağaçları kurar. Bu tür algoritmalara “Greedy Algorithm” denir.XGBoost, verideki her değeri incelemek yerine veriyi parçalara(quantile) böler ve bu parçalara göre çalışır. Parça miktarı arttırıldıkça algoritma daha küçük aralıklara bakacak ve daha iyi tahminleme yapacaktır. Tabi ki bu durum modelin öğrenme süresi de artacaktır.Parça sayısı varsayılan olarak 33 tanedir.Bu yaklaşımın problemi tabii ki performans sorunudur. Parçaları belirlemek için her bir kolon sıralanmalı, parçaların sınırları belirlenmeli ve ağaçlar kurulmalıdır. Bu durum yavaşlığa sebep olur.Sorunu aşmak için “Sketches” adı verilen algoritma kullanılır. Amacı parçaları bulmak için yakınsama yapmasıdır.XGboost Ağırlıklandırılmış Sketches Algoritması ile parçaların sınırlarını belirler.Ağırlık = Önceki Değer * (1 – Önceki Değer) olarak belirlenir. Ağırlık ne kadar fazlaysa tahmin o kadar kararsızdır. Parçalar bu ağırlıklara göre belirlenir. Ağırlıklar yaklaşık olacak şekilde parçalara bölünür. Bu sayede kararsız tahmin değerlerinin olduğu dallar daha küçük aralıklara bölünmüş olur. Bu durum daha doğru tahminlemeye yardımcı olacaktır.
2. Bilgisayarlarımızda hard disk, RAM ve önbellek gibi farklı bellek türleri bulunmaktadır. Ön bellek, bellekler arasında en hızlı kullanılan ancak en küçük alana sahip olandır. Bir programın hızlı çalışması isteniyorsa ön bellek maksimum seviyede kullanılmalıdır.XGBoost benzerlik skoru ve ağaç çıktılarını(output value) ön bellekte hesaplatır. Bu sebeple hızlı hesaplamalar yapılabilmektedir.

**XGBoost parametreleri**

1. 'n_estimators':modelde kurulacak ağaç sayısı.  (secilen degerler :[100, 500, 1000])
2. 'subsample':herbir ağacı oluşturmak için alınan satır oranı (secilen degerler :[0.6, 0.8, 1.0])
3. 'max_depth':ağacın derinliği                (secilen degerler :[3, 4, 5,6])
4. 'learning_rate': her agacin katkisini olcekleyen adim buyuklugu (ogrenme orani)           (secilen degerler :[0.1,0.01,0.02,0.05])

**Base Model**

In [None]:
y = df["Churn"]
X = df.drop(["Churn"], axis=1)
X_train, X_test, y_train, y_test = train_test_split(X, y,test_size=0.20, random_state=42)

In [None]:
xgb_model = XGBClassifier().fit(X_train, y_train)
y_pred = xgb_model.predict(X_test)

In [None]:
xgb_model

In [None]:
# Confusion matrix of the base model on the test split, drawn as a heatmap
cm = confusion_matrix(y_test, y_pred)
heatmap_options = dict(annot=True, fmt=".0f", linewidth=.3, square=True, cmap='PuBu')
sns.heatmap(cm, **heatmap_options)
plt.xlabel('Predicted label')
plt.ylabel('Actual label')
plt.show()

In [None]:
# xgb_model was already fitted in an earlier cell, so it is not fitted again.
model = xgb_model
y_pred = model.predict(X_test)
# The ROC curve needs continuous scores: with hard 0/1 predictions the curve
# collapses to a single operating point and the AUC is understated.
y_score = model.predict_proba(X_test)[:, 1]
fpr, tpr, threshold = metrics.roc_curve(y_test, y_score)
roc_auc = metrics.auc(fpr, tpr)
plt.title('Receiver Operating Characteristic')
plt.plot(fpr, tpr, 'b', label = 'AUC = %0.2f' % roc_auc)
plt.legend(loc = 'lower right')
plt.plot([0, 1], [0, 1],'r--')
plt.xlim([0, 1])
plt.ylim([0, 1])
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
plt.show()
# The classification report is still based on the hard class predictions
print(classification_report(y_test,y_pred))

**Dataset imbalanced oldugu icin '0' lar fazla '1' degerleri icin sorun yasiyoruz** 

* Undersampling ya da Oversampling gerekli. 
* UnderSampling metodunda patern kaybi oldugu icin Oversampling uygulandi

**AUC = 0.67**

 **Recall degeri bizim icin cok onemli**

**base modelde recall =0.56 1'ler icin recall = 0.20**

**OVERSAMPLING**

In [None]:
y = df['Churn'].values
X = df.drop(['Churn'], axis=1)

In [None]:
# A fixed seed makes the synthetic samples reproducible between runs.
# NOTE(review): the whole dataset is resampled here and only split into
# train/test in a later cell, so synthetic training rows can be derived from
# rows that end up in the test set. Consider splitting first and resampling
# only the training part.
oversample = BorderlineSMOTE(random_state=42)
X_resampled, y_resampled = oversample.fit_resample(X, y)

In [None]:
print('original dataset shape:', Counter(y))
print('Resample dataset shape', Counter(y_resampled))

**Balance olan Data optimize edebilmek icin 5 katli CV kullanildi**

**Optimizasyon**

In [None]:
!nvidia-smi

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled,test_size=0.20, random_state=42)

In [None]:
# Exhaustive 5-fold grid search over the XGBoost hyper-parameters
xgb_params = dict(
    n_estimators=[100, 500, 1000],
    subsample=[0.6, 0.8, 1.0],
    max_depth=[3, 4, 5, 6],
    learning_rate=[0.1, 0.01, 0.02, 0.05],
)
xgb = XGBClassifier(tree_method='gpu_hist', predictor='gpu_predictor')
xgb_cv_model = GridSearchCV(estimator=xgb, param_grid=xgb_params, cv=5, verbose=2)
xgb_cv_model.fit(X_train, y_train)
print("En iyi Parametreler : ", xgb_cv_model.best_params_)

In [None]:
# Refit a classifier with the best parameter combination found by the grid
# search (learning_rate, max_depth, n_estimators, subsample).
best_params = xgb_cv_model.best_params_
xgb = XGBClassifier(tree_method='gpu_hist', predictor='gpu_predictor', **best_params)

xgb_tuned = xgb.fit(X_train, y_train)
y_pred = xgb_tuned.predict(X_test)

In [None]:
xgb_tuned

In [None]:
# The ROC curve needs continuous scores: with hard 0/1 predictions the curve
# collapses to a single operating point and the AUC is understated.
y_score = xgb_tuned.predict_proba(X_test)[:, 1]
fpr, tpr, threshold = metrics.roc_curve(y_test, y_score)
roc_auc = metrics.auc(fpr, tpr)
plt.title('Receiver Operating Characteristic')
plt.plot(fpr, tpr, 'b', label = 'AUC = %0.2f' % roc_auc)
plt.legend(loc = 'lower right')
plt.plot([0, 1], [0, 1],'r--')
plt.xlim([0, 1])
plt.ylim([0, 1])
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
plt.show()
# The classification report is still based on the hard class predictions
print(classification_report(y_test,y_pred))

In [None]:
plot_confusion_matrix(xgb_tuned, 
                      X_test, 
                      y_test,
                      values_format='d',
                      display_labels=["Churn degil", "Churn"])

**Imbalance giderildigi icin RECALL = 0.66'ya, AUC = 0.81'e yukseldi**

**Feature of importance Holdout data icin cok onemli**

In [None]:
# Pair every feature name with its importance (as a percentage), keep the
# first 62 entries and print them from most to least important.
featureImpList = [
    [feature, importance * 100]
    for feature, importance in zip(X, xgb_tuned.feature_importances_)
]

fT_df = pd.DataFrame(featureImpList, columns=['Feature', 'Importance'])[:62]
print(fT_df.sort_values('Importance', ascending=False))

**Holdout**

In [None]:
test=pd.read_csv("/kaggle/input/datasets-for-churn-telecom/cell2cellholdout.csv")

In [None]:
# Apply to the holdout set the same preprocessing steps as to the training set.
# NOTE(review): the label encoders below are fitted on the holdout data, so the
# integer codes are not guaranteed to match the codes seen during training.
test['ChurnSubs'] = test['UniqueSubs'] - test['ActiveSubs']

# Location features derived from ServiceArea
lb_make_States_test = LabelEncoder()
lb_make_City_Neighborhood_test = LabelEncoder()
test['States'] = test['ServiceArea'].str[0:3]
test['City_Neighborhood'] = test['ServiceArea'].str[-6:]
# Use the holdout encoder here; the training encoder lb_make_States was being
# re-fitted (and thereby overwritten) by mistake.
test['States'] = lb_make_States_test.fit_transform(test['States'].astype(str))
test['City_Neighborhood'] = lb_make_City_Neighborhood_test.fit_transform(test['City_Neighborhood'].astype(str))
test = test.drop(["ServiceArea"], axis=1)

# Split text columns into binary and multi-valued ones
binary_cols_test = []
multi_Value_test = []
for col in test.columns:
    if test[col].dtype == 'object':
        if test[col].unique().shape[0] == 2:
            binary_cols_test.append(col)
        else:
            multi_Value_test.append(col)

# Drop the columns that were removed from the training set for having too
# many 'Unknown' values
test.drop('HandsetPrice', axis=1, inplace=True)
multi_Value_test.remove('HandsetPrice')
test.drop('Homeownership', axis=1, inplace=True)
binary_cols_test.remove('Homeownership')
test.drop('MaritalStatus', axis=1, inplace=True)
multi_Value_test.remove('MaritalStatus')

# Label-encode the multi-valued columns of the holdout set (the loop used to
# run over the training list multi_Value instead of multi_Value_test)
LE_cat_test = LabelEncoder()
for i in multi_Value_test:
    test[i] = LE_cat_test.fit_transform(test[i].astype(str))

# Dummy-encode the binary columns and assemble the final frame
Binary_cols_expect_churn_test = binary_cols_test[:]
test_yes_no_test = test[Binary_cols_expect_churn_test]
testDummies_test = pd.get_dummies(test_yes_no_test, prefix = test_yes_no_test.columns)
removed_binary_cols_test = test.drop(binary_cols_test, axis=1)
clean_dataframe_test = pd.concat([removed_binary_cols_test, testDummies_test], axis=1)
clean_dataframe_test = clean_dataframe_test.drop(['Churn'], axis=1)

# Impute the remaining missing values
imputer = KNNImputer(n_neighbors=5)
test = pd.DataFrame(imputer.fit_transform(clean_dataframe_test), columns = clean_dataframe_test.columns)
test

In [None]:
# Select the model features from the holdout set. An explicit copy is taken
# so that adding the prediction column does not write into a slice of test.
predictor = test[fT_df['Feature']].copy()
# Predict on the feature columns only, then put the label in first position
Score = pd.Series(xgb_tuned.predict(predictor), index=predictor.index, name='label')
predictor.insert(0, 'label', Score)
predictor_sort = predictor.sort_values(by=['label'],ascending=False)
predictor_sort.head()

In [None]:
churn_sayisi=len(predictor_sort[predictor_sort['label']==1.0])

In [None]:
# Number and percentage of holdout customers predicted to churn
churn_adedi = int(churn_sayisi)
churn_orani = churn_adedi / len(test) * 100
print("Churn olacak kisi sayisi:", churn_adedi)
print("Churn olacak kisi sayisi orani:", churn_orani)

# Customer Segmentation

**MonthlyRevenue : Ne kadar harciyor?**

**MonthlyMinutes : Kac dk konusuyor?**

**MonthsInService : Ne kadardir bizde?**

In [None]:
def order_cluster(cluster_field_name, target_field_name, df, ascending):
    """Relabel clusters so their ids follow the ordering of a target column.

    The clusters are ranked by the mean of ``target_field_name``; each row's
    cluster id is replaced by the rank (0, 1, 2, ...) of its cluster.

    Args:
        cluster_field_name: Name of the column holding the current cluster ids.
        target_field_name: Name of the numeric column whose per-cluster mean
            decides the ranking.
        df: Input DataFrame; it is not modified.
        ascending: If True, rank 0 goes to the cluster with the smallest mean;
            if False, to the cluster with the largest mean.

    Returns:
        A new DataFrame with the same rows, in the same order and with the same
        index as ``df``, where ``cluster_field_name`` (moved to the last column)
        holds the ranked cluster ids.
    """
    cluster_means = df.groupby(cluster_field_name)[target_field_name].mean()
    ordered_labels = cluster_means.sort_values(ascending=ascending).index
    rank_of_label = {label: rank for rank, label in enumerate(ordered_labels)}

    # Map labels row by row instead of merging: a merge resets the index and,
    # on older pandas versions, can reorder the rows, so a caller that assigns
    # the result column back by index would attach clusters to the wrong rows.
    df_final = df.drop(columns=[cluster_field_name])
    df_final[cluster_field_name] = df[cluster_field_name].map(rank_of_label)
    return df_final

In [None]:
# Base frame for the segmentation: customer id plus the three metrics.
# .copy() because later cells add cluster, score and segment columns to it,
# which should not be writes into a selection of `test`.
df_segmentation = test[['CustomerID','MonthlyRevenue','MonthlyMinutes','MonthsInService']].copy()
df_segmentation

**MonthlyMinutes : Kac dk konusuyor?**

In [None]:
# Independent two-column frame: customer id and monthly minutes.
minutes_columns = ['CustomerID', 'MonthlyMinutes']
CustomerperMonthlyMinutes = test[minutes_columns].copy()
CustomerperMonthlyMinutes

In [None]:
# Histogram of MonthlyMinutes, rendered inline with plotly offline.
plot_data = [go.Histogram(x=CustomerperMonthlyMinutes['MonthlyMinutes'])]
plot_layout = go.Layout(title='MonthlyMinutes')
fig = go.Figure(data=plot_data, layout=plot_layout)
pyoff.iplot(fig)

In [None]:
# Elbow analysis: K-Means inertia for k = 1..9 on MonthlyMinutes alone.
sse = {}
# Keep only the feature column; .copy() so a later cell can add the cluster
# column to an independent frame.
CustomerperMonthlyMinutes = CustomerperMonthlyMinutes[['MonthlyMinutes']].copy()
minutes_feature = CustomerperMonthlyMinutes[['MonthlyMinutes']]
for k in range(1, 10):
    # Fit on the feature column only. Writing the labels back into the fitted
    # frame would make the labels of iteration k a second input feature for
    # iteration k+1 and distort the curve.
    kmeans = KMeans(n_clusters=k, max_iter=1000).fit(minutes_feature)
    sse[k] = kmeans.inertia_
plt.figure()
plt.plot(list(sse.keys()), list(sse.values()))
plt.xlabel("Number of cluster")
plt.ylabel("SSE (inertia)")
plt.show()

In [None]:
# Cluster customers into 3 groups by MonthlyMinutes and store the label per row.
minutes_only = CustomerperMonthlyMinutes[['MonthlyMinutes']]
kmeans = KMeans(n_clusters=3)
kmeans.fit(minutes_only)
CustomerperMonthlyMinutes['MonthlyMinutesCluster'] = kmeans.predict(minutes_only)

In [None]:
# Display the minutes frame, including its MonthlyMinutesCluster label column.
CustomerperMonthlyMinutes

In [None]:
# Renumber the minutes clusters so that a higher id means more monthly minutes.
df_segmentation_MonthlyMinutes = order_cluster(
    cluster_field_name='MonthlyMinutesCluster',
    target_field_name='MonthlyMinutes',
    df=CustomerperMonthlyMinutes,
    ascending=True,
)

In [None]:
# Summary statistics of MonthlyMinutes for each ordered cluster.
df_segmentation_MonthlyMinutes.groupby('MonthlyMinutesCluster')['MonthlyMinutes'].describe()

**MonthlyRevenue : Ne kadar harciyor?** 

In [None]:
# Independent two-column frame: customer id and monthly revenue.
revenue_columns = ['CustomerID', 'MonthlyRevenue']
CustomerperMonthlyRevenue = test[revenue_columns].copy()
CustomerperMonthlyRevenue

In [None]:
# Histogram of MonthlyRevenue, rendered inline with plotly offline.
plot_data = [go.Histogram(x=CustomerperMonthlyRevenue['MonthlyRevenue'])]
plot_layout = go.Layout(title='MonthlyRevenue')
fig = go.Figure(data=plot_data, layout=plot_layout)
pyoff.iplot(fig)

In [None]:
# Elbow analysis: K-Means inertia for k = 1..9 on MonthlyRevenue alone.
sse = {}
# Keep only the feature column; .copy() so a later cell can add the cluster
# column to an independent frame.
CustomerperMonthlyRevenue = CustomerperMonthlyRevenue[['MonthlyRevenue']].copy()
revenue_feature = CustomerperMonthlyRevenue[['MonthlyRevenue']]
for k in range(1, 10):
    # Fit on the feature column only. Writing the labels back into the fitted
    # frame would make the labels of iteration k a second input feature for
    # iteration k+1 and distort the curve.
    kmeans = KMeans(n_clusters=k, max_iter=1000).fit(revenue_feature)
    sse[k] = kmeans.inertia_
plt.figure()
plt.plot(list(sse.keys()), list(sse.values()))
plt.xlabel("Number of cluster")
plt.ylabel("SSE (inertia)")
plt.show()

In [None]:
# Cluster customers into 3 groups by MonthlyRevenue and store the label per row.
revenue_only = CustomerperMonthlyRevenue[['MonthlyRevenue']]
kmeans = KMeans(n_clusters=3)
kmeans.fit(revenue_only)
CustomerperMonthlyRevenue['MonthlyRevenueCluster'] = kmeans.predict(revenue_only)

In [None]:
# Renumber the revenue clusters so that a higher id means more monthly revenue.
df_segmentation_MonthlyRevenue = order_cluster(
    cluster_field_name='MonthlyRevenueCluster',
    target_field_name='MonthlyRevenue',
    df=CustomerperMonthlyRevenue,
    ascending=True,
)

In [None]:
# Summary statistics of MonthlyRevenue for each ordered cluster.
df_segmentation_MonthlyRevenue.groupby('MonthlyRevenueCluster')['MonthlyRevenue'].describe()

**MonthsInService : Ne kadardir bizde?**

In [None]:
# Independent two-column frame: customer id and months in service.
tenure_columns = ['CustomerID', 'MonthsInService']
CustomerperMonthsInService = test[tenure_columns].copy()
CustomerperMonthsInService

In [None]:
# Histogram of MonthsInService, rendered inline with plotly offline.
plot_data = [go.Histogram(x=CustomerperMonthsInService['MonthsInService'])]
plot_layout = go.Layout(title='MonthsInService')
fig = go.Figure(data=plot_data, layout=plot_layout)
pyoff.iplot(fig)

In [None]:
# Elbow analysis: K-Means inertia for k = 1..9 on MonthsInService alone.
sse = {}
# Keep only the feature column; .copy() so a later cell can add the cluster
# column to an independent frame.
CustomerperMonthsInService = CustomerperMonthsInService[['MonthsInService']].copy()
tenure_feature = CustomerperMonthsInService[['MonthsInService']]
for k in range(1, 10):
    # Fit on the feature column only. Writing the labels back into the fitted
    # frame would make the labels of iteration k a second input feature for
    # iteration k+1 and distort the curve.
    kmeans = KMeans(n_clusters=k, max_iter=1000).fit(tenure_feature)
    sse[k] = kmeans.inertia_
plt.figure()
plt.plot(list(sse.keys()), list(sse.values()))
plt.xlabel("Number of cluster")
plt.ylabel("SSE (inertia)")
plt.show()

In [None]:
# Cluster customers into 3 groups by MonthsInService and store the label per row.
tenure_only = CustomerperMonthsInService[['MonthsInService']]
kmeans = KMeans(n_clusters=3)
kmeans.fit(tenure_only)
CustomerperMonthsInService['MonthsInServiceCluster'] = kmeans.predict(tenure_only)

In [None]:
# Renumber the tenure clusters so that a higher id means more months in service.
df_segmentation_MonthsInService = order_cluster(
    cluster_field_name='MonthsInServiceCluster',
    target_field_name='MonthsInService',
    df=CustomerperMonthsInService,
    ascending=True,
)

In [None]:
# Summary statistics of MonthsInService for each ordered cluster.
df_segmentation_MonthsInService.groupby('MonthsInServiceCluster')['MonthsInService'].describe()

**Overall**

In [None]:
# Copy the three ordered cluster ids into the segmentation frame.
# NOTE(review): the assignment aligns rows on the DataFrame index, so the
# order_cluster outputs must carry the same index/row order as df_segmentation.
ordered_cluster_sources = (
    ('MonthlyRevenueCluster', df_segmentation_MonthlyRevenue),
    ('MonthlyMinutesCluster', df_segmentation_MonthlyMinutes),
    ('MonthsInServiceCluster', df_segmentation_MonthsInService),
)
for cluster_column, clustered_frame in ordered_cluster_sources:
    df_segmentation[cluster_column] = clustered_frame[cluster_column]


In [None]:
# Preview the first rows of the segmentation frame with the cluster columns added.
df_segmentation.head()

In [None]:
# Overall score = sum of the three ordered cluster ids.
df_segmentation['OverallScore'] = df_segmentation['MonthsInServiceCluster'] + df_segmentation['MonthlyMinutesCluster'] + df_segmentation['MonthlyRevenueCluster']
# Mean of each metric per overall score. The columns are selected with a list:
# selecting several groupby columns with a bare tuple is deprecated and raises
# an error in pandas 2.x.
df_segmentation.groupby('OverallScore')[['MonthsInService','MonthlyMinutes','MonthlyRevenue']].mean()


In [None]:
# Label each customer by overall score: default 'Low-Value', score > 1 becomes
# 'Mid-Value', score > 3 becomes 'High-Value' (applied in that order).
df_segmentation['Segment'] = 'Low-Value'
for score_floor, segment_label in ((1, 'Mid-Value'), (3, 'High-Value')):
    above_floor = df_segmentation['OverallScore'] > score_floor
    df_segmentation.loc[above_floor, 'Segment'] = segment_label

In [None]:
# Display the segmentation frame including the Segment column.
df_segmentation

# Segmentation Report

Her 3 gruba da farkli cozumler sunulmus olup maliyet hesabi yapilmistir.
* Ust segment: kaybetmeyi goze alamayacagimiz musterilere 12 ay taahhut sozuyle her ay 100 dk verilirse ve musteri cagri merkezinin teklifini kabul ederse
* Orta segment: ust segmente gecmesini hedefledigimiz musterilere 12 ay taahhudune aylik ucretlerinde %5 indirim ve her ay 100 dk hediye verilirse ve musteri cagri merkezinin teklifini kabul ederse
* Alt segment: kazanmaya calistigimiz musterilere 12 ay taahhudune 2000 dk bedava konusma verilirse ve musteri cagri merkezinin teklifini kabul ederse


In [None]:
# Attach each customer's Segment label to the full feature frame, matching rows on CustomerID.
df = test.set_index(keys='CustomerID')
df_segmentation = df_segmentation.set_index(keys='CustomerID')
Segment = df_segmentation.loc[:, 'Segment']
df_values = df.join(other=Segment, how='left')

**Ust segment**

In [None]:
# Customers labelled 'High-Value'.
high_value_mask = df_values['Segment'].eq('High-Value')
High_Value = df_values.loc[high_value_mask]
High_Value

In [None]:
# Predict churn for the high-value customers with `xgb_tuned`, using only the
# columns listed in fT_df['Feature'].
# .copy() gives an independent frame, so inserting the Score column does not
# write into a selection of df_values.
High_Value = High_Value[fT_df['Feature']].copy()
Label = pd.Series(xgb_tuned.predict(High_Value), index=High_Value.index, name='Score')
# Insert the prediction as the first column directly instead of appending,
# dropping and re-inserting it.
High_Value.insert(0, 'Score', Label)
High_Value_sort = High_Value.sort_values(by=['Score'], ascending=False)
High_Value_sort.head()

In [None]:
# High-value customers predicted to churn (Score == 1), and how many there are.
is_predicted_churn = High_Value_sort['Score'] == 1
High_Value_sort_churn = High_Value_sort.loc[is_predicted_churn]
len(High_Value_sort_churn)

**12 ay taahhut sozuyle her ay 100 dk verilirse**

In [None]:
# What-if scenario for high-value predicted churners: 12 more months in
# service, +100 monthly minutes, one accepted retention offer and a call to the
# retention team; then re-score with `xgb_tuned`.
# .copy() so the edits below go to an independent frame, not a selection.
High_Value_sort_churn = High_Value_sort_churn[fT_df['Feature']].copy()
High_Value_sort_churn['MonthsInService'] = High_Value_sort_churn['MonthsInService'] + 12
High_Value_sort_churn['MonthlyMinutes'] = High_Value_sort_churn['MonthlyMinutes'] + 100
High_Value_sort_churn['RetentionOffersAccepted'] = High_Value_sort_churn['RetentionOffersAccepted'] + 1
# One-hot flag: set it to 1 rather than incrementing it, so customers who
# already had the flag do not end up with the out-of-range value 2.
High_Value_sort_churn['MadeCallToRetentionTeam_Yes'] = 1.0
Score = pd.Series(xgb_tuned.predict(High_Value_sort_churn), index=High_Value_sort_churn.index, name='Score')
High_Value_sort_churn.insert(0, 'Score', Score)
High_Value_sort_churn_sort = High_Value_sort_churn.sort_values(by=['Score'], ascending=False)
# Show the sorted frame (the unsorted one was displayed before, leaving the sort unused).
High_Value_sort_churn_sort.head()

In [None]:
# Customers the campaign flips from predicted churn to predicted stay (Score == 0).
saved_customers = High_Value_sort_churn[High_Value_sort_churn['Score'] == 0]
churnden_kurtulan_kampanya_sayisi = len(saved_customers)
# Sum the revenue of the saved customers only; summing over every predicted
# churner would also count customers the campaign did not retain.
churnden_kurtulan_kazanc = saved_customers['MonthlyRevenue'].sum()
print('churnden kurtulan kampanya sayisi',churnden_kurtulan_kampanya_sayisi)
print('churnden kurtulan kazanc',int(churnden_kurtulan_kazanc))

**Orta segment**

In [None]:
# Customers labelled 'Mid-Value'.
mid_value_mask = df_values['Segment'].eq('Mid-Value')
Mid_Value = df_values.loc[mid_value_mask]
Mid_Value

In [None]:
# Predict churn for the mid-value customers with `xgb_tuned`, using only the
# columns listed in fT_df['Feature'].
# .copy() gives an independent frame, so inserting the Score column does not
# write into a selection of df_values.
Mid_Value = Mid_Value[fT_df['Feature']].copy()
Label = pd.Series(xgb_tuned.predict(Mid_Value), index=Mid_Value.index, name='Score')
# Insert the prediction as the first column directly instead of appending,
# dropping and re-inserting it.
Mid_Value.insert(0, 'Score', Label)
Mid_Value_sort = Mid_Value.sort_values(by=['Score'], ascending=False)
Mid_Value_sort.head()

In [None]:
# Mid-value customers predicted to churn (Score == 1), and how many there are.
is_predicted_churn = Mid_Value_sort['Score'] == 1
Mid_Value_sort_churn = Mid_Value_sort.loc[is_predicted_churn]
len(Mid_Value_sort_churn)

**12 ay taahhudune aylik ucretlerinde %5 indirim ve her ay 100 dk hediye verilirse**

In [None]:
# What-if scenario for mid-value predicted churners: 12 more months in service,
# +100 monthly minutes, one accepted retention offer and a call to the
# retention team; then re-score with `xgb_tuned`.
# NOTE(review): the scenario heading also promises a 5% discount on the monthly
# fee, but no revenue/fee column is changed here -- confirm whether
# MonthlyRevenue should be scaled by 0.95.
# .copy() so the edits below go to an independent frame, not a selection.
Mid_Value_sort_churn = Mid_Value_sort_churn[fT_df['Feature']].copy()
Mid_Value_sort_churn['MonthsInService'] = Mid_Value_sort_churn['MonthsInService'] + 12
Mid_Value_sort_churn['MonthlyMinutes'] = Mid_Value_sort_churn['MonthlyMinutes'] + 100
Mid_Value_sort_churn['RetentionOffersAccepted'] = Mid_Value_sort_churn['RetentionOffersAccepted'] + 1
# One-hot flag: set it to 1 rather than incrementing it, so customers who
# already had the flag do not end up with the out-of-range value 2.
Mid_Value_sort_churn['MadeCallToRetentionTeam_Yes'] = 1.0
Score = pd.Series(xgb_tuned.predict(Mid_Value_sort_churn), index=Mid_Value_sort_churn.index, name='Score')
Mid_Value_sort_churn.insert(0, 'Score', Score)
Mid_Value_sort_churn_sort = Mid_Value_sort_churn.sort_values(by=['Score'], ascending=False)
# Show the sorted frame (the unsorted one was displayed before, leaving the sort unused).
Mid_Value_sort_churn_sort.head()

In [None]:
# Customers the campaign flips from predicted churn to predicted stay (Score == 0).
saved_customers = Mid_Value_sort_churn[Mid_Value_sort_churn['Score'] == 0]
churnden_kurtulan_kampanya_sayisi = len(saved_customers)
# Sum the revenue of the saved customers only; summing over every predicted
# churner would also count customers the campaign did not retain.
churnden_kurtulan_kazanc = saved_customers['MonthlyRevenue'].sum()
print('churnden kurtulan kampanya sayisi',churnden_kurtulan_kampanya_sayisi)
print('churnden kurtulan kazanc',int(churnden_kurtulan_kazanc))

**Alt segment**

In [None]:
# Customers labelled 'Low-Value'.
low_value_mask = df_values['Segment'].eq('Low-Value')
Low_Value = df_values.loc[low_value_mask]
Low_Value

In [None]:
# Predict churn for the low-value customers with `xgb_tuned`, using only the
# columns listed in fT_df['Feature'].
# .copy() gives an independent frame, so inserting the Score column does not
# write into a selection of df_values.
Low_Value = Low_Value[fT_df['Feature']].copy()
Label = pd.Series(xgb_tuned.predict(Low_Value), index=Low_Value.index, name='Score')
# Insert the prediction as the first column directly instead of appending,
# dropping and re-inserting it.
Low_Value.insert(0, 'Score', Label)
Low_Value_sort = Low_Value.sort_values(by=['Score'], ascending=False)
Low_Value_sort.head()

In [None]:
# Low-value customers predicted to churn (Score == 1), and how many there are.
is_predicted_churn = Low_Value_sort['Score'] == 1
Low_Value_sort_churn = Low_Value_sort.loc[is_predicted_churn]
len(Low_Value_sort_churn)

**12 ay taahhudune 2000 dk bedava konusma verilirse**

In [None]:
# What-if scenario for low-value predicted churners: 12 more months in service,
# +500 monthly minutes, one accepted retention offer and a call to the
# retention team; then re-score with `xgb_tuned`.
# NOTE(review): the scenario heading says 2000 free minutes over the 12-month
# commitment, while 500 is added to MonthlyMinutes here -- confirm the
# intended per-month amount.
# .copy() so the edits below go to an independent frame, not a selection.
Low_Value_sort_churn = Low_Value_sort_churn[fT_df['Feature']].copy()
Low_Value_sort_churn['MonthsInService'] = Low_Value_sort_churn['MonthsInService'] + 12
Low_Value_sort_churn['MonthlyMinutes'] = Low_Value_sort_churn['MonthlyMinutes'] + 500
Low_Value_sort_churn['RetentionOffersAccepted'] = Low_Value_sort_churn['RetentionOffersAccepted'] + 1
# One-hot flag: set it to 1 rather than incrementing it, so customers who
# already had the flag do not end up with the out-of-range value 2.
Low_Value_sort_churn['MadeCallToRetentionTeam_Yes'] = 1.0
Score = pd.Series(xgb_tuned.predict(Low_Value_sort_churn), index=Low_Value_sort_churn.index, name='Score')
Low_Value_sort_churn.insert(0, 'Score', Score)
Low_Value_sort_churn_sort = Low_Value_sort_churn.sort_values(by=['Score'], ascending=False)
# Show the sorted frame (the unsorted one was displayed before, leaving the sort unused).
Low_Value_sort_churn_sort.head()

In [None]:
# Customers the campaign flips from predicted churn to predicted stay (Score == 0).
saved_customers = Low_Value_sort_churn[Low_Value_sort_churn['Score'] == 0]
churnden_kurtulan_kampanya_sayisi = len(saved_customers)
# Sum the revenue of the saved customers only; summing over every predicted
# churner would also count customers the campaign did not retain.
churnden_kurtulan_kazanc = saved_customers['MonthlyRevenue'].sum()
print('churnden kurtulan kampanya sayisi',churnden_kurtulan_kampanya_sayisi)
print('churnden kurtulan kazanc',int(churnden_kurtulan_kazanc))