# Telco Customer Churn
 
Cilj sprovedene analize jeste predviđanje ponašanja korisnika (da li će raskinuti ugovor sa telekomunikacionom kompanijom ili nastaviti saradnju), kao i procena uticaja pojedinih parametara na donošenje odluke o raskidu ugovora.

Data set za svakog korisnika sadrži sledeće podatke:
* da li je korisnik raskinuo ugovor u poslednjih mesec dana (feature Churn)
* informacije o servisima na koje je korisnik pretplaćen (phone, multiple lines, internet, online security, online backup, device protection, tech support, streaming TV and streaming movies)
* informacije o korisničkom ugovoru i plaćanjima (tenure - how many months he/she has been a customer, contract type, payment method, paperless billing, monthly charges and total charges)
* demografski podaci o korisniku (gender, age range, if he/she has a partner and dependents)

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.ticker import PercentFormatter
import seaborn as sns
import math
import phik
import shap
from scipy import stats
from collections import Counter
from statsmodels.stats.outliers_influence import variance_inflation_factor
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, StratifiedKFold
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.preprocessing import OneHotEncoder
from imblearn.under_sampling import CondensedNearestNeighbour, TomekLinks
from imblearn.over_sampling import SMOTE, ADASYN
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from xgboost.sklearn import XGBClassifier
from catboost import CatBoostClassifier
from tensorflow.keras import Sequential 
from tensorflow.keras.layers import Input, Dense
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import roc_curve, confusion_matrix, classification_report

# Apply seaborn's theme settings with a font scale of 1 to all figures
sns.set(font_scale=1)
# Do not truncate the column list when a DataFrame is displayed
pd.options.display.max_columns = None

In [None]:
# Load the CSV file into the working DataFrame
data = pd.read_csv("/kaggle/input/telco-customer-churn/WA_Fn-UseC_-Telco-Customer-Churn.csv")

In [None]:
# Preview the first 5 rows of the DataFrame
data.head(5)

In [None]:
# Column names, non-null counts and dtypes of the DataFrame
data.info()

In [None]:
# List the distinct values stored in the SeniorCitizen column
data['SeniorCitizen'].unique()

In [None]:
# SeniorCitizen is a categorical variable (for the plots, 0 is replaced with 'No' and 1 with 'Yes')
data['SeniorCitizen'] = data['SeniorCitizen'].map({0:'No', 1:'Yes'})

In [None]:
# Cast the TotalCharges feature to a numeric dtype
data['TotalCharges'] = pd.to_numeric(data['TotalCharges'], errors='coerce') # coerce - invalid values are replaced with np.nan

In [None]:
# Check whether the table contains missing values (count per column)
data.isnull().sum()

In [None]:
# Remove the rows whose TotalCharges value is missing, then renumber the index
data.dropna(subset=['TotalCharges'], inplace=True)
data.reset_index(drop=True, inplace=True)

In [None]:
# Check whether there are multiple entries for the same customer:
# the row count is compared with the number of distinct customer IDs
print(data.shape[0])
print(data.customerID.nunique())

In [None]:
# Count the duplicated rows.
# customerID is dropped first, so rows are compared on the remaining columns only.
data.drop('customerID',axis=1, inplace=True)
print(data.duplicated().sum())

In [None]:
# Delete duplicated rows (the first occurrence of each row is kept) and renumber the index
data.drop_duplicates(inplace=True)
data.reset_index(drop=True, inplace=True)

In [None]:
# col_num - numeric variables
# col_cat - categorical variables (object dtype)
col_num = data.select_dtypes(include=np.number).columns
col_cat = data.select_dtypes(include='object').columns

In [None]:
# Summary statistics of the numeric features
data.describe()

## Vizualizacija podataka i statistička analiza - numerički feature-i ('Tenure', 'Monthly charges', 'Total charges')

In [None]:
# Two-row grid with one column per numeric feature:
#   top row    - box plot of the feature split by churn
#   bottom row - density estimate of the feature for each churn class
plt.figure(figsize=(19,8))

label = ['Tenure', 'Monthly charges', 'Total Charges']

for slot, (column, text) in enumerate(zip(col_num, label), start=1):
    ax = plt.subplot(2, 3, slot)
    sns.boxplot(data=data, x="Churn", y=column)
    ax.set_ylabel(text, fontsize=12)
    ax.grid(False)

    ax = plt.subplot(2, 3, slot + 3)
    # 'No' is drawn first so the legend entries below line up with the curves
    for churn_value in ('No', 'Yes'):
        sns.kdeplot(data=data[data.Churn==churn_value], x=column, fill=True, alpha=.5)
    ax.set_xlabel(text, fontsize=12)
    ax.set_ylabel('Density', fontsize=12)
    ax.legend(['No churn', 'Churn'], loc='upper right')
    ax.grid(False)

In [None]:
def plot_multivariate_dist(data, x_feature, y_feature):
    """Plot the joint density of two features, one panel per churn class.

    Args:
        data: DataFrame with a 'Churn' column holding 'No' / 'Yes' and the
            two feature columns.
        x_feature: name of the column drawn on the x axis.
        y_feature: name of the column drawn on the y axis.
    """
    plt.figure(figsize=(13,4))

    # (value of Churn, panel title) - left panel first
    panels = (('No', 'No churn'), ('Yes', 'Churn'))
    for slot, (churn_value, title) in enumerate(panels, start=1):
        axis = plt.subplot(1, 2, slot)
        subset = data[data.Churn==churn_value]
        sns.kdeplot(data=subset, x=x_feature, y=y_feature, fill=True, alpha=.5)
        axis.title.set_text(title)
        axis.grid(False)

In [None]:
# Joint densities for every pair of numeric features
feature_pairs = [('tenure', 'MonthlyCharges'), ('tenure', 'TotalCharges'), ('MonthlyCharges', 'TotalCharges')]
for x_name, y_name in feature_pairs:
    plot_multivariate_dist(data, x_name, y_name)

In [None]:
# Kruskal-Wallis test: compares each numeric feature between customers
# who stayed and customers who churned
no_churn = data[data['Churn']=='No']
churned = data[data['Churn']=='Yes']

kruskal_tenure_churn = stats.kruskal(no_churn['tenure'], churned['tenure'])
kruskal_monthlycharges_churn = stats.kruskal(no_churn['MonthlyCharges'], churned['MonthlyCharges'])
kruskal_totalcharges_churn = stats.kruskal(no_churn['TotalCharges'], churned['TotalCharges'])

reports = [
    ('Kruskal Wallis test - churn + tenure', kruskal_tenure_churn),
    ('Kruskal Wallis test - churn + monthly charges', kruskal_monthlycharges_churn),
    ('Kruskal Wallis test - churn + total charges', kruskal_totalcharges_churn),
]
for position, (title, outcome) in enumerate(reports):
    # an empty line separates consecutive reports; none is printed after the last one
    if position:
        print('')
    print(title)
    print(outcome)

In [None]:
# Correlation of the variables (Spearman) between the numeric features and
# the churn flag.
# Only the numeric columns plus the encoded target are handed to corr(): the
# frame still holds object columns at this point, and pandas >= 2.0 raises on
# them instead of silently skipping them. The encoded target lives in a
# temporary frame, so `data` itself is never modified.
corr_input = data[list(col_num)].assign(
    ChurnInt=data['Churn'].map({'No':0, 'Yes':1}).astype(np.uint8)
)
corr_num = corr_input.corr(method='spearman').round(2)
del corr_input

plt.figure(figsize=(7,5))
sns.heatmap(corr_num, annot=True, cmap='Blues')
# the result is bound to a name so the cell does not echo it
ytick = plt.yticks(rotation=0)

In [None]:
# Multicollinearity check (variance inflation factor) for every numeric feature
# NOTE(review): the matrix is passed without an intercept column - confirm this is intended
numeric_matrix = data[col_num].values
vif_data = pd.DataFrame({
    "feature": col_num,
    "VIF": [variance_inflation_factor(numeric_matrix, position)
            for position in range(len(col_num))],
})

print(vif_data)
del vif_data

In [None]:
# What the VIF values look like when TotalCharges is left out
reduced_features = ['tenure','MonthlyCharges']
reduced_matrix = data[reduced_features].values
vif_data = pd.DataFrame({
    "feature": reduced_features,
    "VIF": [variance_inflation_factor(reduced_matrix, position) for position in range(2)],
})

print(vif_data)
del vif_data

In [None]:
def plot_churn_per_feature_bins(data, feature, x_label, ylim=(0,100)):
    """Draw a bar chart of the churn percentage for each bin of a feature.

    Args:
        data: DataFrame with one row per (bin, churn value) combination,
            holding the bin column named by ``feature``, a 'Churn' column
            and a 'Percentage' column.
        feature: name of the column that holds the bins (x axis).
        x_label: text shown as the x axis label.
        ylim: (lower, upper) limits of the y axis.
    """
    # Read the frame that was passed in; the previous body ignored the
    # parameter and relied on a global named `df` being defined by the caller.
    churned = data[data['Churn']=='Yes']
    barplot = sns.barplot(data=churned, x=feature, y='Percentage', palette="Blues_d")
    barplot.set(ylim=ylim)
    barplot.set_xlabel(x_label, fontsize=12)
    barplot.set_ylabel('Churn %', fontsize=12)
    plt.show()

In [None]:
# Split the customers into 6 groups by tenure and show the churn percentage of each group
tenure_edges = list(range(0,84,12))
binned = data.assign(tenureBins=pd.cut(data['tenure'], tenure_edges))
churn_share = binned.groupby(['tenureBins'])['Churn'].value_counts(normalize=True)
df = churn_share.rename('Percentage').mul(100).reset_index()

plot_churn_per_feature_bins(df, 'tenureBins', 'Tenure [months]', (0,60))

del tenure_edges, binned, churn_share
del df

In [None]:
# For every contract type the customers are split into 6 groups by tenure
# and the churn percentage of each group is shown.
# The bins do not depend on the contract, so they are computed once on a
# temporary copy instead of adding and dropping a column on `data` in every
# iteration (an error inside the loop could otherwise leave the helper column behind).
binned = data.assign(tenureBins=pd.cut(data['tenure'], list(range(0,84,12))))

for contract in data['Contract'].unique():
    df = (binned[binned['Contract']==contract].groupby(['tenureBins'])['Churn'].value_counts(normalize=True)
                .rename('Percentage').mul(100).reset_index())

    plot_churn_per_feature_bins(df, 'tenureBins', contract + ' contract: tenure [months]', (0,60))

    del df

del binned

In [None]:
# Split the customers into 6 groups by monthly charge and show the churn percentage of each group
charge_edges = list(range(0,140,20))
binned = data.assign(MonthlyChargesBins=pd.cut(data['MonthlyCharges'], charge_edges))
churn_share = binned.groupby(['MonthlyChargesBins'])['Churn'].value_counts(normalize=True)
df = churn_share.rename('Percentage').mul(100).reset_index()

plot_churn_per_feature_bins(df, 'MonthlyChargesBins', 'Monthly charges', (0,50))

del charge_edges, binned, churn_share
del df

In [None]:
# For every contract type the customers are split into 6 groups by monthly
# charge and the churn percentage of each group is shown.
# The bins do not depend on the contract, so they are computed once on a
# temporary copy instead of adding and dropping a column on `data` in every
# iteration (an error inside the loop could otherwise leave the helper column behind).
binned = data.assign(MonthlyChargesBins=pd.cut(data['MonthlyCharges'], list(range(0,140,20))))

for contract in data['Contract'].unique():
    df = (binned[binned['Contract']==contract].groupby(['MonthlyChargesBins'])['Churn'].value_counts(normalize=True)
                .rename('Percentage').mul(100).reset_index())

    plot_churn_per_feature_bins(df, 'MonthlyChargesBins', contract + ' contract: monthly charges', (0,60))

    del df

del binned

## Vizualizacija podataka i statistička analiza (kategorički feature-i)

In [None]:
# Summary statistics of the non-numeric columns
data.describe(exclude=np.number)

In [None]:
# For every categorical feature: its distinct values and how often each one occurs
value_counters = [Counter(data[col]) for col in col_cat]
uniq_vals = [counter.keys() for counter in value_counters]
count = [counter.values() for counter in value_counters]

df = pd.DataFrame({'Feature': col_cat})
df['Unique values'] = uniq_vals
df['Values count'] = count

print(df.to_string())
del value_counters
del df

In [None]:
# Data transformation that prevents redundant features (this also matters
# for the dummy encoding and the model development): the "no service" level
# of each add-on column is folded into 'No'.
no_service_level = {
    'MultipleLines': 'No phone service',
    'OnlineSecurity': 'No internet service',
    'OnlineBackup': 'No internet service',
    'DeviceProtection': 'No internet service',
    'TechSupport': 'No internet service',
    'StreamingTV': 'No internet service',
    'StreamingMovies': 'No internet service',
}
for column, redundant in no_service_level.items():
    data[column] = data[column].map(lambda x: 'No' if x==redundant else x)

In [None]:
# For every categorical feature: distinct values, their counts and their
# share of all rows in percent (rounded to 2 decimals)
n_rows = data.shape[0]
value_counters = [Counter(data[col]) for col in col_cat]
uniq_vals = [counter.keys() for counter in value_counters]
count = [counter.values() for counter in value_counters]

df = pd.DataFrame({'Feature': col_cat})
df['Unique values'] = uniq_vals
df['Values count'] = count
df['Dist %'] = [[round(100.*i/n_rows,2) for i in occurrences] for occurrences in count]

print(df.to_string())
del value_counters
del uniq_vals
del count
del df

In [None]:
# Count plot of every categorical feature on a 6 x 3 grid
plt.figure(figsize=(18,24))

# Axis labels, in the same order as the columns in col_cat
label = ['Gender', 'Senior citizen', 'Partner', 'Dependents', 'Phone service',
         'Multiple lines', 'Internet service', 'Online security', 'Online backup',
         'Device protection', 'Tech support', 'Streaming TV', 'Streaming movies',
         'Contract', 'Paperless billing', 'Payment method', 'Churn']

for slot, (column, text) in enumerate(zip(col_cat, label), start=1):
    ax = plt.subplot(6, 3, slot)
    sns.countplot(data=data, x=column)
    ax.set_xlabel(text, fontsize=12)
    ax.tick_params('x', labelrotation=8)

plt.subplots_adjust(left=0.1, bottom=0.1, right=0.9,
                    top=0.9, wspace=0.3, hspace=0.3)

In [None]:
# Churn / no-churn percentage within every level of each categorical feature
plt.figure(figsize=(18,24))

# Every categorical column except the last one gets a panel (the last label
# in the `label` list is 'Churn', i.e. the target itself).
# The labels are sliced into a local list instead of trimming the shared
# `label` list in place, so re-running this cell does not keep shortening it.
feature_cols = col_cat[:-1]
feature_labels = label[:len(feature_cols)]

for slot, (column, text) in enumerate(zip(feature_cols, feature_labels), start=1):
    df = (data.groupby(column)['Churn'].value_counts(normalize=True)
            .rename('Percentage').mul(100).reset_index())

    ax = plt.subplot(6, 3, slot)
    barplot = sns.barplot(data=df, x=column, y='Percentage', hue='Churn')
    ax.set(ylim=(0, 100))
    ax.set_xlabel(text, fontsize=12)
    ax.tick_params('x', labelrotation=8)

    # write the percentage just above every bar
    for p in barplot.patches:
        barplot.annotate(format(p.get_height(), '.1f'),
                         (p.get_x() + p.get_width() / 2., p.get_height()),
                         ha='center', va='center',
                         xytext=(0, 9),
                         textcoords='offset points')

plt.subplots_adjust(left=0.1, bottom=0.1, right=0.9,
                    top=0.9, wspace=0.3, hspace=0.3)

In [None]:
def plot_boxplots(cols, labels):
    """Draw churn-split box plots of monthly charges and tenure per category.

    One figure row is produced for every entry of ``cols``: the left panel
    shows 'MonthlyCharges', the right panel 'tenure', both grouped by the
    categorical column and coloured by 'Churn'. The data is read from the
    module-level ``data`` frame.

    Args:
        cols: names of the categorical columns to plot.
        labels: x axis labels, in the same order as ``cols``.
    """
    n = len(cols)
    plt.figure(figsize=(12,4*n))

    # (column on the y axis, y axis label) - left panel first
    targets = (('MonthlyCharges', 'Monthly charges'), ('tenure', 'Tenure'))

    slot = 0
    for col, label in zip(cols, labels):
        for y_column, y_text in targets:
            slot += 1
            ax = plt.subplot(n, 2, slot)
            sns.boxplot(data=data, x=col, y=y_column, hue='Churn')
            ax.set_xlabel(label, fontsize=12)
            ax.set_ylabel(y_text, fontsize=12)
            # legend is placed just outside the right edge of the panel
            ax.legend(bbox_to_anchor=(1.01,1), borderaxespad=0)
            ax.tick_params('x', labelrotation=8)
            ax.grid(False)

    plt.subplots_adjust(left=0.1, bottom=0.1, right=0.9,
                        top=0.9, wspace=0.3, hspace=0.3)

In [None]:
# Box plots of monthly charges and tenure for the demographic features
cols = ['SeniorCitizen','gender','Partner','Dependents']
labels = ['Senior citizen', 'Gender','Partner','Dependents']

plot_boxplots(cols, labels)

In [None]:
# Point plots by partner / dependents / senior-citizen status for three
# quantities: the mean of a 0/1 churn flag, tenure and monthly charges
colors = ['C0', 'C1']
facet_options = dict(x='Partner', hue='Dependents', col='SeniorCitizen',
                     kind='point', dodge=True, palette=colors, height=3, aspect=1.5)

# temporary 0/1 churn flag, removed again right after the first plot
data['Percenet of churn'] = np.where(data['Churn']=='Yes', 1, 0)
ax = sns.catplot(data=data, y='Percenet of churn', **facet_options)
data.drop(['Percenet of churn'], axis=1, inplace=True)

for target in ('tenure', 'MonthlyCharges'):
    ax = sns.catplot(data=data, y=target, **facet_options)

In [None]:
# Box plots of monthly charges and tenure for the service-subscription features
cols = ['PhoneService', 'MultipleLines', 'InternetService',
        'OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport',
        'StreamingTV', 'StreamingMovies']
labels = ['Phone service', 'Multiple lines', 'Internet service',
          'Online security', 'Online backup', 'Device protection', 'Tech support',
          'Streaming TV', 'Streaming movies']

plot_boxplots(cols, labels)

In [None]:
def cramers_corr_coef(crosstab):
    """Return the bias-corrected Cramér's V of a contingency table.

    Args:
        crosstab: two-dimensional contingency table (e.g. the result of
            ``pd.crosstab``) with observed counts.

    Returns:
        The association strength as a float. 0.0 is returned when the
        corrected table dimension leaves nothing to normalise by, which is
        the case when one of the variables has a single level.
    """
    chi2 = stats.chi2_contingency(crosstab)[0]
    n = crosstab.sum().sum()
    r, k = crosstab.shape

    # phi squared and the table dimensions, each with the bias correction applied
    phi2corr = max(0, chi2/n - (k-1)*(r-1)/(n-1))
    rcorr = r - (r-1)**2/(n-1)
    kcorr = k - (k-1)**2/(n-1)

    denominator = min(kcorr-1, rcorr-1)
    if denominator <= 0:
        # a one-level variable cannot be associated with anything; without
        # this guard the expression below evaluates 0/0 and yields NaN
        return 0.0

    return np.sqrt(phi2corr/denominator)

def cramers_corr_matrix(data, col_cat):
    """Build the symmetric matrix of pairwise Cramér coefficients.

    Args:
        data: DataFrame that holds the categorical columns.
        col_cat: sequence of the categorical column names.

    Returns:
        A square numpy array with ones on the diagonal and the coefficient
        of columns i and j at positions [i][j] and [j][i].
    """
    n_cat = len(col_cat)
    cramers_corr = np.ones((n_cat, n_cat))

    # only the upper triangle is computed; each value is mirrored below the diagonal
    for first in range(n_cat):
        for second in range(first + 1, n_cat):
            if col_cat[first] == col_cat[second]:
                # identical column names keep the default value of 1
                continue
            table = pd.crosstab(data[col_cat[first]], data[col_cat[second]])
            coefficient = cramers_corr_coef(table)
            cramers_corr[first][second] = coefficient
            cramers_corr[second][first] = coefficient

    return cramers_corr

In [None]:
# Heatmap of the pairwise Cramér coefficients of the categorical features (rounded to 2 decimals)
cramers_corr = pd.DataFrame(cramers_corr_matrix(data,col_cat), columns=col_cat, index = col_cat).round(2)            
fig, ax = plt.subplots(figsize=(15,12))
sns.heatmap(cramers_corr, annot=True, ax=ax, cmap='Blues')

## Razvoj machine learning modela

In [None]:
# TotalCharges is excluded from modelling (see the VIF comparison earlier in the notebook)
data.drop('TotalCharges', axis=1, inplace=True)
col_num = list(col_num)
col_num.remove('TotalCharges')

# Dummy encoding of the categorical variables.
# The dtype is pinned to uint8: pandas >= 2.0 would otherwise emit bool
# indicator columns, and a frame that mixes bool and float columns converts
# to an object array, which numeric consumers may reject.
data = pd.get_dummies(data, drop_first=True, dtype=np.uint8)

# Split the data into train and test sets (stratified on the target)
X_train, X_test, y_train, y_test = train_test_split(data.drop(['Churn_Yes'], axis=1), data['Churn_Yes'], test_size = 0.20, random_state = 7, stratify=data['Churn_Yes'].values)

# Work on explicit copies so that writing the scaled values below does not
# assign into a slice of `data`
X_train = X_train.copy()
X_test = X_test.copy()

# Standardization of the numeric variables (fitted on the train set only)
scaler = StandardScaler()
X_train[col_num] = scaler.fit_transform(X_train[col_num])
X_test[col_num] = scaler.transform(X_test[col_num])

### Pomoćne funkcije za hyperparameter tuning i evaluaciju modela

In [None]:
def grid_search_cv(model_name, X_train, y_train, print_search_results=False):
    """Grid-search the hyperparameters of one of the supported classifiers.

    The search uses 5-fold stratified cross-validation (shuffled,
    random_state=7) and scores candidates by negative log loss. The best
    score and parameters are printed.

    Args:
        model_name: one of 'SVC', 'LogisticRegression',
            'RandomForestClassifier' or 'XGBClassifier'.
        X_train: training features.
        y_train: training target.
        print_search_results: when true, the mean and standard deviation of
            the test score are also printed for every parameter combination.

    Returns:
        The fitted GridSearchCV object.

    Raises:
        ValueError: if ``model_name`` is not one of the supported names.
    """
    if model_name == 'SVC':
        model = SVC(probability=True, class_weight='balanced')
        param_grid = {
            'kernel': ['poly', 'rbf', 'sigmoid'],
            'C': [50, 10, 1.0, 0.1, 0.01],
            'gamma': ['scale'],
        }
    elif model_name == 'LogisticRegression':
        model = LogisticRegression(class_weight='balanced')
        param_grid = {'C': [100, 10, 1.0, 0.1, 0.01]}
    elif model_name == 'RandomForestClassifier':
        model = RandomForestClassifier(class_weight='balanced')
        param_grid = {
            'n_estimators': [50, 100, 500, 1000],
            # a plain list of candidates; a stray trailing comma used to wrap
            # it in a tuple, making the whole list a single candidate value
            'max_depth': [4, 6, 8],
        }
    elif model_name == 'XGBClassifier':
        model = XGBClassifier(use_label_encoder=False, scale_pos_weight=4)
        param_grid = {
            'learning_rate': [0.001, 0.01, 0.1, 0.2, 0.3],
            'n_estimators': [50, 100, 500, 1000, 5000],
        }
    else:
        # fail with a clear message instead of an UnboundLocalError further down
        raise ValueError('Unsupported model name: %r' % (model_name,))

    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=7)
    search = GridSearchCV(model, param_grid, scoring='neg_log_loss', cv=cv)
    search_result = search.fit(X_train, y_train)

    print('Best result:\n%f\nBest parameters:\n%s' %(search_result.best_score_, search_result.best_params_))

    if print_search_results:
        means = search_result.cv_results_['mean_test_score']
        stds = search_result.cv_results_['std_test_score']
        parameters = search_result.cv_results_['params']
        for mean, stdev, param in zip(means, stds, parameters):
            print("%.3f (%.3f) with: %r" % (mean, stdev, param))

    return search_result

In [None]:
def best_threshold(target, probability):
    """Return the probability cut-off with the highest F1 score.

    The candidates are the values of ``np.arange(0.05, 1, 0.05)``; a sample
    is predicted positive when its probability is above the cut-off. On ties
    the lowest cut-off wins.

    Args:
        target: true labels.
        probability: predicted probabilities of the positive class.

    Returns:
        The winning cut-off, or None when no candidate reaches an F1 score
        above zero.
    """
    winner, top_f1 = None, 0

    for cutoff in np.arange(0.05, 1, 0.05):
        current_f1 = f1_score(target, probability>cutoff)
        if not current_f1>top_f1:
            continue
        winner, top_f1 = cutoff, current_f1

    return winner

In [None]:
def model_pred_capability(model_name, y_train, pred_train, y_test, pred_test, pred_prob=None, plot_roc_curve=False):
    """Print and plot how well a model predicts the train and test data.

    Shows the classification report and the confusion matrix of both data
    sets and, on request, the ROC curve of the test set.

    Args:
        model_name: name used in the printed header and the ROC plot title.
        y_train: true labels of the train set.
        pred_train: predicted labels of the train set.
        y_test: true labels of the test set.
        pred_test: predicted labels of the test set.
        pred_prob: predicted scores of the test set; only read when
            ``plot_roc_curve`` is true.
        plot_roc_curve: whether to draw the ROC curve.
    """
    def show_confusion(actual, predicted, caption):
        # one annotated heatmap per data set
        plt.figure(figsize=(5,4))
        heat = sns.heatmap(confusion_matrix(actual, predicted), annot=True, fmt="d", cmap='Blues')
        heat.set_title(caption)
        plt.show()

    print('MODEL: ' + model_name + '\n')

    print('Classification report - train data')
    print(classification_report(y_train, pred_train))

    print('\n\nClassification report - test data')
    print(classification_report(y_test, pred_test))

    print('\n')
    show_confusion(y_train, pred_train, 'Confusion matrix - train data')

    print('\n')
    show_confusion(y_test, pred_test, 'Confusion matrix - test data')

    print('\n')

    if not plot_roc_curve:
        return

    false_pos_rate, true_pos_rate, _ = roc_curve(y_test, pred_prob)
    # dashed diagonal as the reference line
    plt.plot([0, 1], [0, 1], 'k--' )
    plt.plot(false_pos_rate, true_pos_rate)
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title(model_name + ' - ROC Curve',fontsize=14)
    plt.show()

In [None]:
def evaluate_model(model, X_train, X_test, y_train, y_test, model_name):
    """Predict on the train and test data and report the model's performance.

    Models whose ``model_name`` contains 'ANN' are expected to return scores
    from ``predict``: the scores are rounded to obtain labels and the raw
    test scores are used for the ROC curve. Every other model must provide
    ``predict_proba``; the second column is used as the positive-class score.

    Args:
        model: fitted model.
        X_train: train features.
        X_test: test features.
        y_train: true train labels.
        y_test: true test labels.
        model_name: name shown in the report; also selects the ANN handling.
    """
    if 'ANN' in model_name:
        train_scores = model.predict(X_train)
        # the test scores are computed once and reused for labels and ROC curve
        test_scores = model.predict(X_test)
        pred_train = np.round(train_scores).flatten()
        pred_test = np.round(test_scores).flatten()
        pred_prob = test_scores.flatten()
    else:
        pred_train = model.predict(X_train)
        pred_test = model.predict(X_test)
        pred_prob = model.predict_proba(X_test)[:,1]

    model_pred_capability(model_name, y_train, pred_train, y_test, pred_test, pred_prob, True)

In [None]:
def evaluate_best_threshold(model, X_train, X_test, y_train, y_test, best_threshold, model_name):
    """Report the model's performance at a given probability threshold.

    A sample is predicted positive when its positive-class probability is
    above ``best_threshold``. No ROC curve is drawn.

    Args:
        model: fitted model that provides ``predict_proba``.
        X_train: train features.
        X_test: test features.
        y_train: true train labels.
        y_test: true test labels.
        best_threshold: probability cut-off applied to both data sets.
        model_name: name shown in the report.
    """
    print('Best threshold: %.2f' %best_threshold)

    # apply the threshold that was passed in (a fixed 0.55 was used before,
    # so the report did not match the printed threshold)
    pred_train = model.predict_proba(X_train)[:,1]>best_threshold
    pred_test = model.predict_proba(X_test)[:,1]>best_threshold

    model_pred_capability(model_name, y_train, pred_train, y_test, pred_test)

### Hyperparameter tuning

In [None]:
# Run the grid search for each listed model on the training data
# (only the best score and parameters are printed)
models = ['LogisticRegression', 'SVC', 'XGBClassifier']
for model in models:
    print('Model: ' + model)
    grid_search_cv(model, X_train, y_train, False)
    print('')

### SVM

In [None]:
# Fit an SVC with fixed hyperparameters and evaluate it on train and test data
# NOTE(review): the hyperparameters are presumably taken from the grid search above - confirm
svc_model = SVC(probability=True, C=0.01, gamma='scale', kernel='poly', class_weight='balanced')
svc_model.fit(X_train,y_train)
evaluate_model(svc_model, X_train, X_test, y_train, y_test, 'SVC')

### Logistic Regression

In [None]:
# Fit a class-weighted logistic regression and evaluate it on train and test data
# NOTE(review): C=10 is presumably taken from the grid search above - confirm
log_model = LogisticRegression(C=10,class_weight='balanced')
log_model.fit(X_train, y_train)
evaluate_model(log_model, X_train, X_test, y_train, y_test, 'Logistic Regression')

**Procena uticaja feature-a**

In [None]:
# Logistic regression coefficients per feature, ordered by absolute value (largest first)
weights = pd.DataFrame({'Feature': X_train.columns.values, 'Coef': log_model.coef_[0], 'Abs Coef': np.abs(log_model.coef_[0])})
weights = weights.sort_values(by='Abs Coef', ascending=False).reset_index(drop=True)

# Horizontal bar chart of the signed coefficients
plt.figure(figsize=(8,8))
sns.barplot(data=weights, x='Coef', y='Feature')

In [None]:
# Table of the rounded coefficients and exp(coefficient) per feature
weights['Coeficient'] = weights['Coef'].round(3)
# exp(coef) is the factor by which the odds change per unit increase of the feature
weights['Odds'] = weights['Coef'].map(lambda x: np.exp(x)) 
weights['Odds'] = weights['Odds'].round(2)
# The helper columns are removed, so this cell cannot be re-run without rebuilding `weights`
weights.drop(['Abs Coef','Coef'], axis=1, inplace=True)
weights

### Logistic Regression + feature interaction

In [None]:
# Interaction terms: every numeric feature multiplied by every non-numeric (dummy) feature
cols = [name for name in X_train.columns if name not in col_num]

X_train_inter = X_train.copy()
X_test_inter = X_test.copy()
for cat_feature in cols:
    for num_feature in col_num:
        interaction = num_feature+'_'+cat_feature
        X_train_inter[interaction] = X_train_inter[num_feature]*X_train_inter[cat_feature]
        X_test_inter[interaction] = X_test_inter[num_feature]*X_test_inter[cat_feature]

# Logistic regression on the extended feature set
log_model_inter = LogisticRegression(solver='sag',C=10, max_iter=1000,class_weight='balanced')
log_model_inter.fit(X_train_inter, y_train)
evaluate_model(log_model_inter, X_train_inter, X_test_inter, y_train, y_test, 'Logistic Regression + feature interactions')

### Random Forest 

In [None]:
# Fit a class-weighted random forest with fixed hyperparameters and evaluate it on train and test data
rf_model = RandomForestClassifier(n_estimators=1000, max_depth=6, class_weight='balanced')
rf_model.fit(X_train, y_train)
evaluate_model(rf_model, X_train, X_test, y_train, y_test, 'Random Forest')

**SHAP analiza**

In [None]:
# SHAP values of the random forest on the training data
explainer = shap.TreeExplainer(rf_model)
shap_values = explainer.shap_values(X_train)

# Pick the contributions towards the positive class (index 1).
# Depending on the installed SHAP version the result is either a list with
# one array per class or a single array with the classes on the last axis;
# indexing a 3-D array with [1] would select a sample instead of a class.
if isinstance(shap_values, list):
    positive_class_shap = shap_values[1]
elif np.ndim(shap_values) == 3:
    positive_class_shap = shap_values[..., 1]
else:
    positive_class_shap = shap_values

shap.summary_plot(positive_class_shap, X_train)

### Extreme Gradient Boosting 

In [None]:
# Fit an XGBoost classifier with fixed hyperparameters and evaluate it on train and test data
# scale_pos_weight=4 gives the positive class a larger weight
xgb_model = XGBClassifier(use_label_encoder=False, learning_rate=0.01, n_estimators=1000, scale_pos_weight=4)
xgb_model.fit(X_train, y_train)
evaluate_model(xgb_model, X_train, X_test, y_train, y_test, 'XGB Classifier')

### Artificial Neural Network

In [None]:
# Small feed-forward network: one hidden layer of 5 sigmoid units and a sigmoid output unit
N, D = X_train.shape

ann_model = Sequential([
    Input(shape=(D,)),
    Dense(5, activation='sigmoid'),
    Dense(1, activation='sigmoid')
])

ann_model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
# the test set is passed as validation data, so the history holds its per-epoch loss and accuracy too
r = ann_model.fit(X_train, y_train, validation_data=(X_test, y_test), epochs=30, verbose=0)

# Training history: loss on the left panel, accuracy on the right panel
plt.figure(figsize=(10,4))
for slot, metric in enumerate(('loss', 'accuracy'), start=1):
    ax = plt.subplot(1, 2, slot)
    for series in (metric, 'val_' + metric):
        plt.plot(r.history[series], label=series)
    plt.legend()

evaluate_model(ann_model, X_train, X_test, y_train, y_test, 'ANN model')

# Oversampling

In [None]:
# Fresh stratified train/test split (same random_state as the earlier split)
X_train, X_test, y_train, y_test = train_test_split(data.drop(['Churn_Yes'], axis=1), data['Churn_Yes'], test_size = 0.20, random_state = 7, stratify=data['Churn_Yes'].values)

# Scale the numeric features to [0, 1]; the scaler is fitted on the train set only
min_max_scaler = MinMaxScaler(feature_range=(0,1))
X_train[col_num] = min_max_scaler.fit_transform(X_train[col_num])
X_test[col_num] = min_max_scaler.transform(X_test[col_num])

# Oversample the train set only; the test set is left untouched
sm = SMOTE(random_state = 7)  # alternative sampler: ADASYN()
X_train,y_train = sm.fit_resample(X_train,y_train)

**Random Forest Classifier**

In [None]:
# Random forest trained on the SMOTE-resampled train set (no class weighting here)
rf_model = RandomForestClassifier(n_estimators=1000, max_features='sqrt', max_depth=6)
rf_model.fit(X_train,y_train)
evaluate_model(rf_model, X_train, X_test, y_train, y_test, 'Random Forest + SMOTE')

**Artificial Neural Network**

In [None]:
# Feed-forward network trained on the SMOTE-resampled train set:
# one hidden layer of 8 sigmoid units and a sigmoid output unit
N, D = X_train.shape

ann_model = Sequential([
    Input(shape=(D,)),
    Dense(8, activation='sigmoid'),
    Dense(1, activation='sigmoid')
])

ann_model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
# the test set is passed as validation data for the per-epoch history
r = ann_model.fit(X_train, y_train, validation_data=(X_test, y_test), epochs=30, verbose=0)
evaluate_model(ann_model, X_train, X_test, y_train, y_test, 'ANN model + SMOTE')

# Undersampling

In [None]:
# Fresh stratified train/test split (same random_state as the earlier splits)
X_train, X_test, y_train, y_test = train_test_split(data.drop(['Churn_Yes'], axis=1), data['Churn_Yes'], test_size = 0.20, random_state = 7, stratify=data['Churn_Yes'].values)

# Scale the numeric features to [0, 1]; the scaler is fitted on the train set only
min_max_scaler = MinMaxScaler(feature_range=(0,1))
X_train[col_num] = min_max_scaler.fit_transform(X_train[col_num])
X_test[col_num] = min_max_scaler.transform(X_test[col_num])

# Undersample the train set only; the test set is left untouched
undersample = CondensedNearestNeighbour(n_neighbors=1)
X_train, y_train = undersample.fit_resample(X_train, y_train)

xgb_model = XGBClassifier(use_label_encoder=False, learning_rate=0.01, n_estimators=1000)
xgb_model.fit(X_train, y_train)
# The report label names the model that is actually fitted here (XGBoost);
# it used to say 'Random Forest'
evaluate_model(xgb_model, X_train, X_test, y_train, y_test, 'XGB Classifier + CondensedNearestNeighbour')

# Undersampling and Oversampling

In [None]:
# Fresh stratified train/test split (same random_state as the earlier splits)
X_train, X_test, y_train, y_test = train_test_split(data.drop(['Churn_Yes'], axis=1), data['Churn_Yes'], test_size = 0.20, random_state = 7, stratify=data['Churn_Yes'].values)
# Scale the numeric features to [0, 1]; the scaler is fitted on the train set only
min_max_scaler = MinMaxScaler(feature_range=(0,1))
X_train[col_num] = min_max_scaler.fit_transform(X_train[col_num])
X_test[col_num] = min_max_scaler.transform(X_test[col_num])

# Step 1: undersample the train set with Tomek links
undersample = TomekLinks()
X_train, y_train = undersample.fit_resample(X_train, y_train)

# Step 2: oversample the remaining train set with SMOTE
sm = SMOTE(random_state = 7)
X_train, y_train = sm.fit_resample(X_train, y_train)

# Random forest on the resampled train set
rf = RandomForestClassifier(n_estimators=1000, max_features='sqrt', max_depth=6)
rf.fit(X_train, y_train)
evaluate_model(rf, X_train, X_test, y_train, y_test, 'Random Forest + TomekLinks + SMOTE')

# XGBoost on the same resampled train set
xgb_model = XGBClassifier(use_label_encoder=False, learning_rate=0.01, n_estimators=1000)
xgb_model.fit(X_train, y_train)
evaluate_model(xgb_model, X_train, X_test, y_train, y_test, 'XGB Classifier + TomekLinks + SMOTE')