In [300]:
import math

import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import mutual_info_classif
from sklearn.datasets import make_classification


df = pd.read_csv('WA_Fn-UseC_-Telco-Customer-Churn.csv')


In [294]:

# df.astype(object).eq(' ').sum()

In [295]:

df['TotalCharges'].replace({" ": np.nan}, inplace=True)
df['TotalCharges'] = df['TotalCharges'].astype(np.float64)
df['TotalCharges'].fillna((df['TotalCharges'].mean()), inplace=True)

In [296]:
enc_list = list([
 'gender',
 'Partner',
 'Dependents',
 'PhoneService',
 'MultipleLines',
 'InternetService',
 'OnlineSecurity',
 'OnlineBackup',
 'DeviceProtection',
 'TechSupport',
 'StreamingTV',
 'StreamingMovies',
 'Contract',
 'PaperlessBilling',
 'PaymentMethod',
 'Churn'
])

df = pd.get_dummies(df, columns=enc_list, drop_first=True)

df.insert(0, 'bias', 1.0)



In [297]:
df = df.select_dtypes([np.number])

ss = StandardScaler()
df.iloc[:,0:-1] = ss.fit_transform(df.iloc[:,0:-1])

dataset = df.to_numpy()
X = dataset[:, :-1]
Y = dataset[:, -1]

x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size = 0.2,random_state=10)

y_train = np.array([1 if it > 0 else -1 for it in y_train])
y_test = np.array([1 if it > 0 else -1 for it in y_test])


In [None]:

feature_columns = list(df.columns[:-1])
features = pd.DataFrame(df, columns=feature_columns)

coeff_df = pd.DataFrame(mutual_info_classif(df[:,:-1], df[:,-1]).reshape(-1, 1),
                         columns=['Coefficient'], index=feature_columns)


In [298]:
def g(x):
  return np.tanh(x)

def loss(y, y_p):
    cost = 0.5 * np.sum((y-y_p)**2) / y.shape[0]
    return cost

def logistic_regression(x, y, k, threshold):
    x_sampled = x[:,0:k]
    w = np.random.rand(k)
    alpha = 0.0001
    itr = 0
    while True:
        itr += 1
        y_p = g(np.matmul(x_sampled, w))
        cost = loss(y, y_p)

        if cost <= threshold:
            break

        w = w + (alpha * np.matmul(x_sampled.T, ((y - y_p) * (1 - y_p**2))))

    # print(itr)
    return w


def train_logistic():
    k = 31
    w_ret = logistic_regression(x_train, y_train, k, 0.50)
    x_test_sub = x_test[:,0:k]
    y_pred = g(np.matmul(x_test_sub, w_ret))
    y_pred = [1 if it > 0 else -1 for it in y_pred]
    accuracy = np.sum((y_test == y_pred))/y_test.shape[0]
    return accuracy

# train_logistic()



In [299]:
def ada_boost(x_examples, y_examples, k_val, feature_sz):
    n = x_examples.shape[0]
    w = np.empty(n)
    w.fill(1.0/n)
    h = []
    z = []

    x_sliced = x_examples[:,0:feature_sz]

    for k in range(0, k_val):
        index = np.random.choice(x_examples.shape[0], n, replace=True, p = w)
        x_data = x_examples[index]
        y_data = y_examples[index]

        x_data = x_data[:,0:feature_sz]

        w_lr = logistic_regression(x_data, y_data, feature_sz, 0.5)

        y_pred = g(np.matmul(x_sliced, w_lr))
        y_pred = np.array([1 if it > 0 else -1 for it in y_pred])

        error = 0.0

        for j in range(0, n):
            if y_pred[j] != y_examples[j]:
                error += w[j]

        # print(error)

        if error > 0.5:
            continue

        for j in range(0, n):
            if y_pred[j] == y_examples[j]:
                w[j] = w[j]*(error/(1-error))


        w = w/sum(w)

        h.append(w_lr)
        z.append(math.log2((1-error)/error))

    return np.array(h), np.array(z)



def train_ada():
    feature_sz = 31
    k_val = 5
    h, w = ada_boost(x_test,y_test, k_val, feature_sz)

    # print(w)

    x_data = x_test[:,0:feature_sz]

    # print(x_data.shape)
    # print(y_test.shape)
    # print(h.shape)
    # print(w.shape)

    y_pred = np.matmul(np.matmul(x_data, h.T), w)
    y_pred = np.array([1 if it > 0 else -1 for it in y_pred])

    accuracy = np.sum((y_test == y_pred))/y_test.shape[0]
    return accuracy



train_ada()


[0.7342358  1.04732687 0.14961193 0.33115628 0.22808632]


0.6969481902058198