# Подготовка и изучение датасета

In [None]:
import warnings
# Suppress every warning for the rest of the session (no category/module filter is given).
warnings.filterwarnings("ignore")

### Загружаем библиотеки 

In [None]:
from pandas import read_csv , DataFrame 
import pandas as pd 
from sklearn.model_selection import train_test_split
import matplotlib.patches as mpatches
import matplotlib.pyplot as plt
from sklearn.metrics import precision_score , recall_score , f1_score
from sklearn.naive_bayes import BernoulliNB
import numpy as np
import scipy.spatial
from math import pi,exp
from collections import Counter
from sklearn.neighbors import KNeighborsClassifier
from sklearn.base import BaseEstimator,ClassifierMixin
from sklearn.linear_model import LogisticRegression
from sklearn import svm
from sklearn.metrics import confusion_matrix, recall_score, accuracy_score, precision_score,roc_auc_score
from sklearn.model_selection import GridSearchCV
import pickle 

# Логистическая регрессия

In [None]:
class LogReg(BaseEstimator, ClassifierMixin):
    """Binary logistic regression trained with full-batch gradient descent.

    Parameters
    ----------
    num_iter : int, default=100000
        Number of gradient steps performed by ``fit``.
    learning_rate : float, default=0.1
        Step size applied to every gradient update.

    Attributes
    ----------
    beta : numpy.ndarray
        Weight vector (one weight per input column) learned by ``fit``.
        No intercept term is added by this class.
    """

    def __init__(self, num_iter=100000, learning_rate=0.1):
        self.num_iter = num_iter
        self.learning_rate = learning_rate
        self.beta = 1

    def fit(self, x, y):
        """Learn ``beta`` from features ``x`` and 0/1 labels ``y``.

        Returns
        -------
        self
            The fitted estimator, so calls can be chained.
        """
        # Work on plain float arrays so DataFrames, Series and lists behave
        # identically and the caller's data is never touched.
        x = np.asarray(x, dtype=float)
        y = np.asarray(y, dtype=float).ravel()

        self.beta = np.ones(x.shape[1])
        for _ in range(self.num_iter):
            h = self._sigmoid(x, self.beta)
            gradient = self._gradient_descent(x, h, y)
            self.beta = self._weight_update(self.beta, self.learning_rate, gradient)
        return self

    def _sigmoid(self, X, weight):
        """Return the logistic function of the linear scores ``X @ weight``."""
        z = np.dot(X, weight)
        return 1 / (1 + np.exp(-z))

    def _gradient_descent(self, X, H, Y):
        """Return the mean gradient of the log-loss for predictions ``H``."""
        return np.dot(X.T, (H - Y)) / Y.shape[0]

    def _weight_update(self, weight, learning_rate, gradient):
        """Return the weights after one gradient step."""
        return weight - learning_rate * gradient

    def predict(self, test):
        """Return a list of 0/1 labels, thresholding the probability at 0.5."""
        probabilities = self._sigmoid(np.asarray(test, dtype=float), self.beta)
        # Same rule as ``_onepred`` (strictly below 0.5 -> 0), vectorised.
        return np.where(probabilities < 0.5, 0, 1).tolist()

    def _onepred(self, x):
        """Map a single probability to a class label (kept for compatibility)."""
        return 0 if x < 0.5 else 1

# SVM

In [None]:
class MYSVM(BaseEstimator, ClassifierMixin):
    """Linear soft-margin SVM trained with per-sample sub-gradient steps.

    Parameters
    ----------
    etha : float, default=0.1
        Learning rate.
    alpha : float, default=0.2
        Regularisation strength (divided by ``epochs`` in every update).
    epochs : int, default=990
        Number of passes over the training set.

    Attributes
    ----------
    w : numpy.ndarray or None
        Learned weights; the last component is the bias, matching the
        constant column appended by ``_add_bias_feature``.
    """

    def __init__(self, etha=0.1, alpha=0.2, epochs=990):
        self.epochs = epochs
        self.etha = etha
        self.alpha = alpha
        self.w = None

    def fit(self, X_train, Y_train):
        """Fit the weights on features ``X_train`` and 0/1 labels ``Y_train``.

        The labels are remapped to -1/+1 on a private copy, so the caller's
        ``Y_train`` is left untouched.

        Returns
        -------
        self
            The fitted estimator.
        """
        samples = self._add_bias_feature(X_train)
        targets = np.asarray(Y_train).ravel()
        targets = np.where(targets == 0, -1, targets)

        # Small random initial weights.
        self.w = np.random.normal(loc=0, scale=0.05, size=samples.shape[1])

        for _ in range(self.epochs):
            for sample, target in zip(samples, targets):
                margin = target * np.dot(self.w, sample)
                if margin >= 1:
                    # Correct side of the margin: only the regulariser acts.
                    self.w = self.w - self.etha * self.alpha * self.w / self.epochs
                else:
                    # Inside the margin or misclassified: hinge-loss step.
                    self.w = self.w + self.etha * (
                        target * sample - self.alpha * self.w / self.epochs
                    )
        return self

    def _add_bias_feature(self, a):
        """Return ``a`` as a float array with a trailing column of ones."""
        a = np.asarray(a, dtype=float)
        a_extended = np.ones((a.shape[0], a.shape[1] + 1))
        a_extended[:, :-1] = a
        return a_extended

    def predict(self, X):
        """Return a list of 0/1 labels for the rows of ``X``.

        A row is labelled 1 when its decision value ``x @ w[:-1] + w[-1]`` is
        strictly positive, and 0 otherwise.
        """
        X = np.asarray(X, dtype=float)
        # The bias is the LAST weight because the constant column is appended last.
        scores = X @ self.w[:-1] + self.w[-1]
        return np.where(scores > 0, 1, 0).tolist()
    

### Алгоритм k ближайших соседей 

In [None]:
class KNN(BaseEstimator, ClassifierMixin):
    """k-nearest-neighbours classifier with Gaussian-kernel weighted votes.

    Parameters
    ----------
    k : int, default=3
        Number of nearest neighbours that vote.
    h : float, default=1
        Kernel bandwidth; distances are divided by ``h`` before weighting.
    """

    def __init__(self, k=3, h=1):
        self.h = h
        self.k = k

    def fit(self, X, y):
        """Memorise the training set and return ``self``."""
        self.X_train = pd.DataFrame(X)
        self.y_train = y
        return self

    def _jadro_K(self, z):
        """Gaussian kernel ``exp(-z**2 / 2) / sqrt(2*pi)``; accepts scalars or arrays."""
        return ((2 * np.pi) ** (-0.5)) * np.exp(-0.5 * np.square(z))

    def predict(self, X_test):
        """Return a list with one predicted label per row of ``X_test``.

        The ``k`` nearest training rows (Euclidean distance) vote for label 0
        or 1 with kernel weights. If the weighted scores tie (for example when
        all weights underflow to zero for a small ``h``), the label of the
        single nearest neighbour is used.
        """
        train = self.X_train.to_numpy(dtype=float)
        labels = np.asarray(self.y_train).ravel()
        queries = pd.DataFrame(X_test).to_numpy(dtype=float)

        output = []
        for query in queries:
            distances = np.linalg.norm(train - query, axis=1)
            # A stable sort keeps the lower training index first on equal distances.
            nearest = np.argsort(distances, kind="stable")[: self.k]
            weights = self._jadro_K(distances[nearest] / self.h)
            votes = labels[nearest]

            zero_score = weights[votes == 0].sum()
            one_score = weights[votes == 1].sum()
            if zero_score > one_score:
                output.append(0)
            elif one_score > zero_score:
                output.append(1)
            else:
                output.append(votes[0])
        return output
    

### Наивный байесовский классификатор

In [None]:
class NaivBaisClassificator(BaseEstimator, ClassifierMixin):
    """Categorical naive Bayes classifier for 0/1 labels.

    For every feature, ``fit`` stores a table of per-class relative
    frequencies of each observed value. ``predict`` compares the products of
    those likelihoods; class priors are not included in the comparison.

    Attributes
    ----------
    out : dict
        Maps each feature column name to its likelihood table, a DataFrame
        indexed by feature value with columns ``"<feature>_1"`` and
        ``"<feature>_0"``.
    """

    def __init__(self):
        self.out = {}

    def _make_likelihood_Table(self, labels_and_serie):
        """Build the likelihood table for one feature.

        Parameters
        ----------
        labels_and_serie : pandas.DataFrame
            Two columns, read by position: labels (0/1) first, the feature
            second.

        Returns
        -------
        pandas.DataFrame
            Indexed by feature value, with columns ``"<feature>_1"`` and
            ``"<feature>_0"``. A value seen in only one class gets
            ``1 / class_size`` in the other so no likelihood is zero.
        """
        feature_name = labels_and_serie.columns[1]
        labels = labels_and_serie.iloc[:, 0]
        feature = labels_and_serie.iloc[:, 1]

        positive = feature[labels == 1]
        negative = feature[labels == 0]
        pos_freq = positive.value_counts() / max(len(positive), 1)
        neg_freq = negative.value_counts() / max(len(negative), 1)

        seen = pos_freq.index.union(neg_freq.index)
        # Name the columns from the feature itself: the name of a
        # value_counts() result depends on the pandas version.
        return DataFrame({
            f"{feature_name}_1": pos_freq.reindex(seen).fillna(1 / max(len(positive), 1)),
            f"{feature_name}_0": neg_freq.reindex(seen).fillna(1 / max(len(negative), 1)),
        })

    def fit(self, train, test):
        """Estimate the likelihood tables.

        Parameters
        ----------
        train : array-like of shape (n_samples, n_features)
            Training features.
        test : array-like of shape (n_samples,)
            Training labels (0/1). The parameter name is kept for
            backward compatibility.

        Returns
        -------
        self
            The fitted estimator.
        """
        features = pd.DataFrame(train).reset_index(drop=True)
        labels = pd.Series(np.asarray(test).ravel(), name="label")

        self.out = {
            column: self._make_likelihood_Table(
                pd.concat([labels, features[column]], axis=1)
            )
            for column in features.columns
        }
        return self

    def _onepredict(self, test):
        """Classify one row (a Series indexed by feature name)."""
        log_zero = 0.0
        log_one = 0.0
        for feature, value in test.items():
            try:
                table = self.out[feature]
                p_one = float(table.at[value, f"{feature}_1"])
                p_zero = float(table.at[value, f"{feature}_0"])
            except KeyError:
                # Unknown feature or a value never seen in training:
                # the feature contributes nothing to either class.
                continue
            # Sum logs instead of multiplying probabilities to avoid underflow.
            log_one += np.log(p_one)
            log_zero += np.log(p_zero)
        return 0 if log_zero > log_one else 1

    def predict(self, test_dataset):
        """Return a list of 0/1 labels, one per row of ``test_dataset``."""
        rows = pd.DataFrame(test_dataset)
        return [self._onepredict(row) for _, row in rows.iterrows()]
    

# Pipeline

In [None]:
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder, OrdinalEncoder
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder


# No per-column transformers are registered (the input file is presumably
# already preprocessed -- confirm). ColumnTransformer drops every unlisted
# column by default, which would hand the classifiers a matrix with zero
# features, so the remaining columns are passed through unchanged.
trans_pipeline = ColumnTransformer([], remainder="passthrough")

In [None]:
# Load the training table; the first CSV column becomes the DataFrame index.
dataset = read_csv('data/transformed_train.csv', index_col=0)

In [None]:
def metrics(pred, true, name):
    """Print the confusion matrix, accuracy, recall and precision of a model.

    Parameters
    ----------
    pred : array-like
        Predicted labels.
    true : array-like
        Ground-truth labels.
    name : str
        Model name shown in the report header.
    """
    # scikit-learn metrics take (y_true, y_pred) in that order; swapping them
    # transposes the confusion matrix and exchanges recall with precision.
    print(f'metrics for {name}\n\n\n')
    print('confusion_matrix = \n', confusion_matrix(true, pred), '\n\n\n')
    print('accuracy_score = ', accuracy_score(true, pred))
    print('recall_score = ', recall_score(true, pred))

    try:
        print('precision_score=', precision_score(true, pred))
    except ValueError as err:
        # Report the problem instead of repeating the call that just failed.
        print('precision_score= not available:', err)

In [None]:
import os.path

def work_with_model(model, params, name):
    """Grid-search a classifier inside the shared pipeline and store the result.

    Parameters
    ----------
    model : type
        Estimator class; it is instantiated with no arguments.
    params : dict
        ``param_grid`` for ``GridSearchCV`` (keys prefixed with ``clf__``
        address the classifier step).
    name : str
        Prefix of the files written to the ``params`` directory.

    Side effects
    ------------
    Appends the best estimator's repr to ``params/<name>_best_params.txt``,
    pickles the best estimator to ``params/<name>_best_model.pkl`` and prints
    metrics computed on the same data the search was fitted on.
    """
    features = dataset.drop(['churn'], axis=1)
    target = dataset['churn']

    pipeline = Pipeline([('transformer', trans_pipeline), ('clf', model())])
    grid = GridSearchCV(estimator=pipeline, param_grid=params, cv=5)
    grid.fit(features, target)

    # Create the output directory on first use instead of failing in open().
    os.makedirs("params", exist_ok=True)

    with open(os.path.join("params", name + '_best_params.txt'), 'a', encoding='utf-8') as f:
        # The file is opened in append mode: end each report with a newline
        # so successive runs stay separated.
        f.write(str(grid.best_estimator_) + '\n')

    pkl_filename = name + '_best_model.pkl'
    with open(os.path.join("params", pkl_filename), 'wb') as file:
        pickle.dump(grid.best_estimator_, file)

    metrics(grid.predict(features), target, name)

### Логистическая регрессия 

In [None]:
# Custom logistic regression: tune only the number of gradient-descent iterations.
parameters = {
    "clf__num_iter": [100, 500, 1000, 2000, 3000, 5000, 10000],
}

work_with_model(LogReg, parameters, "custom_logreg")

In [None]:
# scikit-learn logistic regression: tune the solver's iteration cap.
parameters = {
    "clf__max_iter": [100, 500, 1000, 2000, 3000, 5000, 10000],
}

work_with_model(LogisticRegression, parameters, "sklearn_logreg")

# SVM

In [None]:
# Custom SVM: grid over learning rate, regularisation strength and epoch count.
parameters = {
    "clf__etha": [0.1, 0.2, 0.3],
    "clf__alpha": [0.1, 0.2, 0.3],
    "clf__epochs": [100, 500, 1000],
}

work_with_model(MYSVM, parameters, "custom_svm")

In [None]:
# scikit-learn SVC: grid over C, gamma and the iteration cap.
parameters = {
    "clf__C": [0.1, 0.2, 0.3],
    "clf__gamma": [0.1, 0.2, 0.3],
    "clf__max_iter": [100, 500, 1000],
}

work_with_model(svm.SVC, parameters, "sklearn_svm")

# Алгоритм k ближайших соседей

In [None]:
# Custom KNN: grid over the neighbour count and the kernel bandwidth.
parameters = {
    "clf__k": [2, 3, 4],
    "clf__h": [1, 0.1, 0.01],
}

work_with_model(KNN, parameters, "custom_KNN")

In [None]:
# scikit-learn KNN: grid over the neighbour count and the vote weighting scheme.
parameters = {
    "clf__n_neighbors": [2, 3, 4],
    "clf__weights": ["uniform", "distance"],
}

work_with_model(KNeighborsClassifier, parameters, "sklearn_KNN")

# Наивный байесовский классификатор

In [None]:
# Custom naive Bayes has no hyper-parameters: the empty grid fits one candidate.
parameters = dict()

work_with_model(NaivBaisClassificator, parameters, "custom_NaivBaisClassificator")

In [None]:
# scikit-learn Bernoulli naive Bayes is run with its defaults (empty grid).
parameters = dict()

work_with_model(BernoulliNB, parameters, "sklearn_NaivBaisClassificator")