# Имеются данные с пропущенными значениями по нескольким признакам; нужно обработать их и с помощью кастомного классификатора сделать бинарное предсказание. Для проверки есть эти же данные без пропущенных значений и с таргетом.
# Более подробно: https://yadi.sk/d/eQOKbGoNIyXhPw

In [1]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression


def dummy_transform(data):
    """
    Expand the categorical columns of ``data`` into one-hot indicator columns.

    Source columns 1, 2 and 7 are one-hot encoded with ``pd.get_dummies`` and
    written into 3, 11 and 3 output columns respectively; source columns 0,
    3-6 and 8 are copied unchanged. Any further source column is ignored.

    The encoding is derived from the values present in ``data`` itself, so
    columns 1, 2 and 7 must contain exactly 3, 11 and 3 distinct values;
    otherwise the assignment into the fixed-width output fails.

    :param data: object exposing ``.values`` as a 2-D array (e.g. a DataFrame)
    :return: float array of shape (n_rows, 23)
    """
    raw = data.values
    encoded = np.zeros((raw.shape[0], 23))

    # (destination column(s), source column(s), one-hot encode?)
    layout = (
        (0, 0, False),
        (slice(1, 4), 1, True),
        (slice(4, 15), 2, True),
        (slice(15, 19), slice(3, 7), False),
        (slice(19, 22), 7, True),
        (slice(22, 23), slice(8, 9), False),
    )
    for dest, src, one_hot in layout:
        chunk = raw[:, src]
        encoded[:, dest] = pd.get_dummies(chunk).values if one_hot else chunk

    return encoded


class CustomClassifier(LogisticRegression):
    """
    Logistic regression whose weights are hard-coded instead of learned.

    The fitted-state attributes (``coef_``, ``intercept_``, ``classes_``) are
    assigned in ``__init__``, so ``predict`` is used without calling ``fit``.
    """

    def __init__(self):
        super().__init__()

        # 23 weights, grouped the way dummy_transform lays out its output
        preset_weights = [
            # column 0
            -0.21404096,
            # dummies of column 1
            -2.33748762, -2.70184235, -2.73066579,
            # dummies of column 2
            -1.54875568, -2.06471249, -1.00291385, -1.43804488,
            -1.63824906, -1.0017922, -0.48928441, -0.62414559,
            -0.27359805, 0.98648587, 1.32501456,
            # columns 3-6
            -0.80052908, -0.283991, 1.34641143, 0.02758078,
            # dummies of column 7
            -2.6099112, -1.63476157, -3.525323,
            # column 8
            0.60030625,
        ]
        self.coef_ = np.array([preset_weights])
        self.intercept_ = np.array([-7.76999576])
        self.classes_ = np.array([0, 1])

    def predict(self, X):
        """
        Predict class labels for ``X``.

        Input that already has 23 columns is passed to the parent as is; any
        other width is first expanded with ``dummy_transform``.

        :param X: 2-D input exposing ``.shape``
        :return: whatever ``LogisticRegression.predict`` returns for the features
        """
        assert len(X.shape) == 2, 'Invalid tensor shape'
        features = X if X.shape[1] == 23 else dummy_transform(X)
        return super().predict(features)

# Build the classifier; its coefficients are preset in __init__, so no fit() call follows.
clf = CustomClassifier()

In [2]:
# Complete data including the target column 'y'; used below to score predictions.
clean = pd.read_csv('/Users/Admin/Downloads/clean_data_with_target.csv', index_col=0)# test data
# Data with missing entries; this is what gets imputed.
# NOTE(review): both paths are absolute and machine-specific.
spoiled = pd.read_csv('/Users/Admin/Downloads/spoiled_data.csv', index_col=0)# train data with missing values

In [3]:
# Preview the first rows of the reference data.
clean.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,y
0,0.655969,2.0,6.0,-0.032834,-4.062045,12.064858,-4.632454,-1.0,4.0,1.0,0,1
1,10.199212,1.0,2.0,1.351358,-3.571762,9.892446,-1.005013,1.0,1.0,2.0,0,0
2,0.007215,1.0,10.0,-1.488445,-3.946479,10.513221,-8.979408,0.0,3.0,3.0,0,1
3,0.257431,1.0,9.0,-1.523154,-4.450234,11.798011,-6.217044,0.0,6.0,4.0,0,1
4,2.871536,2.0,4.0,-0.532473,-3.484453,12.500063,-3.503076,-1.0,15.0,5.0,0,1


In [4]:
from sklearn.metrics import accuracy_score

In [7]:
# Baseline: score the classifier on the complete data. The target 'y' and
# column '10' are dropped so that only the remaining columns reach the classifier.
prediction = clf.predict(clean.drop(['y', '10'], axis=1))
true_y = clean['y']
# accuracy_score is documented as accuracy_score(y_true, y_pred).
print(accuracy_score(true_y, prediction))

0.9234256926952141


In [8]:
# List the column labels of the data with missing values.
list(spoiled.columns)

['0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '10']

In [12]:
# Count the '-' placeholders (the missing-value marker in the raw data) per column.
# Comparing the string form of each value is valid for every column, so no column
# has to be special-cased (presumably the excluded ones were parsed as numeric).
for i in spoiled.columns:
    print(f"in {i} missed {(spoiled[i].astype(str) == '-').sum()}")

in 0 missed 199
in 1 missed 198
in 2 missed 0
in 3 missed 202
in 4 missed 190
in 5 missed 199
in 6 missed 0
in 7 missed 215
in 8 missed 0
in 9 missed 210
in 10 missed 0


In [13]:
# NOTE: plain assignment — `train` refers to the same DataFrame object as `spoiled`, not a copy.
train  = spoiled

In [14]:
def mean_value_imputer(data, add_binary=False):
    """
    Replace every missing entry (NaN) with the mean of the observed values
    of its column.

    The input is copied into a new array first, so ``data`` is left untouched.

    NOTE(review): ``_add_missing_binary`` is not defined in the visible part
    of this file; ``add_binary=True`` needs it to exist elsewhere.

    :param data: 2-D array-like; missing entries must be NaN
    :param add_binary: pass the filled array and the missing-mask to ``_add_missing_binary``
    :return: array without missing values
    """
    filled = np.array(data)
    # NaN is the only value that is not equal to itself
    missing = filled != filled

    for j in range(filled.shape[1]):
        col_missing = missing[:, j]
        observed = filled[~col_missing, j]
        filled[col_missing, j] = np.mean(observed)

    return _add_missing_binary(filled, missing) if add_binary else filled


In [15]:
# Preview the first rows of the raw training data.
train.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10
0,0.655968525404552,2.0,6.0,-0.0328336736464172,-4.062045264897834,12.064858107915615,-4.632454,-1.0,4.0,1.0,0
1,10.199211979664126,1.0,2.0,1.351357995650444,-3.5717621073746653,9.89244646012535,-1.005013,1.0,1.0,2.0,0
2,0.0072153716841383,1.0,10.0,-1.4884445239973123,-3.94647901087102,10.513220799618184,-8.979408,0.0,3.0,3.0,0
3,0.2574312093400834,1.0,9.0,-1.5231538101794566,-4.450233535454943,11.7980105575678,-6.217044,0.0,6.0,4.0,0
4,2.871536319644719,2.0,4.0,-0.5324727856834761,-3.4844525300735434,12.500063315341055,-3.503076,-1.0,15.0,5.0,0


In [16]:
# Inspect the first rows whose column '10' equals 8; the recorded output shows '-' placeholders in them.
train[train['10']==8].head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10
1585,-,-,5.0,0.17723993863393606,-,9.90018886549262,-3.803903,-1.0,0.0,1.0,8
1586,0.04667945495505864,-,10.0,-,-,-,-8.715818,0.0,0.0,2.0,8
1587,-,2.0,5.0,-1.1260640196914773,-3.67909491943822,-,-3.687618,-1.0,4.0,3.0,8
1588,1.0484621681558046,2.0,7.0,2.2135076634427557,-,-,-4.917433,-,0.0,4.0,8
1589,-,-,10.0,-1.5941691518602998,-,11.878856702666639,-8.857871,0.0,5.0,-,8


In [17]:
# Turn the '-' placeholders into NaN, then make every column float64.
train = train.replace('-', np.nan).astype(np.float64)

In [19]:
# Same rows after the conversion: the placeholders now show up as NaN in the recorded output.
train[train['10']==8].head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10
1585,,,5.0,0.17724,,9.900189,-3.803903,-1.0,0.0,1.0,8.0
1586,0.046679,,10.0,,,,-8.715818,0.0,0.0,2.0,8.0
1587,,2.0,5.0,-1.126064,-3.679095,,-3.687618,-1.0,4.0,3.0,8.0
1588,1.048462,2.0,7.0,2.213508,,,-4.917433,,0.0,4.0,8.0
1589,,,10.0,-1.594169,,11.878857,-8.857871,0.0,5.0,,8.0


In [20]:
# Impute every column with its mean; the result is a NumPy array, so the column labels are lost.
mean_replaced = mean_value_imputer(train)

In [21]:
# Wrap the imputed array in a DataFrame again (columns get default integer labels).
mean_replaced = pd.DataFrame(mean_replaced)

In [23]:
# Restore the string column labels that the label-based code below relies on.
mean_replaced.columns = ['0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '10']

In [24]:
# Inspect the imputed rows whose column '10' equals 8.
mean_replaced[mean_replaced['10']==8].head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10
1585,5.677672,1.470621,5.0,0.17724,-0.175149,9.900189,-3.803903,-1.0,0.0,1.0,8.0
1586,0.046679,1.470621,10.0,0.414441,-0.175149,10.060195,-8.715818,0.0,0.0,2.0,8.0
1587,5.677672,2.0,5.0,-1.126064,-3.679095,10.060195,-3.687618,-1.0,4.0,3.0,8.0
1588,1.048462,2.0,7.0,2.213508,-0.175149,10.060195,-4.917433,-0.026554,0.0,4.0,8.0
1589,5.677672,1.470621,10.0,-1.594169,-0.175149,11.878857,-8.857871,0.0,5.0,115.71493,8.0


In [25]:
# Cast the integer-valued columns back to int after imputation. The builtin
# `int` is used because the `np.int` alias has been removed from NumPy.
# NOTE: the cast truncates toward zero, so an imputed mean such as 1.47 becomes 1.
for i in ['1', '2', '7', '8', '9', '10']:
    mean_replaced[i] = mean_replaced[i].astype(int)


In [42]:
# Score the mean-imputed rows whose column '10' equals 8 against the matching
# rows of the reference data; column '10' itself is not passed to the classifier.
prediction = clf.predict(mean_replaced[mean_replaced['10'] == 8].drop('10', axis=1))
true_y = clean[clean['10'] == 8]['y']
# accuracy_score is documented as accuracy_score(y_true, y_pred).
print(f'accuracy score: {accuracy_score(true_y, prediction)}')

accuracy score: 0.845


In [43]:
# Rescale the accuracy with (accuracy - 0.6) / 0.25.
# NOTE(review): the meaning of 0.6 and 0.25 is not stated in this file — presumably the task's scoring scale; confirm.
(accuracy_score(true_y, prediction) - 0.6) / 0.25

0.98

In [35]:
# Initial fill: column mean first, then snap to the nearest observed value
def _first_imputer(data, mask):
    """
    Fill the masked cells of ``data`` in place with the mean of the unmasked
    cells of the same column, then hand the result to ``_round_nearest``.

    :param data: 2-D array, modified in place
    :param mask: boolean array of the same shape, True where a value is missing
    :return: whatever ``_round_nearest(data, mask)`` returns
    """
    for j in range(data.shape[1]):
        missing_rows = mask[:, j]
        data[missing_rows, j] = np.mean(data[~missing_rows, j])

    return _round_nearest(data, mask)
# Snap filled-in cells onto values that were actually observed
def _round_nearest(data, mask):
    """
    Replace every masked cell, in place, with the closest value found among
    the unmasked cells of the same column.

    :param data: 2-D array, modified in place
    :param mask: boolean array of the same shape, True where a value was missing
    :return: the same ``data`` array
    """
    for j in range(data.shape[1]):
        missing = mask[:, j]
        observed_values = np.unique(data[~missing, j])
        (missing_rows,) = np.nonzero(missing)
        for i in missing_rows:
            data[i, j] = _find_nearest(observed_values, data[i, j])

    return data

def _find_nearest(array, value):
    """Return the element of ``array`` closest to ``value`` (the first one on ties)."""
    distances = np.abs(array - value)
    return array[distances.argmin()]

def svd_imputer(data, rank=None, max_iter=10, tol=1e-1, round_nearest=True, add_binary=False):
    """
    Fill missing values (NaN) in a dataset by iterated SVD reconstruction.

    The missing cells are first filled by ``_first_imputer``. Then, up to
    ``max_iter`` times, the matrix is decomposed with SVD, optionally truncated
    to ``rank`` singular values, and only the originally missing cells are
    overwritten with the reconstruction.

    Notes:
        * With ``rank=None`` (or ``0``, which the truthiness test treats the
          same way) no singular value is zeroed, so the reconstruction equals
          the current matrix up to rounding error and the initial fill stays
          essentially unchanged. Pass a rank below ``min(data.shape)`` to get
          an actual low-rank imputation.
        * NOTE(review): ``_add_missing_binary`` is not defined in the visible
          part of this file; ``add_binary=True`` needs it to exist elsewhere.

    :param data: 2-D array-like; missing entries must be NaN
    :param rank: number of singular values to keep (a falsy value keeps all)
    :param max_iter: maximum number of SVD iterations
    :param tol: stop once the mean relative change of the imputed cells is below this value
    :param round_nearest: snap imputed cells to the nearest observed value of their column
    :param add_binary: pass the result and the missing-mask to ``_add_missing_binary``
    :return: a new array without missing values (the input is not modified)
    """

    # https://web.stanford.edu/~hastie/Papers/missing.pdf

    X = np.array(data)
    # NaN is the only value that is not equal to itself
    mask = X != X

    # initial fill: column mean snapped to the nearest observed value
    X = _first_imputer(X, mask)

    # Without missing cells there is nothing to refine, and the convergence
    # check below would divide by zero, so the loop is skipped entirely.
    n_missing = mask.sum()
    if n_missing:
        # iteratively refine the imputed cells with the SVD reconstruction
        for _ in range(max_iter):
            U, s, Vt = np.linalg.svd(X, full_matrices=False)

            if rank:
                s[rank:] = 0

            new_X = U.dot(np.diag(s).dot(Vt))

            # mean relative change of the imputed cells; 1e-10 guards against division by zero
            change = np.abs((new_X[mask] - X[mask]) / (X[mask] + 1e-10)).sum() / n_missing
            if change < tol:
                break
            X[mask] = new_X[mask]

    if round_nearest:
        X = _round_nearest(X, mask)

    if add_binary:
        X = _add_missing_binary(X, mask)

    return X

In [36]:
# Impute with the SVD-based imputer using its default arguments (rank=None, max_iter=10, tol=1e-1).
train_svd = svd_imputer(train)

In [37]:
# Wrap the imputed array in a DataFrame again (columns get default integer labels).
train_svd = pd.DataFrame(train_svd)

In [38]:
# Restore the string column labels that the label-based code below relies on.
train_svd.columns = ['0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '10']

In [39]:
# Cast the integer-valued columns back to int after imputation. The builtin
# `int` is used because the `np.int` alias has been removed from NumPy.
# NOTE: the cast truncates toward zero rather than rounding.
for i in ['1', '2', '7', '8', '9', '10']:
    train_svd[i] = train_svd[i].astype(int)

In [41]:
# Score the SVD-imputed rows whose column '10' equals 8 against the matching
# rows of the reference data; column '10' itself is not passed to the classifier.
prediction = clf.predict(train_svd[train_svd['10'] == 8].drop('10', axis=1))
true_y = clean[clean['10'] == 8]['y']
# accuracy_score is documented as accuracy_score(y_true, y_pred).
print(f'accuracy score: {accuracy_score(true_y, prediction)}')

accuracy score: 0.8425


In [44]:
# Rescale the accuracy with (accuracy - 0.6) / 0.25.
# NOTE(review): the meaning of 0.6 and 0.25 is not stated in this file — presumably the task's scoring scale; confirm.
# NOTE(review): the recorded output 0.98 does not match the 0.8425 accuracy printed
# above ((0.8425 - 0.6) / 0.25 = 0.97). The execution counters suggest this cell ran
# after the mean-imputation scoring cell, so `prediction` probably held that result.
(accuracy_score(true_y, prediction) - 0.6) / 0.25

0.98