#### Задача в этом соревновании - предсказать вероятность того, подойдет ли репетитор для подготовки к экзамену по математике. Даны два датасета: train.csv (содержит признаки и целевую переменную) и test.csv (только признаки).

#### Метрика для оценки – ROC AUC

In [78]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import seaborn as sns
import random

#### 1. Изучение датасета

In [2]:
TRAIN_PATH = 'train.csv'
TEST_PATH = 'test.csv'
data = pd.read_csv(TRAIN_PATH)
test = pd.read_csv(TEST_PATH)
data.head(10)

Unnamed: 0,Id,age,years_of_experience,lesson_price,qualification,physics,chemistry,biology,english,geography,history,mean_exam_points,choose
0,0,35.0,0.0,2150.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,74.0,0
1,1,52.0,2.0,1250.0,2.0,1.0,0.0,1.0,0.0,0.0,1.0,57.0,1
2,2,29.0,3.0,1750.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,66.0,0
3,3,33.0,3.0,1050.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,66.0,1
4,4,46.0,3.0,2250.0,2.0,1.0,0.0,0.0,0.0,0.0,0.0,73.0,0
5,5,37.0,3.0,1050.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,54.0,0
6,6,54.0,3.0,800.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,52.0,1
7,7,32.0,2.0,2750.0,3.0,1.0,0.0,1.0,0.0,0.0,0.0,95.0,0
8,8,56.0,3.0,1300.0,2.0,1.0,0.0,0.0,0.0,0.0,0.0,58.0,0
9,9,44.0,4.0,2350.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,71.0,0


**Описание датасета**

#### Данные о репетиторах:

* **Id** - идентификатор репетитора
* **age** - возраст
* **years_of_experience** - стаж
* **lesson_price** - стоимость занятия
* **qualification** - квалификация

**преподаваемые предметы:**
* **physics** - физика
* **chemistry** - химия
* **biology** - биология
* **english** - английский язык
* **geography** - география
* **history** - история  

* **mean_exam_points** - средняя оценка за экзамен
* **choose** - пригодность


In [3]:
TARGET_NAME = 'choose'
FEATURE_NAMES = ['age', 'years_of_experience', 'lesson_price', 'qualification',
        'physics', 'chemistry', 'biology', 'english', 'geography', 'history', 'mean_exam_points']

#### типы данных в датасете:

In [4]:
data.dtypes

Id                       int64
age                    float64
years_of_experience    float64
lesson_price           float64
qualification          float64
physics                float64
chemistry              float64
biology                float64
english                float64
geography              float64
history                float64
mean_exam_points       float64
choose                   int64
dtype: object

In [5]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 13 columns):
Id                     10000 non-null int64
age                    10000 non-null float64
years_of_experience    10000 non-null float64
lesson_price           10000 non-null float64
qualification          10000 non-null float64
physics                10000 non-null float64
chemistry              10000 non-null float64
biology                10000 non-null float64
english                10000 non-null float64
geography              10000 non-null float64
history                10000 non-null float64
mean_exam_points       10000 non-null float64
choose                 10000 non-null int64
dtypes: float64(11), int64(2)
memory usage: 1015.7 KB


In [6]:
data.describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Id,10000.0,4999.5,2886.89568,0.0,2499.75,4999.5,7499.25,9999.0
age,10000.0,45.8009,8.030274,23.0,40.0,46.0,51.0,68.0
years_of_experience,10000.0,1.9748,1.766883,0.0,0.0,2.0,3.0,9.0
lesson_price,10000.0,1702.44,523.789062,200.0,1300.0,1550.0,2150.0,3950.0
qualification,10000.0,1.7243,0.798845,1.0,1.0,2.0,2.0,4.0
physics,10000.0,0.3706,0.48299,0.0,0.0,0.0,1.0,1.0
chemistry,10000.0,0.1215,0.326724,0.0,0.0,0.0,0.0,1.0
biology,10000.0,0.1172,0.321675,0.0,0.0,0.0,0.0,1.0
english,10000.0,0.0591,0.235824,0.0,0.0,0.0,0.0,1.0
geography,10000.0,0.0277,0.16412,0.0,0.0,0.0,0.0,1.0


Из вышеприведенной информации видим, что тренировочный датасет состоит из 10000 строк, пропусков данных нет, минимальные и максимальные значения попадают в допустимые пределы (нет отрицательных или NaN значений).

Для сравнения посмотрим статистику тестовых данных:

In [7]:
test.describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Id,10000.0,14999.5,2886.89568,10000.0,12499.75,14999.5,17499.25,19999.0
age,10000.0,45.9245,8.031977,23.0,41.0,46.0,51.0,68.0
years_of_experience,10000.0,1.9857,1.771217,0.0,0.0,2.0,3.0,9.0
lesson_price,10000.0,1699.91,526.260094,300.0,1300.0,1550.0,2150.0,3950.0
qualification,10000.0,1.7023,0.789644,1.0,1.0,1.5,2.0,4.0
physics,10000.0,0.3721,0.483389,0.0,0.0,0.0,1.0,1.0
chemistry,10000.0,0.1281,0.334218,0.0,0.0,0.0,0.0,1.0
biology,10000.0,0.1158,0.320001,0.0,0.0,0.0,0.0,1.0
english,10000.0,0.049,0.215879,0.0,0.0,0.0,0.0,1.0
geography,10000.0,0.0292,0.168375,0.0,0.0,0.0,0.0,1.0


Посмотрим, какие именно признаки оказывают наибольшее влияние на выбор репетитора:

In [8]:
data.corr()['choose'].sort_values()

lesson_price          -0.134013
history               -0.004700
geography              0.006366
Id                     0.012043
age                    0.017165
english                0.022227
years_of_experience    0.029010
qualification          0.042160
biology                0.071310
chemistry              0.091878
mean_exam_points       0.109409
physics                0.195183
choose                 1.000000
Name: choose, dtype: float64

Видно, что на выбор репетитора наибольшее влияние оказывают преподавание естественных наук (физика, биология, химия), средний балл на экзамене и стоимость занятия.

#### 2. Выбор и реализация методов

Задача по определению пригодности репетитора - классическая задача классификации. Для её решения могут использоваться следующие методы:
* **линейная регрессия**
* **метод опорных векторов**
* **классификация KNN**
* **дерево решений**
* **ансамбль решающих деревьев**
    

Реализуем некоторые из них.

In [61]:
X = data.drop(['Id','choose'], axis=1)
y = data['choose']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# X_train.shape, X_test.shape
# standart_columns = ['age','years_of_experience','lesson_price','mean_exam_points']
#X_train = standart_scaler(X_train,standart_columns)
#X_test = standart_scaler(X_test,standart_columns)

In [79]:
random.seed(42)
def get_bootstrap(data, labels, N):
    n_samples = data.shape[0]
    bootstrap = []
    
    for i in range(N):
        b_data = np.zeros(data.shape)
        b_labels = np.zeros(labels.shape)
        
        for j in range(n_samples):
            sample_index = random.randint(0, n_samples-1)
            b_data[j] = data[sample_index]
            b_labels[j] = labels[sample_index]
        bootstrap.append((b_data, b_labels))
        
    return bootstrap

In [80]:
def get_subsample(len_sample):
    # будем сохранять не сами признаки, а их индексы
    sample_indexes = [i for i in range(len_sample)]
    
    len_subsample = int(np.sqrt(len_sample))
    subsample = []
    
    random.shuffle(sample_indexes)
    for _ in range(len_subsample):
        subsample.append(sample_indexes.pop())
        
    return subsample

In [81]:
class Node:  
    def __init__(self, index, t, true_branch, false_branch):
        self.index = index  # индекс признака, по которому ведется сравнение с порогом в этом узле
        self.t = t  # значение порога
        self.true_branch = true_branch  # поддерево, удовлетворяющее условию в узле
        self.false_branch = false_branch  # поддерево, не удовлетворяющее условию в узле

In [82]:
class Leaf:
    
    def __init__(self, data, labels):
        self.data = data
        self.labels = labels  # y_true
        self.prediction = self.predict()  # y_pred
        
    def predict(self):
        # подсчет количества объектов разных классов
        classes = {}  # сформируем словарь "класс: количество объектов"
        for label in self.labels:
            if label not in classes:
                classes[label] = 0
            classes[label] += 1
        #  найдем класс, количество объектов которого будет максимальным в этом листе и вернем его    
        prediction = max(classes, key=classes.get)
        return prediction    

In [83]:
def gini(labels):
    #  подсчет количества объектов разных классов
    classes = {}
    for label in labels:
        if label not in classes:
            classes[label] = 0
        classes[label] += 1
    
    #  расчет критерия
    impurity = 1     # "impurity" - "нечистота", степень неопределенности
    for label in classes:
        p = classes[label] / len(labels)
        impurity -= p ** 2
        
    return impurity

In [84]:
def quality(left_labels, right_labels, current_gini):

    # доля выборки, ушедшей в левое поддерево
    p = float(left_labels.shape[0]) / (left_labels.shape[0] + right_labels.shape[0])
    
    return current_gini - p * gini(left_labels) - (1 - p) * gini(right_labels)

In [85]:
def split(data, labels, index, t):
    
    left = np.where(data[:, index] <= t)
    right = np.where(data[:, index] > t)
        
    true_data = data[left]
    false_data = data[right]
    true_labels = labels[left]
    false_labels = labels[right]
        
    return true_data, false_data, true_labels, false_labels

In [86]:
def find_best_split(data, labels):
    
    #  обозначим минимальное количество объектов в узле
    min_leaf = 7

    current_gini = gini(labels)

    best_quality = 0
    best_t = None
    best_index = None
    
    n_features = data.shape[1]
    
    for index in range(n_features):
        t_values = [row[index] for row in data]
        
        for t in t_values:
            true_data, false_data, true_labels, false_labels = split(data, labels, index, t)
            #  пропускаем разбиения, в которых в узле остается менее 5 объектов
            if len(true_data) < min_leaf or len(false_data) < min_leaf:
                continue
            
            current_quality = quality(true_labels, false_labels, current_gini)
            
            #  выбираем порог, на котором получается максимальный прирост качества
            if current_quality > best_quality:
                best_quality, best_t, best_index = current_quality, t, index

    return best_quality, best_t, best_index

In [87]:
def build_tree(data, labels, tree_depth=1, max_depth=1):

    quality, t, index = find_best_split(data, labels)

    #прекращаем рекурсию, когда нет прироста в качества
    if quality == 0:
        return Leaf(data, labels)

    true_data, false_data, true_labels, false_labels = split(data, labels, index, t)

    # Рекурсивно строим два поддерева
    true_branch = build_tree(true_data, true_labels)
    false_branch = build_tree(false_data, false_labels)

    # Возвращаем класс узла со всеми поддеревьями, то есть целого дерева
    return Node(index, t, true_branch, false_branch)

In [88]:
def classify_object(obj, node):

    #  Останавливаем рекурсию, если достигли листа
    if isinstance(node, Leaf):
        answer = node.prediction
        return answer

    if obj[node.index] <= node.t:
        return classify_object(obj, node.true_branch)
    else:
        return classify_object(obj, node.false_branch)

In [89]:
def predict(data, tree):
    
    classes = []
    for obj in data:
        prediction = classify_object(obj, tree)
        classes.append(prediction)
    return classes

In [90]:
def tree_vote(forest, data):

    # добавим предсказания всех деревьев в список
    predictions = []
    for tree in forest:
        predictions.append(predict(data, tree))
    
    # сформируем список с предсказаниями для каждого объекта
    predictions_per_object = list(zip(*predictions))
    
    # выберем в качестве итогового предсказания для каждого объекта то,
    # за которое проголосовало большинство деревьев
    voted_predictions = []
    for obj in predictions_per_object:
        voted_predictions.append(max(set(obj), key=obj.count))
        
    return voted_predictions

In [91]:
def random_forest(data, labels, n_trees):
    forest = []
    bootstrap = get_bootstrap(data, labels, n_trees)
    
    for b_data, b_labels in bootstrap:
        forest.append(build_tree(b_data, b_labels))
        
    return forest

In [92]:
# Введем функцию подсчета точности как доли правильных ответов
def accuracy_metric(actual, predicted):
    correct = 0
    for i in range(len(actual)):
        if actual[i] == predicted[i]:
            correct += 1
    return correct / float(len(actual)) * 100.0

In [94]:
%%time
n_trees = 1
my_forest_1 = random_forest(X_train.values, y_train.values, n_trees)
# Получим ответы для обучающей выборки 
train_answers = tree_vote(my_forest_1, X_train.values)
# Получим ответы для обучающей выборки 
test_answers = tree_vote(my_forest_1, X_test.values)

Wall time: 17min 44s


In [95]:
# Точность на обучающей выборке
train_accuracy = accuracy_metric(y_train.values, train_answers)
print(f'Точность случайного леса из {n_trees} деревьев на обучающей выборке: {train_accuracy:.3f}')
# Точность на тестовой выборке
test_accuracy = accuracy_metric(y_test.values, test_answers)
print(f'Точность случайного леса из {n_trees} деревьев на тестовой выборке: {test_accuracy:.3f}')

Точность случайного леса из 1 деревьев на обучающей выборке: 89.875
Точность случайного леса из 1 деревьев на тестовой выборке: 85.800


In [None]:
%%time
n_trees = 20
my_forest_20 = random_forest(X_train.values, y_train.values, n_trees)
# Получим ответы для обучающей выборки 
train_answers = tree_vote(my_forest_1, X_train.values)
# Получим ответы для обучающей выборки 
test_answers = tree_vote(my_forest_1, X_test.values)

In [None]:
# Точность на обучающей выборке
train_accuracy = accuracy_metric(y_train.values, train_answers)
print(f'Точность случайного леса из {n_trees} деревьев на обучающей выборке: {train_accuracy:.3f}')
# Точность на тестовой выборке
test_accuracy = accuracy_metric(y_test.values, test_answers)
print(f'Точность случайного леса из {n_trees} деревьев на тестовой выборке: {test_accuracy:.3f}')

#### Применение модели:

In [40]:
X_test_predict = test.drop(['Id'], axis=1)
X_test_predict.head()


Unnamed: 0,age,years_of_experience,lesson_price,qualification,physics,chemistry,biology,english,geography,history,mean_exam_points
0,32.0,2.0,2700.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,90.0
1,35.0,6.0,1800.0,2.0,1.0,1.0,0.0,0.0,0.0,0.0,71.0
2,44.0,2.0,1200.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,45.0
3,44.0,4.0,2950.0,3.0,1.0,0.0,0.0,0.0,0.0,0.0,92.0
4,38.0,3.0,1400.0,2.0,1.0,0.0,0.0,0.0,0.0,0.0,58.0


In [41]:
test_predict_answers = predict(X_test_predict.values, my_tree)
test_predict_answers
Out_data = pd.DataFrame({'Id':test['Id'], 'choose':test_predict_answers})
Out_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 2 columns):
Id        10000 non-null int64
choose    10000 non-null int64
dtypes: int64(2)
memory usage: 156.3 KB


In [42]:
Out_data.head()

Unnamed: 0,Id,choose
0,10000,0
1,10001,0
2,10002,0
3,10003,0
4,10004,0


In [43]:
Out_data.to_csv('PodoynitsynVA_predictions.csv', index=None)