In [7]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import seaborn as sns

In [8]:
df = pd.read_csv('./data/train.csv')
df = df.sample(frac=1).reset_index(drop=True)

In [9]:

df_train = df.sample(frac=1)
features=['age', 'years_of_experience', 'lesson_price', 'qualification',
       'physics', 'chemistry', 'biology', 'english', 'geography', 'history',
       'mean_exam_points']
bool_features=['physics', 'chemistry', 'biology', 'english', 'geography', 'history']
df_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 10000 entries, 2411 to 7511
Data columns (total 13 columns):
Id                     10000 non-null int64
age                    10000 non-null float64
years_of_experience    10000 non-null float64
lesson_price           10000 non-null float64
qualification          10000 non-null float64
physics                10000 non-null float64
chemistry              10000 non-null float64
biology                10000 non-null float64
english                10000 non-null float64
geography              10000 non-null float64
history                10000 non-null float64
mean_exam_points       10000 non-null float64
choose                 10000 non-null int64
dtypes: float64(11), int64(2)
memory usage: 1.1 MB


In [10]:
class Leaf:
    
    def __init__(self, data, label_feature):
        self.prediction = self.__predict(data, label_feature)
        
    def __predict(self, data, label_feature):
        classes = data[label_feature].value_counts()
        if (1 in classes)==False:
            return 0
        return classes[1]/len(data)      

class Tree: 
    def __init__(self):
        self.tree = []
        
    def fit(self, data, label_feature, features, bool_features, min_leaf=5):
        self.features = features.copy()
        self.min_leaf = min_leaf
        self.label_feature=label_feature
        self.bool_features = bool_features
        self.tree = self.__build_tree(data)
        
    def load(self, tree):
        self.tree = tree
        
    def __split(self, data, feature, t):

        left=pd.DataFrame()
        right=pd.DataFrame()

        for index, row in data.iterrows():
            if row[feature] <= t:
                left=left.append(row)
            else:
                right=right.append(row)
                
        return left, right
    
    def __gini(self, labels):
        
        #  подсчет количества объектов разных классов
        classes = {}
        
        len_labels=len(labels)
        classes=labels.value_counts()
        
        impurity = 1     
        for cnt in classes:
            p = cnt / len_labels
            impurity -= p ** 2

        return impurity
    
    def __find_best_split(self, data, features, current_gini=None):
        
        if current_gini is None:
            current_gini = self.__gini(data[self.label_feature])
        

        best_quality = 0
        best_t = None
        best_feature = None
        best_left = None
        best_right = None
        best_left_gini = None
        best_right_gini = None
        
        for feature in features:
           
            if feature in self.bool_features:
                t_values = [0]
            else:
                t_values = data[feature].unique()
            
            
            for t in t_values:
                left, right =  self.__split(data, feature, t)
                
                #  пропускаем разбиения, в которых в узле остается менее 5 объектов
                if len(left) < self.min_leaf or len(right) < self.min_leaf:
                    continue

                    
                p = len(left) / len(data)
                left_gini = self.__gini(left[self.label_feature])
                right_gini = self.__gini(right[self.label_feature])
                current_quality=current_gini - p * left_gini - (1 - p) * right_gini

                
                #  выбираем порог, на котором получается максимальный прирост качества
                if current_quality > best_quality:
                    best_quality, best_t, best_feature, best_left, best_right, best_left_gini,best_right_gini  = current_quality, t, feature, left, right, left_gini, right_gini
                    
        return best_quality, best_t, best_feature, best_left, best_right, best_left_gini, best_right_gini
    
    def __build_tree(self, data, features = None, gini = None, branch = "center", level = 0):
       
        if features is None:
            features = self.features
            
        features = features.copy()
        if len(data) > 500:
            quality, t, feature, left, right, left_gini, right_gini = self.__find_best_split(data.sample(n=300), features, gini)
            if feature is not None:
                quality, t, feature, left, right, left_gini, right_gini = self.__find_best_split(data, [feature], gini)
            else:
                quality, t, feature, left, right, left_gini, right_gini = self.__find_best_split(data, features, gini)
        else:
            quality, t, feature, left, right, left_gini, right_gini = self.__find_best_split(data, features, gini)

        #  Базовый случай - прекращаем рекурсию, когда нет прироста в качества
        if quality == 0:
            _leaf = Leaf(data, self.label_feature)
            return {"c":1, "p":_leaf.prediction}
        
        if feature in self.bool_features:
            del features[features.index(feature)]
        #print(("-"*level), branch, feature, t, len(left), len(right))
        
        # Рекурсивно строим два поддерева
        true_branch = self.__build_tree(left, features, left_gini, "left", level+1)
        false_branch = self.__build_tree(right, features, right_gini, "right", level+1)

        # Возвращаем класс узла со всеми поддеревьями, то есть целого дерева
        return {"c":0, "f":feature, "p": t, "l":true_branch, "r":false_branch}
        
    def classify_object(self, obj, node = None):
        
        if node is None:
            node = self.tree
        #  Останавливаем рекурсию, если достигли листа
        if node["c"]==1:
            return node["p"]
        
        if obj[node["f"]] <= node["p"]:
            return self.classify_object(obj, node["l"])
        else:
            return self.classify_object(obj, node["r"])


In [11]:
# %%time
# train_data, test_data = train_test_split(df_train,test_size = 0.1,random_state = 1)
# train_data.shape, test_data.shape



CPU times: user 2.54 ms, sys: 865 µs, total: 3.41 ms
Wall time: 3.65 ms


((9000, 13), (1000, 13))

In [49]:
%%time
import random

class Forest:
    def __init():
        self.fits=[]
        
    def load(self, jsondata):
        self.fits=[]
        for jsontree in jsondata:
            _Tree = Tree()
            _Tree.load(jsontree)
            self.fits.append(_Tree)
            
    def fit(self, trees_count, in_trees_samples, rand_features_count, train_data, features, target_feature, bool_features, min_leaf):
        
        fits=[]
        while trees_count > 0:
            train=train_data.sample(n=in_trees_samples)
            tree = Tree()
            
            rand_features = random.sample(features, rand_features_count)
            
            tree.fit(train, target_feature, rand_features, bool_features, min_leaf=3)
            fits.append(tree)
            trees_count -=1
            
        self.fits = fits
        
    def predict(self, test_data):
        answers=[]
        for index, row in test_data.iterrows():
            prediction = 0
            for f in self.fits:
                prediction += f.classify_object(row)
            prediction=prediction/len(self.fits)
            answers.append(prediction)
        return answers

CPU times: user 42 µs, sys: 83 µs, total: 125 µs
Wall time: 93 µs


In [59]:
%%time

_Forest = Forest()
_Forest.fit(100, 100, 5, train_data, features, "choose", bool_features, min_leaf=3)



CPU times: user 24min 16s, sys: 5.31 s, total: 24min 22s
Wall time: 24min 49s


In [60]:
from sklearn.metrics import roc_auc_score

predictions = _Forest.predict(train_data)
tree_train_score = roc_auc_score(list(train_data["choose"]), predictions)
print( f'train - {tree_train_score}')

predictions = _Forest.predict(test_data)
tree_train_score = roc_auc_score(list(test_data["choose"]), predictions)
print( f'test - {tree_train_score}')

train - 0.8066877184803392
test - 0.84085776179941


In [None]:
# import json 

# jsons_trees=[]
# for Tree in _Forest.fits:
#     jsons_trees.append(Tree.tree)
    
# with open('fits.json', 'w') as outfile:
#     json.dump(jsons_trees, outfile)


In [61]:
df_main_test = pd.read_csv('./data/test.csv')
df_main_test['choose']=_Forest.predict(df_main_test)
df_main_test.loc[:, ['Id', 'choose']].to_csv('TPolunina_predictionsRndF2.csv', index=None)

In [None]:
# predictions = _F.predict(test_data)
# tree_train_score = roc_auc_score(list(test_data["choose"]), predictions)
# print( f'test - {tree_train_score}')