In [1]:
import numpy as np
import pandas as pd
import matplotlib.pylab as plt
import seaborn as sns
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, train_test_split, StratifiedKFold, cross_validate
from sklearn.metrics import precision_recall_fscore_support, make_scorer, recall_score, f1_score, confusion_matrix, precision_score, balanced_accuracy_score
from sklearn.metrics import fbeta_score, classification_report

from sklearn.datasets import load_boston
from sklearn.preprocessing import LabelEncoder, MinMaxScaler

from libs.print_cm import print_cm
from libs.loader import load_malicious, load_jobchange



In [31]:
# Load the Boston house-prices dataset and keep its feature names for later cells.
# NOTE(review): load_boston was deprecated in scikit-learn 1.0 and removed in 1.2,
# so this cell only runs against an older scikit-learn.
bost = load_boston()
features = bost.feature_names

# Same dataset again, requested in the (data, target) tuple form.
# NOTE(review): bost_xy is not used in the visible cells.
bost_xy = load_boston(return_X_y=True)

# One column per feature; the regression target is kept as a separate Series.
df_boston = pd.DataFrame(data=bost.data, columns=features)
target = pd.Series(data=bost.target)
df_boston.head()


def rme_list(df_bost, featues, y=None):
    """Print the mean squared difference between each feature column and a target.

    Parameters
    ----------
    df_bost : pandas.DataFrame
        Frame holding one column per feature.
    featues : iterable of str
        Column names to report on. The misspelled parameter name is kept so
        existing callers are unaffected.
    y : pandas.Series or array-like, optional
        Values subtracted from each feature column. When omitted, the
        notebook-level ``target`` is used, as the original code did.

    Returns
    -------
    None
        One ``Feature <name>: mse: <value>`` line is printed per feature.
    """
    if y is None:
        # Backward-compatible default: fall back to the notebook global.
        y = target

    # Iterate over the argument that was passed in, not the ``features`` global,
    # so the function reports exactly the columns the caller asked for.
    for feat in featues:
        diff = df_bost[feat] - y
        print(f"Feature {feat}: mse: {(diff * diff).mean()}")
        
# Print the per-feature mean squared difference for every Boston feature column.
rme_list(df_boston, features)

    

Feature CRIM: mse: 577.5147751327091
Feature ZN: mse: 597.7069960474308
Feature INDUS: mse: 322.18144249011857
Feature CHAS: mse: 588.2816996047432
Feature NOX: mse: 568.3793568381225
Feature RM: mse: 339.946211555336
Feature AGE: mse: 3189.8788339920948
Feature DIS: mse: 430.2872685756917
Feature RAD: mse: 389.65640316205537
Feature TAX: mse: 178650.52794466403
Feature PTRATIO: mse: 125.90278656126482
Feature B: mse: 119494.18543083002
Feature LSTAT: mse: 329.62644328063243


In [60]:

class Tracker:
    """Record of one candidate cut: the feature name, its error and the two cut indices."""

    def __init__(self, feat, m_se, sp1, sp2):
        # Store the four constructor arguments unchanged as attributes.
        self.feat, self.m_se = feat, m_se
        self.sp1, self.sp2 = sp1, sp2


# Row and column counts of the feature frame; r bounds the cut scan below.
r, c = df_boston.shape
print(r, c)

# Best Tracker seen so far for each feature, keyed by feature name.
min_tracker = dict()

def s_sq_err(df, t):
    """Return the sum of the squared element-wise differences between ``df`` and ``t``."""
    return ((df - t) ** 2).sum()

# Try every row position as a cut and, per feature, remember the cut whose
# combined squared error (feature value vs. target, both sides) is smallest.
# NOTE(review): the top part is rows [0, s1) and the bottom part is rows [s2, r)
# with s2 = s1 + 1, so row s1 belongs to neither part and the bottom part is
# empty on the last iteration — confirm this gap is intended.
for i in range(r - 1):
    s1 = i + 1
    s2 = s1 + 1
    top_df, bottom_df = df_boston[:s1], df_boston[s2:]
    t_top, t_bottom = target[:s1], target[s2:]

    for feat in features:
        top = s_sq_err(top_df[feat], t_top)
        bot = s_sq_err(bottom_df[feat], t_bottom)
        err_combine = top + bot
        cur_trac = Tracker(feat, err_combine, s1, s2)

        old_trac = min_tracker.get(feat)
        # Replace only on a strict improvement, so the earliest cut wins ties.
        if old_trac is None or old_trac.m_se > cur_trac.m_se:
            min_tracker[feat] = cur_trac
        
# Print each feature's best error, then gather the same values into one table.
rows_list = []
for trac in min_tracker.values():
    print(trac.feat, trac.m_se)
    rows_list.append({'feat': trac.feat, 'min_se': trac.m_se})

res_df = pd.DataFrame(rows_list)
print(res_df)

# Pick the feature whose best cut has the lowest error overall.
# idxmin() returns an index *label*, which is what .loc expects. argmin()
# returns a position and only agreed with .loc because res_df carries the
# default 0..n-1 index.
min_arg = res_df['min_se'].idxmin()
print(f"min_arg {min_arg}")

min_feat = res_df.loc[min_arg, 'feat']
trac_min = min_tracker[min_feat]
print(f"min feature: {min_feat} s1: {trac_min.sp1} s2: {trac_min.sp2}")

# Mean target on each side of the selected cut; .iloc makes the positional
# slicing explicit.
y_hat_1 = target.iloc[:trac_min.sp1].mean()
y_hat_2 = target.iloc[trac_min.sp2:].mean()
print(f"y_hat1 {y_hat_1} y_hat2: {y_hat_2} ")


      
            
# top_dfd = df_boston[2:]

# sq_err = s_sq_err(df_boston)
# sq_err.argmin()
# print(sq_err)
# print(sq_err/r)

506 13
CRIM 286048.25701071083
ZN 297761.18
INDUS 160569.5983
CHAS 295170.54
NOX 285139.89375909
RM 169985.082147
AGE 1605053.69
DIS 215337.04190314
RAD 194765.13999999998
TAX 89901551.14
PTRATIO 62337.81
B 60310472.217999995
LSTAT 164460.9874
       feat        min_se
0      CRIM  2.860483e+05
1        ZN  2.977612e+05
2     INDUS  1.605696e+05
3      CHAS  2.951705e+05
4       NOX  2.851399e+05
5        RM  1.699851e+05
6       AGE  1.605054e+06
7       DIS  2.153370e+05
8       RAD  1.947651e+05
9       TAX  8.990155e+07
10  PTRATIO  6.233781e+04
11        B  6.031047e+07
12    LSTAT  1.644610e+05
min_arg 10
min feature: PTRATIO s1: 257 s2: 258
y_hat1 24.433463035019454 y_hat2: 20.45241935483871 


In [14]:
# NOTE(review): leftover interactive help lookup; consider removing it before sharing.
?load_boston

Signature: load_boston(*, return_X_y=False)
Docstring:
Load and return the boston house-prices dataset (regression).

Samples total               506
Dimensionality               13
Features         real, positive
Targets           real 5. - 50.

Read more in the :ref:`User Guide <boston_dataset>`.

Parameters
----------
return_X_y : bool, default=False
    If True, returns ``(data, target)`` instead of a Bunch object.
    See below for more information about the `data` and `target` object.

    .. versionadded:: 0.18

Returns
-------
data : :class:`~sklearn.utils.Bunch`
    Dictionary-like object, with the following attributes.

    data : ndarray of shape (506, 13)
        The data matrix.
    target : ndarray of shape (506, )
        The regression target.
    filename : str
        The physical location of boston csv dataset.

        .. versionadded:: 0