In [15]:
from math import sqrt
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore') 

from scipy import stats

import statsmodels.api as sm
import statsmodels.stats.api as sms
from statsmodels.stats.outliers_influence import OLSInfluence

from statsmodels.stats.outliers_influence import variance_inflation_factor
from scipy.stats import shapiro
from scipy.stats import zscore
from pingouin import partial_corr

In [16]:
df = pd.read_csv(r"data/tum_preprocessing.csv")

In [17]:
label= 'Результа ты в бенчмарке PCMark10'

In [18]:
X_data = df.drop(label, axis=1)
Y_data = df.drop(X_data.columns, axis=1)

In [38]:
def get_pivot(X, model, param, y, num):
    pivot_test = pd.DataFrame()
    predictors = model.params.index.tolist()
    n = len(predictors)
    i = np.ones(n).astype(int) * num
    pivot_test['Шаг'] = pd.Series(i)
    pivot_test['Параметр'] = predictors
    pivot_test['b'] = model.params.values.round()
    pivot_test['Стандартная ошибка'] = model.bse.values.round()
    zmodel = sm.OLS(zscore(y), zscore(X[param])).fit()
    pivot_test['Стандартизированные коэффициенты'] = np.concatenate(([np.NAN], zmodel.params.values.round(3)))
    pivot_test['t'] = model.tvalues.values.round(2)
    pivot_test['Нижняя граница'] = model.conf_int(alpha=0.05)[:][0].values.round()
    pivot_test['Верхняя граница'] = model.conf_int(alpha=0.05)[:][1].values.round()
    corr = pd.concat((X[param],y), axis=1).pcorr().round(3)['Результа ты в бенчмарке PCMark10']
    corr.pop('Результа ты в бенчмарке PCMark10')
    pivot_test['Частная корреляция'] = np.concatenate(([np.NAN],corr.values))
    return pivot_test.set_index('Шаг')

In [39]:
def stepwise_selection(X, y, 
                       initial_list=[], 
                       threshold_in=3.84, 
                       threshold_out = 2.71, 
                       verbose=True):
    included = list(initial_list)
    i = 1
    pivot_df = pd.DataFrame()
    while True:
        changed=False
        # forward step
        excluded = list(set(X.columns)-set(included))
        new_tval = pd.Series(index=excluded)
        for new_column in excluded:
            model = sm.OLS(y, sm.add_constant(pd.DataFrame(X[included+[new_column]]))).fit()
            new_tval[new_column] = model.tvalues[new_column]
            # print(new_pval)
        best_tval = new_tval.max()
        if best_tval ** 2 >= threshold_in:
            best_feature = new_tval.argmax()
            print(new_tval.index[best_feature])
            included.append(new_tval.index[best_feature])
            changed=True
            if verbose:
                print('Add  {:30} with t-value {:.6}'.format(best_feature, best_tval))
                # model = sm.OLS(y, sm.add_constant(pd.DataFrame(X[included+[new_column]]))).fit()

        # backward step
        model = sm.OLS(y, sm.add_constant(pd.DataFrame(X[included]))).fit()
        model_info = get_pivot(X, model, included, y, i)
        pivot_df = pd.concat([pivot_df, model_info], axis=0)
        i += 1
        # use all coefs except intercept
        tvalues = model.tvalues.iloc[1:]
        worst_tval = tvalues.min() # null if pvalues is empty
        if worst_tval ** 2 <= threshold_out:
            changed=True
            worst_feature = tvalues.argmin()
            included.remove(worst_feature)
            if verbose:
                print('Drop {:30} with t-value {:.6}'.format(worst_feature, worst_tval))
                model = sm.OLS(y, sm.add_constant(pd.DataFrame(X[included]))).fit()
                model_info = get_pivot(X, model, included, y, i)
                pivot_df = pd.concat([pivot_df, model_info], axis=0)
                i += 1
        if not changed:
            break
    return included, pivot_df

result, res_df = stepwise_selection(X_data, Y_data)

print('resulting features:')
print(result)

Количество ядер
Add                              13 with t-value 10.1352
Объем оперативной памяти (Гб)
Add                               7 with t-value 2.7348
Тактовая частота процессора(ГГц)
Add                               8 with t-value 2.49875
Объема кэша L2 процессора(Мб)
Add                               5 with t-value 2.08742
resulting features:
['Количество ядер', 'Объем оперативной памяти (Гб)', 'Тактовая частота процессора(ГГц)', 'Объема кэша L2 процессора(Мб)']


In [None]:
def get_pcorr()

In [69]:
def stepwise_selection(X, y, 
                       threshold_in=3.84, 
                       threshold_out = 2.71):
    included = list()    
    i = 1
    pivot_df = pd.DataFrame()
    while True:
        changed=False
        # forward step
        excluded = list(set(X.columns)-set(included))
        new_tval = pd.Series(index=excluded)
        pcors = pd.Series(index=excluded)
        for new_column in excluded:
            model = sm.OLS(y, sm.add_constant(pd.DataFrame(X[included+[new_column]]))).fit()
            new_tval[new_column] = model.tvalues[new_column]
            pcors[new_column] = np.abs(pd.concat((X[included+[new_column]], y), axis=1).pcorr()['Результа ты в бенчмарке PCMark10'][new_column])
        best_tval = new_tval[pcors.argmax()]

        if best_tval ** 2 >= threshold_in:
            best_feature = new_tval.argmax()
            included.append(new_tval.index[best_feature])
            changed=True

        # backward step
        model = sm.OLS(y, sm.add_constant(pd.DataFrame(X[included]))).fit()
        model_info = get_pivot(X, model, included, y, i)
        pivot_df = pd.concat([pivot_df, model_info], axis=0)
        i += 1

        tvalues = model.tvalues.iloc[1:]
        worst_tval = tvalues.min()
        if worst_tval ** 2 <= threshold_out:
            changed=True
            worst_feature = tvalues.argmin()
            included.remove(worst_feature)
            model = sm.OLS(y, sm.add_constant(pd.DataFrame(X[included]))).fit()
            model_info = get_pivot(X, model, included, y, i)
            pivot_df = pd.concat([pivot_df, model_info], axis=0)
            i += 1
        if not changed:
            break
    return included, pivot_df

result, res_df = stepwise_selection(X_data, Y_data)

print('resulting features:')
print(result)

Количество ядер
Объем оперативной памяти (Гб)
Тактовая частота процессора(ГГц)
Объема кэша L2 процессора(Мб)
resulting features:
['Количество ядер', 'Объем оперативной памяти (Гб)', 'Тактовая частота процессора(ГГц)', 'Объема кэша L2 процессора(Мб)']


In [71]:
res_df

Unnamed: 0_level_0,Параметр,b,Стандартная ошибка,Стандартизированные коэффициенты,t,Нижняя граница,Верхняя граница,Частная корреляция
Шаг,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1,const,787.0,212.0,,3.72,351.0,1222.0,
1,Количество ядер,627.0,62.0,0.897,10.14,499.0,754.0,0.897
2,const,782.0,189.0,,4.15,393.0,1171.0,
2,Количество ядер,529.0,66.0,0.756,8.04,393.0,664.0,0.854
2,Объем оперативной памяти (Гб),476.0,174.0,0.257,2.73,117.0,835.0,0.487
3,const,82.0,328.0,,0.25,-596.0,761.0,
3,Количество ядер,532.0,60.0,0.761,8.93,409.0,655.0,0.881
3,Объем оперативной памяти (Гб),457.0,158.0,0.247,2.89,130.0,784.0,0.517
3,Тактовая частота процессора(ГГц),328.0,131.0,0.179,2.5,56.0,599.0,0.462
4,const,-35.0,311.0,,-0.11,-681.0,610.0,


In [22]:
model = sm.OLS(Y_data, sm.add_constant(df[result])).fit()

In [None]:
summary = pd.DataFrame()
summary['R^2'] = model.rsquared
summary['Скорректированный R^2'] = model.rsquared_adj
summary['F'] = model.fvalue
summary['Стандартная ошибка оценки'] = model.