In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import scipy.stats as sp

In [2]:
tool_62 = pd.read_csv('./tool_62.csv')
tool_66 = pd.read_csv('./tool_66.csv')

In [3]:
split_62 = [pd.read_csv(f'./merged_split/62/{i}.csv') for i in range(31)] # 마지막 한개는 all cycle이 아님
split_66 = [pd.read_csv(f'./merged_split/62/{i}.csv') for i in range(27)]

### 온갖 함수

In [4]:
def filling(dataframe:pd.DataFrame):
    temp = dataframe.copy()
    temp.interpolate('linear',inplace=True)
    temp.fillna('bfill',inplace=True)
    temp.fillna('ffill',inplace=True)
    return temp

In [12]:
import pandas as pd
from scipy.stats import ttest_ind, levene
import numpy as np

def perform_t_tests(df1, df2):
    """
    두 데이터프레임의 모든 열에 대해 독립 표본 t-검정을 수행하는 함수입니다.

    Parameters:
    df1 (pandas.DataFrame): 첫 번째 데이터프레임
    df2 (pandas.DataFrame): 두 번째 데이터프레임

    Returns:
    results (pandas.DataFrame): 각 열에 대한 t-검정 결과를 포함하는 데이터프레임
    """
    # 결과를 저장할 빈 리스트를 초기화합니다.
    results = []

    # df1과 df2의 모든 열에 대해 반복합니다.
    for column in df1.columns:
        # 해당 열의 데이터를 추출합니다.
        data1 = df1[column]
        data2 = df2[column]
        data1 =filling(data1)
        data2=filling(data2)
        
        # 데이터가 충분한 경우에만 t-검정을 수행
        if len(data1) > 1 and len(data2) > 1:
            # F-테스트를 사용하여 분산 동일성 검사
            _, p_value_levene = levene(data1, data2)
            equal_var = p_value_levene > 0.05

            # 독립 표본 t-검정 수행
            t_statistic, p_value = ttest_ind(data1, data2, equal_var=equal_var)
            results.append({'Column': column, 'P': "{:.4e}".format(p_value)})
        else:
            # 데이터가 충분하지 않은 경우 결과 추가
            results.append({'Column': column, 'P': np.nan})

    # 결과 리스트를 데이터프레임으로 변환
    results_df = pd.DataFrame(results)

    # 결과 리스트를 데이터프레임으로 변환합니다.
    results_df = pd.DataFrame(results).transpose()
    results_df.columns = results_df.iloc[0,:]
    results_df.drop('Column',inplace=True)
    results
    return results_df

In [13]:
def convert_objects_to_float(df):
    """
    데이터프레임의 모든 object 타입 컬럼을 float 타입으로 변환합니다.

    Parameters:
    df (pd.DataFrame): 입력 데이터프레임

    Returns:
    pd.DataFrame: 타입이 변환된 데이터프레임
    """
    for column in df.columns:
        if df[column].dtype == 'object':
            # 숫자로 변환할 수 없는 값을 포함한 경우 오류가 발생할 수 있음
            try:
                df[column] = df[column].astype(float)
            except ValueError:
                print(f"컬럼 '{column}'을 float로 변환할 수 없습니다.")
    return df

![image.png](attachment:image.png)

In [14]:
def rms(x):
  return np.sqrt(np.mean(x**2))

def Feature_Make(dataframe:pd.DataFrame):
  '''
  데이터프레임의 통계적인 모든 특징을 담아내는 함수
  '''
  data = filling(dataframe)
  Max = np.max(data)
  Min = np.min(data)
  Mean = np.mean(data)
  Rms = rms(data)
  Std = np.std(data)
  Skew =sp.skew(data)
  Kurt = sp.kurtosis(data)
  if Rms == 0:
    Cf = np.NaN
  else:
    Cf = Max/Rms
  if Mean==0:
    If = np.NaN
  else:
    If = Rms/Mean
  if Mean  == 0:
    Sf = 0
  else:
    Sf = Max/Mean
  Median = np.median(data)
  Mode = sp.mode(data)[0]
  q1 = np.quantile(data,0.25)
  q3 = np.quantile(data,0.75)
  Iqr = q3-q1
  List = [Max,Min,Mean,Rms,Std,Skew,Kurt,Cf,If,Sf,Median,Mode,q1,q3,Iqr]
  Domain = pd.DataFrame(List,index=['Max','Min','Mean','Rms','Std','Skew',\
                        'Kurt','Cf','If','Sf','Median','Mode','q1','q3','Iqr'])
  Domain = Domain.transpose()
  return Domain

In [15]:
def update_diff(df, threshold):
    """
    데이터프레임의 모든 컬럼에 대해 각 값이 주어진 임계값보다 작은 경우 'Diff', 나머지는 'NOPE'으로 변경합니다.

    Parameters:
    df (pd.DataFrame): 입력 데이터프레임
    threshold (float): 비교할 임계값

    Returns:
    pd.DataFrame: 업데이트된 데이터프레임
    """
    for column in df.columns:
        df[column] = df[column].apply(lambda x: 'Diff' if x < threshold else 'NOPE')
    return df
p_value = 0.05

### 62

In [16]:
jump  = 60
sec = 120
columns_to_delete = ["Active_Power_W1", "Active_Power_W2", "Active_Power_W3", "Apparent_Power_Va1",
                     "Apparent_Power_Va2", "Line_Current_L1", "Line_Current_L2", "Line_Current_L3",
                     "Line_Voltage_V23", "Line_Voltage_V31"]
columns = list(split_62[0].columns)
selected_columns = [item for item in columns if item not in columns_to_delete]
del selected_columns[0]
stat_list_front = [pd.DataFrame() for i in range(len(selected_columns))]
stat_list_second = [pd.DataFrame() for i in range(len(selected_columns))]
stat_list_third = [pd.DataFrame() for i in range(len(selected_columns))]
stat_list_back = [pd.DataFrame() for i in range(len(selected_columns))]
stat_list_back_second = [pd.DataFrame() for i in range(len(selected_columns))]


for dataframe in split_62:
    data = dataframe[selected_columns]
    front = data.iloc[jump: jump+sec,:]
    second = data.iloc[jump+sec: jump+sec*2,:]
    third = data.iloc[jump+sec*2: jump+sec*3,:]
    back = data.iloc[-jump-sec:-jump,:]
    back_second = data.iloc[-jump-sec*2:-jump-sec*1,:]
    for i, column in enumerate(selected_columns): # 앞 부분과 뒷 부분의 통계치 비교 하기 
        stat_list_front[i] = pd.concat([stat_list_front[i],Feature_Make(front[column])],axis=0)
        stat_list_back[i] = pd.concat([stat_list_back[i],Feature_Make(back[column])],axis=0)
        stat_list_second[i] = pd.concat([stat_list_second[i],Feature_Make(second[column])],axis=0)
        stat_list_third[i] = pd.concat([stat_list_third[i],Feature_Make(third[column])],axis=0)
        stat_list_back_second[i] = pd.concat([stat_list_back_second[i],Feature_Make(back_second[column])],axis=0)


['Apparent_Power_Va3', 'Line_Voltage_V12', 'x', 'y', 'z']


  Skew =sp.skew(data)
  Kurt = sp.kurtosis(data)


In [17]:
def total_p_value(List1:list,List2:list):
    '''
    두 부분에 대하여 각 열별 p_value를 도출한다. 
    index는 selected_columns를 
    column은 통계량을 의미한다. 
    만약에 p_value가 낮다면 이는 해당 columne에서는 어떠한 통계량을 쓰는 것이 좋은지 알 수 있다.
    '''
    p_value_columns =pd.DataFrame()
    for i in range(len(selected_columns)):
        p_value_columns = pd.concat([p_value_columns,perform_t_tests(List1[i],List2[i])])
    p_value_columns.index = selected_columns
    p_value_columns = convert_objects_to_float(p_value_columns)
    return p_value_columns

In [18]:
p_value_62_twothree = total_p_value(stat_list_second,stat_list_third)
p_value_62_twothree

Column,Max,Min,Mean,Rms,Std,Skew,Kurt,Cf,If,Sf,Median,Mode,q1,q3,Iqr
Apparent_Power_Va3,0.030417,0.30051,0.1076,0.088528,0.075279,0.67695,0.51872,0.15716,0.044736,0.12674,0.50486,0.77921,0.81454,0.025591,0.015684
Line_Voltage_V12,0.80638,0.94749,0.98428,0.98411,0.21742,0.091735,0.042488,0.16701,0.16468,0.16689,0.93059,0.85322,0.94235,0.95552,0.32212
x,0.64316,0.90645,0.96492,0.96603,0.80437,0.8741,0.77232,0.56123,0.796,0.56065,0.94047,0.58984,0.99057,0.87887,0.73591
y,0.70607,0.73526,0.98699,0.98723,0.84358,0.79705,0.23449,0.46366,0.91459,0.46255,0.96256,0.71108,0.91689,0.95579,0.52735
z,0.098919,0.18104,0.90912,0.14216,0.14553,0.51544,0.44056,0.61616,0.048524,0.029734,0.76092,0.26948,0.59656,0.78284,0.56794


In [19]:
p_value_62_frontback=total_p_value(stat_list_front,stat_list_back)
p_value_62_frontback

Column,Max,Min,Mean,Rms,Std,Skew,Kurt,Cf,If,Sf,Median,Mode,q1,q3,Iqr
Apparent_Power_Va3,0.000555,0.73847,0.00663,0.004393,0.000504,0.52574,0.0557,0.002529,0.00057,0.002,0.026428,0.39832,0.35836,0.004306,0.000712
Line_Voltage_V12,0.45586,0.40048,0.65437,0.65457,0.81636,0.036501,0.33099,0.25877,0.64359,0.25985,0.80888,0.82969,0.71251,0.65125,0.71552
x,0.87315,0.57119,0.70334,0.70051,0.59268,0.81021,0.54957,0.90367,0.51283,0.90597,0.8099,0.99894,0.51064,0.80974,0.43619
y,0.79096,0.53271,0.97916,0.98095,0.65797,0.4728,0.051689,0.65062,0.50048,0.65152,0.92611,0.93762,0.86201,0.84583,0.20623
z,0.94878,0.9805,0.62681,0.93353,0.985,0.495,0.008467,0.021507,0.84657,0.9417,0.42051,0.868,0.68682,0.06139,0.062283


In [20]:
p_value_62_back=total_p_value(stat_list_back,stat_list_back_second)
p_value_62_back

Column,Max,Min,Mean,Rms,Std,Skew,Kurt,Cf,If,Sf,Median,Mode,q1,q3,Iqr
Apparent_Power_Va3,0.03385,0.47959,0.028298,0.023832,0.016295,0.88719,0.010445,0.13714,0.017936,0.11604,0.13896,0.69142,0.74588,0.011973,0.004218
Line_Voltage_V12,0.74983,0.78198,0.80543,0.80532,0.74983,0.94017,0.54878,0.8161,0.67854,0.81592,0.83453,0.81007,0.82492,0.71127,0.33969
x,0.42141,0.62741,0.7782,0.78208,0.59189,0.44793,0.97168,0.397,0.52581,0.39679,0.93675,0.64858,0.97907,0.71158,0.61443
y,0.7682,0.39677,0.99296,0.99264,0.91819,0.11033,0.62133,0.61688,0.9197,0.61493,0.98228,0.43417,0.9784,0.9992,0.93917
z,0.37886,0.25035,0.12315,0.23431,0.21872,0.049329,0.24844,0.040009,0.053611,0.053165,0.67175,0.51063,0.45965,0.15208,0.10993


In [21]:
## 2분과 3분대에서는? 
update_diff(p_value_62_twothree,p_value)

Column,Max,Min,Mean,Rms,Std,Skew,Kurt,Cf,If,Sf,Median,Mode,q1,q3,Iqr
Apparent_Power_Va3,Diff,NOPE,NOPE,NOPE,NOPE,NOPE,NOPE,NOPE,Diff,NOPE,NOPE,NOPE,NOPE,Diff,Diff
Line_Voltage_V12,NOPE,NOPE,NOPE,NOPE,NOPE,NOPE,Diff,NOPE,NOPE,NOPE,NOPE,NOPE,NOPE,NOPE,NOPE
x,NOPE,NOPE,NOPE,NOPE,NOPE,NOPE,NOPE,NOPE,NOPE,NOPE,NOPE,NOPE,NOPE,NOPE,NOPE
y,NOPE,NOPE,NOPE,NOPE,NOPE,NOPE,NOPE,NOPE,NOPE,NOPE,NOPE,NOPE,NOPE,NOPE,NOPE
z,NOPE,NOPE,NOPE,NOPE,NOPE,NOPE,NOPE,NOPE,Diff,Diff,NOPE,NOPE,NOPE,NOPE,NOPE


In [22]:
update_diff(p_value_62_frontback,p_value)

Column,Max,Min,Mean,Rms,Std,Skew,Kurt,Cf,If,Sf,Median,Mode,q1,q3,Iqr
Apparent_Power_Va3,Diff,NOPE,Diff,Diff,Diff,NOPE,NOPE,Diff,Diff,Diff,Diff,NOPE,NOPE,Diff,Diff
Line_Voltage_V12,NOPE,NOPE,NOPE,NOPE,NOPE,Diff,NOPE,NOPE,NOPE,NOPE,NOPE,NOPE,NOPE,NOPE,NOPE
x,NOPE,NOPE,NOPE,NOPE,NOPE,NOPE,NOPE,NOPE,NOPE,NOPE,NOPE,NOPE,NOPE,NOPE,NOPE
y,NOPE,NOPE,NOPE,NOPE,NOPE,NOPE,NOPE,NOPE,NOPE,NOPE,NOPE,NOPE,NOPE,NOPE,NOPE
z,NOPE,NOPE,NOPE,NOPE,NOPE,NOPE,Diff,Diff,NOPE,NOPE,NOPE,NOPE,NOPE,NOPE,NOPE


In [23]:
update_diff(p_value_62_back,p_value)

Column,Max,Min,Mean,Rms,Std,Skew,Kurt,Cf,If,Sf,Median,Mode,q1,q3,Iqr
Apparent_Power_Va3,Diff,NOPE,Diff,Diff,Diff,NOPE,Diff,NOPE,Diff,NOPE,NOPE,NOPE,NOPE,Diff,Diff
Line_Voltage_V12,NOPE,NOPE,NOPE,NOPE,NOPE,NOPE,NOPE,NOPE,NOPE,NOPE,NOPE,NOPE,NOPE,NOPE,NOPE
x,NOPE,NOPE,NOPE,NOPE,NOPE,NOPE,NOPE,NOPE,NOPE,NOPE,NOPE,NOPE,NOPE,NOPE,NOPE
y,NOPE,NOPE,NOPE,NOPE,NOPE,NOPE,NOPE,NOPE,NOPE,NOPE,NOPE,NOPE,NOPE,NOPE,NOPE
z,NOPE,NOPE,NOPE,NOPE,NOPE,Diff,NOPE,Diff,NOPE,NOPE,NOPE,NOPE,NOPE,NOPE,NOPE


### 66

In [24]:
stat_list_front = [pd.DataFrame() for i in range(len(selected_columns))]
stat_list_second = [pd.DataFrame() for i in range(len(selected_columns))]
stat_list_third = [pd.DataFrame() for i in range(len(selected_columns))]
stat_list_back = [pd.DataFrame() for i in range(len(selected_columns))]
stat_list_back_second = [pd.DataFrame() for i in range(len(selected_columns))]
print(selected_columns)
for dataframe in split_66:
    data = dataframe[selected_columns]
    front = data.iloc[jump: jump+sec,:]
    second = data.iloc[jump+sec: jump+sec*2,:]
    third = data.iloc[jump+sec*2: jump+sec*3,:]
    back = data.iloc[-jump-sec:-jump,:]
    back_second = data.iloc[-jump-sec*2:-jump-sec*1,:]
    for i, column in enumerate(selected_columns): # 앞 부분과 뒷 부분의 통계치 비교 하기 
        stat_list_front[i] = pd.concat([stat_list_front[i],Feature_Make(front[column])],axis=0)
        stat_list_back[i] = pd.concat([stat_list_back[i],Feature_Make(back[column])],axis=0)
        stat_list_second[i] = pd.concat([stat_list_second[i],Feature_Make(second[column])],axis=0)
        stat_list_third[i] = pd.concat([stat_list_third[i],Feature_Make(third[column])],axis=0)
        stat_list_back_second[i] = pd.concat([stat_list_back_second[i],Feature_Make(back_second[column])],axis=0)

['Apparent_Power_Va3', 'Line_Voltage_V12', 'x', 'y', 'z']


  Skew =sp.skew(data)
  Kurt = sp.kurtosis(data)


In [25]:
p_value_66_twothree = total_p_value(stat_list_second,stat_list_third)
p_value_66_twothree

Column,Max,Min,Mean,Rms,Std,Skew,Kurt,Cf,If,Sf,Median,Mode,q1,q3,Iqr
Apparent_Power_Va3,0.004139,0.19509,0.008973,0.008651,0.023495,0.47055,0.62882,0.082017,0.024433,0.063641,0.086901,0.20706,0.059034,0.003595,0.005514
Line_Voltage_V12,0.8396,0.89252,0.9248,0.92468,0.43296,0.26182,0.060994,0.47088,0.33904,0.47058,0.96612,0.98928,0.97467,0.92242,0.5877
x,0.9822,0.84699,0.99782,0.99878,0.80271,0.95378,0.88702,0.96996,0.81534,0.9707,0.99869,0.79688,0.99389,0.93094,0.83464
y,0.73088,0.73748,0.9669,0.96726,0.77579,0.93621,0.15974,0.49498,0.84456,0.49423,0.98722,0.40531,0.88411,0.96054,0.41645
z,0.088202,0.21756,0.8299,0.13394,0.13723,0.84017,0.45525,0.58219,0.051318,0.029624,0.72194,0.41613,0.23491,0.63211,0.24219


In [26]:
p_value_66_frontback = total_p_value(stat_list_front,stat_list_back)
p_value_66_frontback

Column,Max,Min,Mean,Rms,Std,Skew,Kurt,Cf,If,Sf,Median,Mode,q1,q3,Iqr
Apparent_Power_Va3,0.000577,0.60797,0.005305,0.003309,0.00031,0.47652,0.065184,0.002474,0.000377,0.001956,0.012348,0.16279,0.33487,0.003531,0.000517
Line_Voltage_V12,0.37844,0.36099,0.57705,0.57723,0.84712,0.034667,0.48354,0.2207,0.65563,0.22171,0.72686,0.74484,0.64404,0.57035,0.64562
x,0.81956,0.68836,0.80938,0.80701,0.7196,0.86801,0.54573,0.90188,0.60803,0.89927,0.89411,0.90402,0.63199,0.87788,0.56329
y,0.96901,0.59794,0.959,0.96039,0.76272,0.91845,0.0116,0.99681,0.57461,0.99899,0.91798,0.43745,0.9153,0.83195,0.24952
z,0.85507,0.98573,0.72619,0.87544,0.93732,0.23782,0.002322,0.01888,0.853,0.93914,0.54691,0.81965,0.74141,0.10888,0.087533


In [27]:
p_value_66_back=total_p_value(stat_list_back,stat_list_back_second)
p_value_66_back

Column,Max,Min,Mean,Rms,Std,Skew,Kurt,Cf,If,Sf,Median,Mode,q1,q3,Iqr
Apparent_Power_Va3,0.021929,0.47756,0.012713,0.010193,0.007925,0.86493,0.011222,0.11778,0.008649,0.096297,0.059346,0.76449,0.83319,0.00403,0.001226
Line_Voltage_V12,0.73045,0.81723,0.79673,0.7966,0.67195,0.82181,0.91228,0.77826,0.64158,0.77808,0.81627,0.74756,0.82061,0.69383,0.28036
x,0.47964,0.49568,0.85699,0.8603,0.62439,0.81155,0.80048,0.38977,0.57574,0.38927,0.95111,0.52722,0.99478,0.8416,0.74281
y,0.96444,0.45892,0.98099,0.98015,0.78797,0.18955,0.29758,0.95944,0.79809,0.95807,0.98877,0.27311,0.99776,0.97767,0.93669
z,0.16482,0.094934,0.13296,0.087874,0.080758,0.081975,0.08783,0.016532,0.022755,0.018754,0.69904,0.22036,0.27129,0.14692,0.064999


In [28]:
update_diff(p_value_66_twothree,p_value)

Column,Max,Min,Mean,Rms,Std,Skew,Kurt,Cf,If,Sf,Median,Mode,q1,q3,Iqr
Apparent_Power_Va3,Diff,NOPE,Diff,Diff,Diff,NOPE,NOPE,NOPE,Diff,NOPE,NOPE,NOPE,NOPE,Diff,Diff
Line_Voltage_V12,NOPE,NOPE,NOPE,NOPE,NOPE,NOPE,NOPE,NOPE,NOPE,NOPE,NOPE,NOPE,NOPE,NOPE,NOPE
x,NOPE,NOPE,NOPE,NOPE,NOPE,NOPE,NOPE,NOPE,NOPE,NOPE,NOPE,NOPE,NOPE,NOPE,NOPE
y,NOPE,NOPE,NOPE,NOPE,NOPE,NOPE,NOPE,NOPE,NOPE,NOPE,NOPE,NOPE,NOPE,NOPE,NOPE
z,NOPE,NOPE,NOPE,NOPE,NOPE,NOPE,NOPE,NOPE,NOPE,Diff,NOPE,NOPE,NOPE,NOPE,NOPE


In [29]:
update_diff(p_value_66_frontback,p_value)

Column,Max,Min,Mean,Rms,Std,Skew,Kurt,Cf,If,Sf,Median,Mode,q1,q3,Iqr
Apparent_Power_Va3,Diff,NOPE,Diff,Diff,Diff,NOPE,NOPE,Diff,Diff,Diff,Diff,NOPE,NOPE,Diff,Diff
Line_Voltage_V12,NOPE,NOPE,NOPE,NOPE,NOPE,Diff,NOPE,NOPE,NOPE,NOPE,NOPE,NOPE,NOPE,NOPE,NOPE
x,NOPE,NOPE,NOPE,NOPE,NOPE,NOPE,NOPE,NOPE,NOPE,NOPE,NOPE,NOPE,NOPE,NOPE,NOPE
y,NOPE,NOPE,NOPE,NOPE,NOPE,NOPE,Diff,NOPE,NOPE,NOPE,NOPE,NOPE,NOPE,NOPE,NOPE
z,NOPE,NOPE,NOPE,NOPE,NOPE,NOPE,Diff,Diff,NOPE,NOPE,NOPE,NOPE,NOPE,NOPE,NOPE


In [30]:
update_diff(p_value_66_back,p_value)

Column,Max,Min,Mean,Rms,Std,Skew,Kurt,Cf,If,Sf,Median,Mode,q1,q3,Iqr
Apparent_Power_Va3,Diff,NOPE,Diff,Diff,Diff,NOPE,Diff,NOPE,Diff,NOPE,NOPE,NOPE,NOPE,Diff,Diff
Line_Voltage_V12,NOPE,NOPE,NOPE,NOPE,NOPE,NOPE,NOPE,NOPE,NOPE,NOPE,NOPE,NOPE,NOPE,NOPE,NOPE
x,NOPE,NOPE,NOPE,NOPE,NOPE,NOPE,NOPE,NOPE,NOPE,NOPE,NOPE,NOPE,NOPE,NOPE,NOPE
y,NOPE,NOPE,NOPE,NOPE,NOPE,NOPE,NOPE,NOPE,NOPE,NOPE,NOPE,NOPE,NOPE,NOPE,NOPE
z,NOPE,NOPE,NOPE,NOPE,NOPE,NOPE,NOPE,Diff,Diff,Diff,NOPE,NOPE,NOPE,NOPE,NOPE
