In [11]:
import pandas as pd

# Location of the raw laptop dataset, relative to the notebook's working directory.
file = "Laptop_price.csv"

# Parse the CSV into the frame that the following cells operate on.
df = pd.read_csv(file)

# Confirmation message followed by the (truncated) frame, one per line.
print("Loaded!", df, sep="\n")

Loaded!
      Brand  Processor_Speed  RAM_Size  Storage_Capacity  Screen_Size  \
0      Asus         3.830296        16               512    11.185147   
1      Acer         2.912833         4              1000    11.311372   
2    Lenovo         3.241627         4               256    11.853023   
3      Acer         3.806248        16               512    12.280360   
4      Acer         3.268097        32              1000    14.990877   
..      ...              ...       ...               ...          ...   
995      HP         3.343584         4              1000    12.587095   
996    Dell         2.780555         8               256    12.679356   
997    Dell         3.200569         4               512    12.666315   
998    Asus         1.604182         8               256    11.215581   
999  Lenovo         1.711980         4               256    16.561498   

       Weight         Price  
0    2.641094  17395.093065  
1    3.260012  31607.605919  
2    2.029061           N

Удаление пропусков

In [12]:
def clear_dataset_rows(df: pd.DataFrame, percent: float) -> None:
    """Drop, in place, every row whose share of missing cells exceeds ``percent``.

    Args:
        df: Frame to clean; it is modified in place.
        percent: Threshold as a fraction of the column count (0.5 means
            "more than half of the cells are missing"). A row whose share is
            exactly equal to the threshold is kept.

    Returns:
        None. The rows are removed from ``df`` itself.
    """
    count = df.columns.size
    if count == 0:
        # Without columns there are no cells that could be missing; returning
        # early also avoids dividing by zero below.
        return

    # Share of missing cells per row, computed for all rows at once.
    missing_share = df.isna().sum(axis=1) / count
    sparse_rows = df.index[(missing_share > percent).to_numpy()]

    # One drop for all offending rows instead of one in-place drop per row.
    df.drop(index=sparse_rows, inplace=True)
            
# Remove rows in which more than half of the cells are missing, then show the
# remaining frame followed by the column dtypes.
clear_dataset_rows(df, 0.5)
print(df, df.dtypes, sep="\n")

      Brand  Processor_Speed  RAM_Size  Storage_Capacity  Screen_Size  \
0      Asus         3.830296        16               512    11.185147   
1      Acer         2.912833         4              1000    11.311372   
2    Lenovo         3.241627         4               256    11.853023   
3      Acer         3.806248        16               512    12.280360   
4      Acer         3.268097        32              1000    14.990877   
..      ...              ...       ...               ...          ...   
995      HP         3.343584         4              1000    12.587095   
996    Dell         2.780555         8               256    12.679356   
997    Dell         3.200569         4               512    12.666315   
998    Asus         1.604182         8               256    11.215581   
999  Lenovo         1.711980         4               256    16.561498   

       Weight         Price  
0    2.641094  17395.093065  
1    3.260012  31607.605919  
2    2.029061           NaN  
3  

Восстановление пропусков

In [13]:
from collections import Counter
from pandas.core.dtypes.common import is_numeric_dtype


def recovery_dataset(df: pd.DataFrame):
    """Fill missing values in ``df`` in place, column by column.

    Text columns (``object`` or string dtype) are filled with their most
    frequent non-missing value; when several values tie, the one that appears
    first in the column wins. Numeric columns are filled with the mean of
    their non-missing values. Columns of any other dtype are left untouched.

    A column that has no missing values, or that has no non-missing value to
    derive a replacement from, is left unchanged.

    Args:
        df: Frame to repair; it is modified in place.

    Returns:
        None.
    """
    for column in df.columns:
        values = df[column]
        if not values.isna().any():
            continue  # nothing to fill

        column_type = values.dtype
        if column_type == "object" or pd.api.types.is_string_dtype(column_type):
            # Count only real values so a missing marker can never be picked
            # as the replacement.
            observed = Counter(values.dropna())
            if not observed:
                continue  # entirely missing: no value to infer
            replacement = observed.most_common(1)[0][0]
            df[column] = values.fillna(replacement)
        elif pd.api.types.is_numeric_dtype(column_type):
            avg = values.mean()  # skips missing values
            if pd.isnull(avg):
                continue  # entirely missing: the mean is undefined
            df[column] = values.fillna(avg)
# Fill the remaining gaps in place, then print the frame to inspect the result.
recovery_dataset(df)
print(df)

      Brand  Processor_Speed  RAM_Size  Storage_Capacity  Screen_Size  \
0      Asus         3.830296        16               512    11.185147   
1      Acer         2.912833         4              1000    11.311372   
2    Lenovo         3.241627         4               256    11.853023   
3      Acer         3.806248        16               512    12.280360   
4      Acer         3.268097        32              1000    14.990877   
..      ...              ...       ...               ...          ...   
995      HP         3.343584         4              1000    12.587095   
996    Dell         2.780555         8               256    12.679356   
997    Dell         3.200569         4               512    12.666315   
998    Asus         1.604182         8               256    11.215581   
999  Lenovo         1.711980         4               256    16.561498   

       Weight         Price  
0    2.641094  17395.093065  
1    3.260012  31607.605919  
2    2.029061  19611.269036  
3  

Вычисление матрицы корреляций

In [14]:
from math import sqrt

def correlation_matrix(df: pd.DataFrame) -> list:
    """Compute pairwise Pearson correlations between the numeric columns of ``df``.

    The result is a list of rows intended for plain-text printing, not a
    DataFrame:

    * one row per numeric column: ``[column_name, cell_0, ..., cell_{n-1}]``,
      where ``cell_j`` is the correlation with the j-th numeric column when
      that column comes later in column order, and ``None`` otherwise (the
      diagonal and the lower triangle are not filled);
    * a final header row: ``["", name_0, ..., name_{n-1}]``.

    Population statistics (division by n) are used. A correlation is NaN when
    it is undefined: one of the columns has zero spread, the frame has no
    rows, or a column contains missing values.

    Args:
        df: Source frame; non-numeric columns are ignored.

    Returns:
        The list of rows described above.
    """
    numeric_columns = [
        column for column in df.columns
        if pd.api.types.is_numeric_dtype(df[column].dtype)
    ]

    # Centre every column once. Working with deviations from the mean keeps
    # the variance non-negative, which the mean(x^2) - mean(x)^2 form does not
    # guarantee under floating-point rounding.
    deviations = {}
    spreads = {}
    for column in numeric_columns:
        values = df[column].astype(float).to_numpy()
        if values.size == 0:
            deviations[column] = values
            spreads[column] = float("nan")
            continue
        centered = values - values.mean()
        deviations[column] = centered
        spreads[column] = float((centered * centered).mean()) ** 0.5

    result = []
    for i, first in enumerate(numeric_columns):
        row = [first]
        for j, second in enumerate(numeric_columns):
            if j <= i:
                row.append(None)
                continue
            denominator = spreads[first] * spreads[second]
            if denominator == 0 or pd.isnull(denominator):
                # Undefined correlation: report NaN instead of dividing by zero.
                row.append(float("nan"))
                continue
            covariance = float((deviations[first] * deviations[second]).mean())
            row.append(covariance / denominator)
        result.append(row)

    # Header row, built from a fresh list so the column list is not mutated.
    result.append([""] + numeric_columns)
    return result

# Print the matrix as fixed-width text: every cell is right-aligned in, and cut
# to, 15 characters and followed by a single space.
for matrix_row in correlation_matrix(df):
    cells = (str(entry).rjust(15)[:15] for entry in matrix_row)
    print(''.join(cell + ' ' for cell in cells))

Processor_Speed            None 0.0213914402499 -0.067576486128 -0.013425460918 -0.054603421487 -0.052388651117 
       RAM_Size            None            None 0.0021111206978 -0.036002173555 -0.025381654469 0.0581364510768 
Storage_Capacit            None            None            None -0.027559966015 0.0413350855564 0.9959272812838 
    Screen_Size            None            None            None            None 0.0219945827551 -0.030327113072 
         Weight            None            None            None            None            None 0.0371859589309 
          Price            None            None            None            None            None            None 
                Processor_Speed        RAM_Size Storage_Capacit     Screen_Size          Weight           Price 


Функции для энтропии и gain ratio

In [15]:
from math import log2
    
def entropy(s : pd.Series) -> float:
    """Shannon entropy, in bits, of the class labels in ``s``.

    Missing labels are ignored: ``value_counts()`` drops them, so the class
    probabilities are normalised by the number of counted labels rather than
    by ``len(s)``; this way they always sum to 1.

    Args:
        s: Series of class labels.

    Returns:
        The entropy as a float; 0.0 when there are no non-missing labels.
    """
    class_counts = s.value_counts()
    total = class_counts.sum()
    if total == 0:
        return 0.0
    # Zero counts (e.g. unused categories) contribute nothing and would make
    # log2 fail, so they are skipped.
    probabilities = (c / total for c in class_counts if c > 0)
    return float(sum(-p * log2(p) for p in probabilities))

def information_gain(data: pd.DataFrame, target: str, feature: str) -> float:
    """Information gain of ``target`` obtained by splitting ``data`` on ``feature``.

    Computed as the entropy of the target minus the weighted entropy of the
    target inside each group of equal ``feature`` values. Weights are group
    sizes divided by the total number of rows.

    Args:
        data: Frame containing both columns.
        target: Name of the class-label column.
        feature: Name of the column to split on.

    Returns:
        The information gain in bits.
    """
    target_size = len(data[target])
    gain = entropy(data[target])
    # A single grouped pass replaces one full boolean-mask scan per unique
    # value; sort=False keeps groups in order of first appearance, so the
    # terms are subtracted in the same order as a loop over unique().
    for _, target_part in data.groupby(feature, sort=False)[target]:
        gain -= entropy(target_part) * (len(target_part) / target_size)
    return gain


def gain_ratio(data: pd.DataFrame, target: str, feature: str) -> float:
    """Gain ratio of ``feature`` with respect to ``target``.

    This is the information gain divided by the split information, i.e. the
    entropy of the partition that ``feature`` induces on the rows.

    Args:
        data: Frame containing both columns.
        target: Name of the class-label column.
        feature: Name of the column to split on.

    Returns:
        The gain ratio; 0.0 when the split information is zero (for example
        a feature with a single distinct value, or an empty frame), where the
        ratio would otherwise be a division by zero.
    """
    info = 0
    target_size = len(data[target])
    # Group sizes in order of first appearance. Rows with a missing feature
    # value form no group, and empty groups are skipped, so log2 never
    # receives a zero weight.
    for part_size in data.groupby(feature, sort=False).size():
        if part_size == 0:
            continue
        weight = part_size / target_size
        info -= weight * log2(weight)
    if info == 0:
        return 0.0
    return information_gain(data, target, feature) / info


def all_gain_ration(data: pd.DataFrame, target: str, features: list[str] = None) -> dict[str, float]:
    """Gain ratio of every requested feature with respect to ``target``.

    Args:
        data: Frame containing the target and feature columns.
        target: Name of the class-label column.
        features: Columns to score. When omitted or empty, every column of
            ``data`` except ``target`` is scored.

    Returns:
        Mapping of feature name to its gain ratio, in the order the features
        were given.
    """
    # The earlier check `features is list[str]` compared the argument with the
    # type object itself and was never true, so caller-supplied features were
    # always discarded.
    if features is None or len(features) == 0:
        features = [column for column in data if column != target]
    # gain_ratio expects (data, target, feature), in that order.
    return {column: gain_ratio(data, target, column) for column in features}

def all_information_gain(data: pd.DataFrame, target: str, features: list[str] = None) -> dict[str, float]:
    """Information gain of every requested feature with respect to ``target``.

    Args:
        data: Frame containing the target and feature columns.
        target: Name of the class-label column.
        features: Columns to score. When omitted or empty, every column of
            ``data`` except ``target`` is scored.

    Returns:
        Mapping of feature name to its information gain, ordered from the
        highest gain to the lowest.
    """
    # The earlier check `features is list[str]` compared the argument with the
    # type object itself and was never true, so caller-supplied features were
    # always discarded.
    if features is None or len(features) == 0:
        features = [column for column in data if column != target]
    # information_gain expects (data, target, feature), in that order.
    gains = {column: information_gain(data, target, column) for column in features}
    return dict(sorted(gains.items(), key=lambda pair: pair[1], reverse=True))

Вычисление энтропии и gain ratio

In [16]:
# Upper bounds (exclusive) of the 'Low' and 'Medium' price classes.
PRICE_LOW_LIMIT = 15000
PRICE_MEDIUM_LIMIT = 25000


def price_category(price) -> str:
    """Map a numeric price to its class label.

    The label strings are kept exactly as they were so that previously
    produced results stay comparable.
    """
    if price < PRICE_LOW_LIMIT:
        return 'Low'
    if price < PRICE_MEDIUM_LIMIT:
        return 'Medium'
    return 'Hight'


# Replace the numeric price with its class label. The dtype guard makes the
# cell safe to re-run: once the column holds labels, comparing them with the
# numeric limits would raise a TypeError.
if pd.api.types.is_numeric_dtype(df["Price"]):
    df['Price'] = df["Price"].apply(price_category)

In [17]:
# Information gain of every feature with respect to 'Price', ordered from the
# highest gain to the lowest.
ig = {
    col: information_gain(df, 'Price', col)
    for col in df.columns
    if col != 'Price'
}
ig = dict(sorted(ig.items(), key=lambda pair: pair[1], reverse=True))

In [18]:
# Gain ratio of every feature with respect to 'Price', ordered from the
# highest ratio to the lowest.
gr = {
    col: gain_ratio(df, 'Price', col)
    for col in df.columns
    if col != 'Price'
}
gr = dict(sorted(gr.items(), key=lambda pair: pair[1], reverse=True))

In [19]:
# One "feature:score" line per feature, in the order stored in `gr`.
print("Gain ratio")
for feature_name, score in gr.items():
    print(f'{feature_name}:{score}')


Gain ratio
Storage_Capacity:0.9828497733170076
Processor_Speed:0.15881893307315909
Screen_Size:0.15881893307315909
Weight:0.15881893307315909
Brand:0.0029636300610715546
RAM_Size:0.0023856030878556243


In [20]:
# One "feature:score" line per feature, in the order stored in `ig`.
print("Information gain")
for feature_name, score in ig.items():
    print(f'{feature_name}:{score}')

Information gain
Processor_Speed:1.5827552273272776
Screen_Size:1.5827552273272776
Weight:1.5827552273272776
Storage_Capacity:1.5550855107150214
Brand:0.0068775591153331095
RAM_Size:0.00476572724996871
