# In [9]:
import pandas as pd
import math
import numpy as np

# Read the data
def readFileWithoutOutliers(path="../data/data_train.csv"):
    """Load the training CSV and drop rows whose ``fc`` value is an IQR outlier.

    The six 0/1 flag columns are converted to ``bool``. A row is kept only
    when its ``fc`` value lies strictly between ``Q1 - 1.5 * IQR`` and
    ``Q3 + 1.5 * IQR``, where the quartiles are computed on ``fc`` itself.

    Args:
        path: CSV file to read. Defaults to the location that used to be
            hard-coded, so existing zero-argument calls behave as before.

    Returns:
        A new DataFrame holding the surviving rows. It is an explicit copy,
        so callers may overwrite its columns without triggering pandas'
        chained-assignment warning.
    """
    df = pd.read_csv(path)
    boolean_column = ['blue', 'dual_sim', 'four_g', 'three_g', 'touch_screen', 'wifi']

    for column in boolean_column:
        df[column] = df[column].astype(bool)

    fc = df['fc']
    q1 = fc.quantile(0.25)
    q3 = fc.quantile(0.75)
    iqr = q3 - q1
    lower_bound = q1 - 1.5 * iqr
    upper_bound = q3 + 1.5 * iqr

    # Strict inequalities: values exactly on a fence are treated as outliers.
    inside_fence = (fc > lower_bound) & (fc < upper_bound)
    return df[inside_fence].copy()

def classifyData(data, bins1, labels1):
    """Bin *data* into labelled intervals using ``pd.cut``.

    *bins1* is passed through unchanged as the bin edges. Every entry of
    *labels1* is converted to ``str`` before being used as a category label.
    Returns whatever ``pd.cut`` produces for those arguments.
    """
    text_labels = list(map(str, labels1))
    return pd.cut(data, bins=bins1, labels=text_labels)
def changeBecameClassifyData(df):
    """Replace thirteen numeric columns of *df* with four-level labels.

    Each listed column is binned with ``classifyData`` using its fixed edges
    below; the labels are the column name followed by ``1`` to ``4``
    (for example ``"fc1"`` ... ``"fc4"``). Columns are overwritten on the
    DataFrame that was passed in, and that same object is returned.
    """
    bin_edges = {
        "battery_power": [500, 864, 1218, 1600, 1998],
        "clock_speed": [0.4, 0.7, 1.5, 2.2, 3],
        "fc": [-0.1, 1, 3, 7, 15],
        "int_memory": [1.9, 16, 32, 48, 64],
        "m_dep": [0, 0.2, 0.5, 0.8, 1],
        "mobile_wt": [79, 107.5, 139, 169, 200],
        "n_cores": [0.9, 2, 4, 7, 8],
        "pc": [-0.1, 5, 10, 15, 20],
        "px_height": [-0.1, 273, 560, 946, 1960],
        "px_width": [499.5, 878, 1247, 1623, 1998],
        "sc_h": [4.9, 9, 12, 16, 19],
        "sc_w": [-0.1, 2, 5, 9, 18],
        "talk_time": [1.9, 6, 11, 16, 20],
    }
    for column, edges in bin_edges.items():
        # Five edges define four intervals, hence labels <column>1..<column>4.
        labels = [f"{column}{rank}" for rank in range(1, len(edges))]
        df[column] = classifyData(df[column], edges, labels)
    return df
def classify(namaKolom, value, df):
    """Map a raw value to the bin number of column *namaKolom* in *df*.

    Boolean column: returns 1 when ``value == True``, otherwise 2.

    Any other column: the bins are the quartile ranges of ``df[namaKolom]``.
    Returns 1 for values up to the 25th percentile, 2 up to the median,
    3 up to the 75th percentile and 4 above that. Values below the column
    minimum or above its maximum are clamped into bin 1 or bin 4; previously
    they matched no branch and the function silently returned ``None``.

    Args:
        namaKolom: Name of the column whose distribution defines the bins.
        value: The raw value to classify.
        df: DataFrame that contains the column.

    Returns:
        The bin number as an ``int``, or ``None`` when *value* is missing
        (NaN/None) for a non-boolean column.
    """
    column = df[namaKolom]

    if column.dtype == bool:
        # Equality (not truthiness) is intentional: only values equal to
        # True, such as 1, count as "on".
        return 1 if value == True else 2  # noqa: E712

    if pd.isna(value):
        return None

    if value <= column.quantile(0.25):
        return 1
    if value <= column.quantile(0.5):
        return 2
    if value <= column.quantile(0.75):
        return 3
    return 4
        
# Only used for the RAM column
def euclideanDistance(value, value_ram):
    """Return the one-dimensional Euclidean distance between two numbers."""
    squared_gap = (value_ram - value) ** 2
    return math.sqrt(squared_gap)
def changeInputValidation(values, df):
    """Convert a raw input row to the encoding used for comparison.

    *values* is read positionally: entry ``i`` belongs to ``df.columns[i]``.
    The list is modified in place and the same list object is returned.

    - Boolean columns: the entry becomes ``False`` when it equals 0,
      otherwise ``True``.
    - ``"ram"`` and ``"price_range"``: the entry is left untouched.
    - Every other column: the entry becomes the column name followed by the
      bin number returned by ``classify`` (for example ``"fc2"``).
    """
    for position, kolom in enumerate(df.columns):
        is_flag_column = df[kolom].dtype == bool
        if is_flag_column:
            values[position] = not (values[position] == 0)
        elif kolom not in ("ram", "price_range"):
            bin_number = classify(kolom, values[position], df)
            values[position] = f'{kolom}{bin_number}'
    return values
def makeArrayModel(df):
    """Return the rows of *df* stacked into a single NumPy array.

    The array is built from the DataFrame passed as the argument. The earlier
    version ignored its parameter and iterated over the module-level ``df1``.

    Args:
        df: DataFrame whose rows should be collected.

    Returns:
        ``np.ndarray`` with one entry per row, each holding that row's values.
    """
    return np.array([row.values for _, row in df.iterrows()])
def countDifferent(arrays, value, ram_index=13):
    """Compare every stored row against *value* and collect three lists.

    The last element of each row is treated as its price range and is never
    compared. For the remaining positions:

    - position ``ram_index`` is compared numerically with
      ``euclideanDistance`` and stored as the row's RAM difference;
    - every other position adds 1 to the row's mismatch count when the row
      entry differs from the entry of *value* at the same position.

    Args:
        arrays: Iterable of indexable rows (features followed by price range).
        value: Indexable input row, aligned positionally with each row.
        ram_index: Position of the numeric RAM feature. Defaults to 13, the
            index that was previously hard-coded.

    Returns:
        A tuple ``(counts, ram_differents, price_ranges)`` of lists with one
        entry per row. The RAM difference stays 0 for a row that has no
        feature at ``ram_index``.
    """
    counts = []
    ram_differents = []
    price_ranges = []

    for array in arrays:
        last_position = len(array) - 1
        count = 0
        ram_different = 0
        for i in range(last_position):
            if i == ram_index:
                ram_different = euclideanDistance(array[i], value[i])
            elif array[i] != value[i]:
                count += 1
        counts.append(count)
        ram_differents.append(ram_different)
        price_ranges.append(array[last_position])

    return counts, ram_differents, price_ranges

def writeModelKNN(array1, array2, array3, file_path):
    """Write three parallel sequences to *file_path* as a tab-separated table.

    The file is overwritten. It starts with the header line
    ``Count  RAM_Difference  Price_Range`` (tab-separated), followed by one
    line per zipped triple, each field converted with ``str``. Rows stop at
    the shortest of the three sequences.
    """
    rows = list(zip(array1, array2, array3))
    with open(file_path, 'w') as out:
        out.write("Count\tRAM_Difference\tPrice_Range\n")
        out.writelines('\t'.join(map(str, row)) + '\n' for row in rows)
def makeDataframeSorted(file_name, pd, kolom1, kolom2, k):
    """Load the table at *file_name* and return its first *k* rows after sorting.

    The first line of the file is skipped as a header. Every following line
    is split on whitespace and its first three fields are parsed as
    ``Count`` (int), ``RAM_Difference`` (float) and ``Price_Range`` (int).

    Args:
        file_name: Path of the whitespace-separated text file to read.
        pd: Object used to build the frame via ``pd.DataFrame`` (the caller
            in this file passes the pandas module).
        kolom1: Primary sort column.
        kolom2: Secondary sort column, used to break ties on *kolom1*.
        k: Number of leading rows to keep after sorting.

    Returns:
        The sorted DataFrame truncated to its first *k* rows.
    """
    with open(file_name, 'r') as handle:
        body_lines = handle.readlines()[1:]

    records = []
    for raw in body_lines:
        fields = raw.strip().split()
        records.append({
            'Count': int(fields[0]),
            'RAM_Difference': float(fields[1]),
            'Price_Range': int(fields[2]),
        })

    frame = pd.DataFrame(records)
    ordered = frame.sort_values(by=[kolom1, kolom2])
    return ordered.iloc[:k]
# Driver: classify one hand-written sample against the stored rows.
#
# The filtered data is loaded twice on purpose: `df` keeps the raw numeric
# columns (changeInputValidation/classify read their quantiles from it), while
# `df2` is binned in place by changeBecameClassifyData, which returns the same
# object, so `df1` and `df2` refer to one DataFrame.
df = readFileWithoutOutliers()           
df2 = readFileWithoutOutliers()
df1 = changeBecameClassifyData(df2)
# Raw feature values of the sample; changeInputValidation reads them by
# position, entry i belonging to df.columns[i].
values = [775,0,1.0,0,3,0,46,0.7,159,2,16,862,1864,568,17,15,11,1,1,1]
array = makeArrayModel(df1)
# NOTE(review): `input` shadows the builtin of the same name. It is also the
# very same list object as `values`, because changeInputValidation edits the
# list in place and returns it.
input = changeInputValidation(values, df)
count, ram, price_ranges = countDifferent(array, input)
# The per-row results are round-tripped through a text file before sorting.
writeModelKNN(count, ram, price_ranges, "hasilknn.txt")
# Keep the 28 rows with the smallest RAM difference (ties broken by the
# mismatch count) and print the most frequent price range among them;
# `.iloc[0]` picks the first entry when several values tie for the mode.
sorted_data = makeDataframeSorted("hasilknn.txt", pd, "RAM_Difference", "Count", 28)
print(sorted_data["Price_Range"].mode().iloc[0])

# Output: 0
