In [None]:
import numpy as np
import pandas as pd
from dash import dash

In [None]:
from ProjektZaliczeniowy import Dashboard

In [None]:
def set_settings():
    """Configure the global pandas display options used by this notebook.

    Sets ``display.max_columns``, ``display.max_rows`` and ``display.width``
    to ``None``. The change is process-wide: it affects every DataFrame
    rendered after this call.
    """
    display_options = ('display.max_columns', 'display.max_rows', 'display.width')
    for option_name in display_options:
        pd.set_option(option_name, None)

In [None]:
def get_data_from_file(file_name):
    """Load a CSV source into a DataFrame with pandas' default parsing.

    Parameters
    ----------
    file_name : path or file-like
        Forwarded unchanged to ``pd.read_csv``.

    Returns
    -------
    pandas.DataFrame
        The parsed contents; no cleaning is applied here.
    """
    return pd.read_csv(file_name)

In [None]:
def get_data_in_numpy_array(pd_array):
    """Return the given array-like (e.g. a DataFrame) as a new NumPy array.

    ``np.array`` is used, so the result is a copy: modifying it does not
    touch the object that was passed in.
    """
    return np.array(pd_array)

In [None]:
def outlier_data(out_data, column='price', iqr_multiplier=1.5):
    """Keep only the rows whose ``column`` value passes the IQR outlier rule.

    A value is kept when it lies in the closed interval
    ``[Q1 - iqr_multiplier * IQR, Q3 + iqr_multiplier * IQR]``, where Q1 and
    Q3 are the 25th and 75th percentiles of ``column`` and ``IQR = Q3 - Q1``.
    Rows where ``column`` is NaN fail the comparison and are dropped.

    Parameters
    ----------
    out_data : pandas.DataFrame
        Frame to filter; it is not modified.
    column : str, default 'price'
        Name of the numeric column the rule is applied to.
    iqr_multiplier : float, default 1.5
        Width of the accepted band on each side, in IQR units.

    Returns
    -------
    pandas.DataFrame
        The rows of ``out_data`` inside the accepted band, with their
        original index labels.
    """
    values = out_data[column]
    first_quartile = values.quantile(0.25)
    third_quartile = values.quantile(0.75)
    iqr = third_quartile - first_quartile  # interquartile range

    lower_bound = first_quartile - iqr_multiplier * iqr
    upper_bound = third_quartile + iqr_multiplier * iqr

    # ``between`` is inclusive on both ends by default, matching >= / <=.
    return out_data[values.between(lower_bound, upper_bound)]

In [None]:
def data_clean(data):
    """Clean the raw data set and return a new, tidy DataFrame.

    After whitespace is stripped from the column names, the frame must
    contain the columns ``cut``, ``color``, ``clarity``, ``price``, ``carat``,
    ``x dimension``, ``y dimension``, ``z dimension``, ``depth`` and ``table``.

    Steps, in order:

    1. Strip surrounding whitespace from the column names.
    2. Turn empty / single-space cells into NaN.
    3. Upper-case ``cut``, ``color`` and ``clarity`` so categories compare equal.
    4. Drop rows that have no ``price``.
    5. Convert the numeric columns to ``float64`` and fill their missing
       values with the column median (computed after step 4).
    6. Cast ``table`` to ``int64`` and the text columns to ``string``.
    7. Remove price outliers with ``outlier_data``.
    8. Drop duplicates on (``clarity``, ``color``, ``cut``, ``x dimension``),
       keeping the first occurrence.

    Parameters
    ----------
    data : pandas.DataFrame
        Raw frame. It is not modified; all work happens on a copy.

    Returns
    -------
    pandas.DataFrame
        The cleaned frame, keeping the index labels of the surviving rows.

    Raises
    ------
    KeyError
        If one of the required columns is missing.
    ValueError
        If a numeric column holds a value that cannot be converted to float.
    """
    text_columns = ['cut', 'color', 'clarity']
    median_filled_columns = ['carat', 'x dimension', 'y dimension',
                             'z dimension', 'depth', 'table']

    # ``rename`` returns a new frame, so the caller's column labels stay intact.
    data = data.rename(columns=str.strip)

    # Treat empty and single-space cells as missing values.
    data = data.replace({"": np.nan, " ": np.nan})

    # Normalise category spelling to upper case.
    for column in text_columns:
        data[column] = data[column].str.upper()

    # A row without a price is unusable. ``copy`` makes the result an
    # independent frame, so the column assignments below are unambiguous.
    data = data.dropna(subset=['price']).copy()

    # Convert to float first so the median is computed on real numbers even
    # when the CSV column was read as text, then assign the filled column back.
    # Assigning (instead of chained ``fillna(..., inplace=True)``) is what
    # makes the fill take effect under pandas Copy-on-Write.
    for column in median_filled_columns:
        numeric = data[column].astype('float64')
        data[column] = numeric.fillna(numeric.median())

    # Final dtypes. Casting ``table`` to int64 truncates a fractional median.
    data['price'] = data['price'].astype('float64')
    data['table'] = data['table'].astype('int64')
    for column in text_columns:
        data[column] = data[column].astype('string')

    # Remove price outliers.
    data = outlier_data(data)

    # Rows sharing these four values are treated as duplicates; the first
    # occurrence is kept.
    data = data.drop_duplicates(subset=["clarity", "color", "cut", 'x dimension'], keep='first')
    return data

In [None]:
if __name__ == '__main__':
    # Script entry point: build the Dash app, load and clean the data set,
    # then hand both to the project's Dashboard module.

    # Dash application object that is passed to the dashboard below.
    app = dash.Dash(__name__)
    # Apply the pandas display options defined in set_settings().
    set_settings()
    # Read the raw CSV; the path is relative to the current working directory.
    data_from_file = get_data_from_file('Dane/messy_data.csv')
    # Run the cleaning pipeline (data_clean) on the raw frame.
    cleaned_data = data_clean(data_from_file)
    # NOTE(review): presumably builds the layout on `app` and starts the
    # server - confirm in ProjektZaliczeniowy.Dashboard.
    Dashboard.dashboard_creation(cleaned_data, app)