In [None]:
#downloading required libraries
!pip install matplotlib
!pip install seaborn
!pip install numpy
!pip install pandas
!pip install scikit-learn

IMPORTS

In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

REQUIRED FUNCTIONS

In [4]:

# Integer (label) encoding of text columns (not needed for a CatBoost regressor)
def encode_text_columns(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of ``df`` with its object-dtype columns turned into integer codes.

    Every column whose dtype is ``object``, except one named ``'timestamp'``,
    is passed through ``pd.factorize``: each distinct value is replaced by an
    integer code assigned in order of first appearance. This is label encoding,
    not one-hot encoding: no new columns are created.

    Args:
        df: Input frame; it is not modified.

    Returns:
        A new DataFrame with the same columns, text columns replaced by codes.
    """
    encoded_df = df.copy()
    # Decide up front which columns hold text; encoding one column does not
    # affect the dtype of any other.
    text_columns = [
        name for name in encoded_df.columns
        if encoded_df[name].dtype == object and name != 'timestamp'
    ]
    for name in text_columns:
        codes, _uniques = pd.factorize(encoded_df[name])
        encoded_df[name] = codes
    return encoded_df


def remove_columns_with_missing_data(df: pd.DataFrame, percent: float) -> pd.DataFrame:
    """Return a copy of ``df`` without the columns that have too many missing values.

    Args:
        df: Input frame; it is not modified.
        percent: Threshold in percent (0-100). A column is dropped when its
            share of missing values is strictly greater than this value.

    Returns:
        A new DataFrame holding only the retained columns, in original order.
    """
    # Share of missing values per column, in percent. For a frame without rows
    # this is NaN, which never compares greater than the threshold, so every
    # column is kept in that case.
    missing_percent = df.isnull().mean() * 100
    too_sparse = (missing_percent > percent).to_numpy()
    # Select by a positional boolean mask instead of dropping columns in place
    # one by one while looping over them.
    return df.loc[:, ~too_sparse].copy()


def mean_filling_missing_data(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of ``df`` with missing values imputed column by column.

    Numeric columns are filled with the column mean; object-dtype columns are
    filled with their most frequent value. A column that has no non-missing
    value at all is left unchanged, because there is nothing to impute from.

    Args:
        df: Input frame; it is not modified.

    Returns:
        A new DataFrame with the same columns and the gaps filled.
    """
    filled = df.copy()
    # Assign the result back instead of calling fillna(inplace=True) on
    # `filled[column]`: that form is chained assignment, which acts on a
    # temporary object and leaves the frame untouched under Copy-on-Write.
    for column in filled.select_dtypes(include=['number']).columns:
        filled[column] = filled[column].fillna(filled[column].mean())
    for column in filled.select_dtypes(include=['object']).columns:
        modes = filled[column].mode()
        if modes.empty:
            # Column is entirely missing: no most-common value exists.
            continue
        filled[column] = filled[column].fillna(modes.iloc[0])
    return filled


def remove_corr_data(df: pd.DataFrame, threshold: float) -> pd.DataFrame:
    """Return a copy of ``df`` with redundant, highly correlated columns removed.

    Text columns are integer-encoded with ``encode_text_columns`` and the
    ``'timestamp'`` column (if present) is ignored for the correlation matrix
    only; the returned frame keeps the original, un-encoded values.

    Columns are visited in their original order. A column is dropped when its
    absolute correlation with a column that has already been kept exceeds
    ``threshold``; otherwise it is kept. So exactly one representative of each
    correlated group survives, and it is the one that appears first in ``df``.
    Perfectly correlated columns (|r| == 1) count as redundant too.

    Args:
        df: Input frame; it is not modified.
        threshold: Absolute correlation above which a column is redundant.

    Returns:
        A new DataFrame without the redundant columns.
    """
    encoded = encode_text_columns(df).drop(columns=['timestamp'], errors='ignore')
    correlation_matrix = encoded.corr().abs()

    kept = []
    cols_to_drop = []
    for col in correlation_matrix.columns:
        # Compare only against columns already kept. Collecting every partner
        # of every column would remove both members of each correlated pair.
        # NaN correlations (e.g. constant columns) compare as False.
        if kept and (correlation_matrix.loc[kept, col] > threshold).any():
            cols_to_drop.append(col)
        else:
            kept.append(col)

    return df.drop(columns=cols_to_drop)


def construct_heatmap(df: pd.DataFrame) -> None:
    """Draw a seaborn heatmap of the pairwise correlations between columns of ``df``.

    Text columns are integer-encoded with ``encode_text_columns`` first, and
    the ``'timestamp'`` column, if present, is left out of the matrix.

    Args:
        df: Input frame; it is not modified.
    """
    # errors='ignore' skips only the "no such column" case; a bare `except:`
    # around `del` would hide every other failure as well.
    encoded = encode_text_columns(df).drop(columns=['timestamp'], errors='ignore')
    sns.heatmap(encoded.corr())




def plot_data(df: pd.DataFrame) -> None:
    """Show one distribution plot per numeric column of ``df``.

    Integer columns get a plain pandas histogram; float columns get a seaborn
    histogram with a KDE curve on top. Columns of any other dtype are skipped.
    Each figure is shown immediately with ``plt.show()``.

    Args:
        df: Frame whose columns are plotted; it is not modified.
    """
    for column in df.columns:
        if pd.api.types.is_integer_dtype(df[column]):
            # Histogram for integer values
            df[column].hist()
            plt.title(f'Histogram of {column}')
            plt.xlabel('Value')
            plt.ylabel('Number of objects')
            plt.show()
        elif pd.api.types.is_float_dtype(df[column]):
            # Histogram + KDE for continuous values. histplot's default
            # statistic is a count (the KDE curve is scaled to match), so the
            # y-axis shows the number of objects, not a probability density.
            sns.histplot(df[column], kde=True)
            plt.title(f'Distribution of data in the {column} column')
            plt.xlabel('Value')
            plt.ylabel('Number of objects')
            plt.show()
            
def split_into_price_ranges(df: pd.DataFrame, N: int) -> pd.DataFrame:
    """Return a copy of ``df`` with a ``'price_group'`` quantile-class column added.

    ``df['price_doc']`` is cut into ``N`` quantile bins with ``pd.qcut``; the
    bins are labelled ``0 .. N-1`` from the cheapest to the most expensive.
    The interval covered by every label is printed.

    Args:
        df: Frame with a ``'price_doc'`` column; it is not modified.
        N: Number of quantile groups.

    Returns:
        A new DataFrame equal to ``df`` plus the ordered categorical column
        ``'price_group'``.

    Raises:
        ValueError: If ``N`` is greater than the number of rows, or (raised by
            ``pd.qcut``) if the quantile edges are not unique.
    """
    # Check that N is not greater than the number of objects in the DataFrame
    if N > len(df):
        raise ValueError("N не может быть больше количества объектов в DataFrame")

    # Bin once; the interval categories are ordered, so the position of an
    # interval is its class label.
    intervals = pd.qcut(df['price_doc'], q=N)

    # Work on a copy so the caller's frame does not silently gain a column.
    result = df.copy()
    result['price_group'] = intervals.cat.rename_categories(list(range(N)))

    # Print the interval -> label mapping straight from the categories. An
    # inclusive range lookup would match a value on an interval's left edge,
    # which belongs to the previous group.
    for label, interval in enumerate(intervals.cat.categories):
        print(f"Интервал: {interval}, Метка класса: {label}")

    return result

def visualize_price_distribution(
    price_column: pd.Series,
    bins: int = 60,
    xlim=(0, 30 * 10**6),
    title: str = 'Распределение цен на недвижимость',
    xlabel: str = 'Цена',
) -> None:
    """Plot a histogram of ``price_column`` and show it.

    The defaults reproduce the original fixed layout (60 bins, x-axis limited
    to 0 .. 30,000,000, price title and label). Override them to plot a series
    on a different scale.

    Args:
        price_column: Values to plot.
        bins: Number of histogram bins.
        xlim: ``(left, right)`` x-axis limits, or ``None`` to let matplotlib
            choose them from the data.
        title: Figure title.
        xlabel: Label of the x-axis.
    """
    plt.figure(figsize=(10, 6))
    plt.hist(price_column, bins=bins, color='blue', edgecolor='black')
    plt.title(title)
    if xlim is not None:
        plt.xlim(*xlim)
    plt.xlabel(xlabel)
    plt.ylabel('Количество объектов')
    plt.grid(True)
    plt.show()


READING DATASETS

In [5]:
# Read the CSV files.
# train.csv: the first column of the file is used as the row index (index_col=0).
train_df = pd.read_csv("datasets/train.csv", index_col=0)
# macro.csv: read with the default integer index.
macro_df = pd.read_csv("datasets/macro.csv")

EDA

In [None]:
# Information about the datasets
for dataset in [train_df, macro_df]:
    # DataFrame.info() prints its report itself and returns None, so wrapping
    # it in print() only adds a stray "None" line after each report.
    dataset.info()
    print('-'*100)

In [None]:
# Preview the first rows of the training data
train_df.head()

In [None]:
# Preview the first rows of the macro data
macro_df.head()

MERGING DATASETS  

In [None]:
# No `on=` / `how=` is given, so pandas performs an inner join on every column
# name the two frames share (presumably just 'timestamp' - TODO confirm).
# NOTE(review): a column-on-column merge ignores the input indexes, so the
# index that train_df got from index_col=0 is not carried over into df.
df = pd.merge(train_df, macro_df,)
df.info()

In [None]:
# Preview the first rows of the merged data
df.head()

In [None]:
# Number of missing values in each column
df.isnull().sum()

In [None]:
# Drop every column in which more than 5% of the values are missing
df = remove_columns_with_missing_data(df, 5)
df.shape

In [None]:
# Impute the remaining gaps: column mean for numeric columns,
# most frequent value for object (text) columns
df = mean_filling_missing_data(df)
df.describe()

CONSTRUCT HEATMAP AND CORR MATRIX

In [None]:
# Correlation heatmap before removing correlated columns
construct_heatmap(df)

REMOVE EXTRA DATA

In [None]:
# Remove columns whose absolute correlation with another column exceeds 0.6
df = remove_corr_data(df, 0.6)
# Only the last expression of a cell is displayed, so a bare `df.shape` in the
# middle shows nothing: print it, and preview the frame instead of dumping it.
print(df.shape)
df.head()

In [None]:
# Summary statistics of the numeric columns that remain
df.describe()

In [None]:
# Correlation heatmap after removing correlated columns
construct_heatmap(df)

In [None]:
# Distinct column dtypes present in the frame
set(df.dtypes)

In [1]:
# One distribution plot per integer / float column
plot_data(df)

SAVE DATASET FOR REG_MODEL 

In [188]:
# Write the reduced dataset to CSV; the row index is not written (index=False)
df.to_csv('datasets/out_dataset_2_porog_60.csv', index=False) 

DATA ANALYSIS FOR CAT_DATASET

In [None]:
# Dtype and non-null count of the price_doc column
df['price_doc'].info()

In [None]:
# Histogram of the price_doc values
visualize_price_distribution(df['price_doc'])

In [None]:
# Cut price_doc into 35 quantile groups; the label is stored in 'price_group'
new_df = split_into_price_ranges(df, 35)

In [None]:
# Class label assigned to each row
new_df['price_group']

In [None]:
# Class sizes per price group. The price histogram helper is not suitable here:
# it fixes the x-axis to 0 .. 30,000,000 and titles the plot as prices, which
# squeezes the class labels (0 .. 34) into a single spike at the left edge.
ax = new_df['price_group'].value_counts().sort_index().plot.bar(figsize=(10, 6))
ax.set_title('Number of objects per price group')
ax.set_xlabel('Price group')
ax.set_ylabel('Number of objects')
plt.show()

SAVE DATASET FOR CAT_MODEL

In [26]:
# Write the dataset with the price_group column to CSV; the row index is not written
new_df.to_csv('datasets/out_dataset_3_class_porog_60.csv', index=False) 