In [None]:
import typing as t
import math

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Relative paths to the input CSV files; they resolve against the
# directory the notebook kernel is started from.
TRAIN_PATH = "../../data/train.csv"
TEST_PATH = "../../data/test.csv"

# Sample submission file shipped alongside the train/test data.
SUBMISSION_PATH = "../../data/sample_submission.csv"

In [None]:
# Load the train and test tables from the configured CSV paths.
train_df = pd.read_csv(TRAIN_PATH)
test_df = pd.read_csv(TEST_PATH)

# Sample submission table read from SUBMISSION_PATH.
subm = pd.read_csv(SUBMISSION_PATH)

In [None]:
print(train_df.columns.tolist())

# Whole data

In [None]:
train_df.info()

In [None]:
subm

## Utils

In [None]:
def plot_hists(
    df: pd.DataFrame,
    f_names: t.List[str],
    figsize: t.Tuple[float, float] = (14, 14),
) -> None:
    """Draw one histogram per feature on a two-column grid of subplots.

    Args:
        df: Frame holding the columns to plot.
        f_names: Column names of ``df`` to plot, one histogram each. Panels
            are filled row by row (left panel first, then right).
        figsize: Figure size in inches, forwarded to ``plt.subplots``.

    Returns:
        None. The figure is rendered with ``plt.show()``. Nothing is drawn
        when ``f_names`` is empty.
    """
    # Nothing to draw; plt.subplots() rejects a grid with zero rows.
    if not f_names:
        return

    num_cols = 2
    num_rows = math.ceil(len(f_names) / num_cols)

    # squeeze=False keeps `axs` two-dimensional even for a single row, so the
    # [row, col] indexing below also works for one or two features.
    _, axs = plt.subplots(num_rows, num_cols, figsize=figsize, squeeze=False)

    for i, f_name in enumerate(f_names):
        row, col = divmod(i, num_cols)
        sns.histplot(data=df, x=f_name, kde=False, color="gold", ax=axs[row, col])

    # Hide the trailing panel left empty when the number of features is odd.
    for unused_ax in axs.ravel()[len(f_names):]:
        unused_ax.set_visible(False)

    plt.show()

# Features

## Global Distances [ok]

road_distance_1, road_distance_2, bulvar_ring_km, kremlin_distance, mkad_distance, sadovoe_km

In [None]:
f_names = [
    "road_distance_1", 
    "road_distance_2", 
    "bulvar_ring_km", 
    "kremlin_distance", 
    "mkad_distance", 
    "sadovoe_km",
]

In [None]:
_df = train_df[f_names]
_df_test = test_df[f_names]
_df.describe()

In [None]:
_df_test.describe()

In [None]:
plot_hists(_df, f_names)

In [None]:
plot_hists(_df_test, f_names)

## Local distances [ok]

bus_station_distance, fitness_center_distance, green_zone_distance, park_distance, public_transport_station_distance, base_school_distance, railway_station_distance

In [None]:
f_names = [
    "bus_station_distance", 
    "fitness_center_distance", 
    "green_zone_distance", 
    "park_distance", 
    "public_transport_station_distance", 
    "base_school_distance", 
    "railway_station_distance",
]

In [None]:
_df = train_df[f_names]
_df_test = test_df[f_names]
_df.describe()

In [None]:
plot_hists(_df, f_names)

In [None]:
plot_hists(_df_test, f_names)

In [None]:
_df[_df["public_transport_station_distance"] > 100]

In [None]:
_df_test[_df_test["public_transport_station_distance"] > 100]

## Local info [ok]

cafe_count, green_part, healthcare_centers_count, leisure_count, office_count, sport_count, malls_count, metro_minutes

In [None]:
f_names = [
    "cafe_count", 
    "green_part", 
    "healthcare_centers_count",
    "leisure_count", 
    "office_count", 
    "sport_count", 
    "malls_count",
    "metro_minutes",
]

In [None]:
_df = train_df[f_names]
_df_test = test_df[f_names]
_df.describe()

In [None]:
_df_test.describe()

In [None]:
plot_hists(_df, f_names)

In [None]:
plot_hists(_df_test, f_names)

In [None]:
_df[_df["metro_minutes"] > 400]

In [None]:
# Show every column of the row at position 6152 of train_df.
# NOTE(review): hard-coded position — presumably the row returned by the
# metro_minutes > 400 filter above; confirm if the data changes.
train_df.iloc[6152]

## Flat info [remove outliers]

floor, total_area, kitchen_area, living_area, rooms_num, wall_material

In [None]:
f_names = [
    "floor", 
    "total_area", 
    "kitchen_area", 
    "living_area",
    "rooms_num",
    "wall_material",
]

In [None]:
len(train_df), len(test_df)

In [None]:
_df = train_df[f_names]
_df_test = test_df[f_names]
_df.describe()

In [None]:
_df_test.describe()

In [None]:
plot_hists(_df, f_names)

In [None]:
plot_hists(_df_test, f_names)

### kitchen_area

In [None]:
_df[_df["kitchen_area"] > 100]

In [None]:
# Show every column of the row at position 10368 of train_df.
# NOTE(review): hard-coded position — presumably a row returned by the
# kitchen_area > 100 filter above; confirm if the data changes.
train_df.iloc[10368]

In [None]:
_df_test[_df_test["kitchen_area"] > 100]

In [None]:
_df[_df["kitchen_area"] >= _df["total_area"]]

In [None]:
_df_test[_df_test["kitchen_area"] >= _df_test["total_area"]]

In [None]:
# TODO: build a linear regression from (total_area, rooms_num) -> kitchen_area.
# NOTE(review): presumably meant to re-estimate the suspicious kitchen_area
# values inspected above — confirm before implementing.
from sklearn.linear_model import LinearRegression
# Unfinished sketch (not implemented yet):
# clean_df = ...  # rows to fit on; selection still to be defined
# lin_reg = LinearRegression()

set outlier `kitchen_area` values to None

### living_area

In [None]:
_df[_df["living_area"] > 400]

In [None]:
_df_test[_df_test["living_area"] >= _df_test["total_area"]]

### total_area

In [None]:
_df[_df["total_area"] < 10]

In [None]:
# Show every column of the row at position 2009 of train_df.
# NOTE(review): hard-coded position — presumably a row returned by the
# total_area < 10 filter above; confirm if the data changes.
train_df.iloc[2009]

In [None]:
_df[_df["total_area"] > 300]

### floor

In [None]:
_df[_df["floor"] == 0]

set `floor` to None where it is 0

### rooms_num

In [None]:
_df[_df["rooms_num"] == 0]

In [None]:
_df[_df["rooms_num"] > 5]

set outlier `rooms_num` values to None

## Other [remove outliers]

product_type, state, district_name, district_population

In [None]:
f_names = [
    "product_type", 
    "state",
    "district_name",
    "district_population",
]

In [None]:
_df = train_df[f_names]
_df_test = test_df[f_names]
_df.describe()

In [None]:
_df_test.describe()

In [None]:
# NOTE(review): "state" is listed twice, so its histogram is drawn twice
# (three panels in total). Possibly padding to force a two-row grid —
# confirm and drop the duplicate if it is unintended.
plot_hists(_df, ["state", "district_population", "state"])

In [None]:
# NOTE(review): "state" is listed twice, so its histogram is drawn twice
# (three panels in total). Possibly padding to force a two-row grid —
# confirm and drop the duplicate if it is unintended.
plot_hists(_df_test, ["state", "district_population", "state"])

### state

In [None]:
_df[_df["state"] > 4]

In [None]:
_df[_df["district_name"] == "Cheremushki"]["state"].value_counts()

set `state` to None

In [None]:
train_df.district_name

### product_type

In [None]:
# Frequency of each product_type in the train frame, paired with the
# number of missing entries in that column.
(
    _df["product_type"].value_counts(),
    _df["product_type"].isna().sum(),
)

In [None]:
# Frequency of each product_type in the test frame, paired with the
# number of missing entries in that column.
(
    _df_test["product_type"].value_counts(),
    _df_test["product_type"].isna().sum(),
)

### district_name

In [None]:
_df["district_name"].value_counts()

In [None]:
_df_test["district_name"].value_counts()

One-hot encoding? (TODO: decide how to encode `district_name`.)

## Time [remove outliers]

year_of_construction

In [None]:
f_names = ["year_of_construction"]

In [None]:
_df = train_df[f_names]
_df_test = test_df[f_names]
_df.describe()

In [None]:
_df_test.describe()

In [None]:
_df[(_df["year_of_construction"] < 1900) | (_df["year_of_construction"] > 2020)]

In [None]:
_df[_df["year_of_construction"] < 1000]

In [None]:
_df_test[(_df_test["year_of_construction"] < 1900) | (_df_test["year_of_construction"] > 2020)]

In [None]:
_df_test[_df_test["year_of_construction"] < 1900]