In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
np.set_printoptions(suppress=True)
import matplotlib.pyplot as plt
import seaborn as sns
import datetime

from sklearn.preprocessing import LabelEncoder, StandardScaler

import lightgbm as lgb
from sklearn.metrics import mean_squared_error
pd.set_option('display.max_columns', None)
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory
import zipfile
import os
# Walk the read-only input directory and print the full path of every file found.
input_file_paths = (
    os.path.join(dirname, filename)
    for dirname, _, filenames in os.walk('/kaggle/input')
    for filename in filenames
)
for input_file_path in input_file_paths:
    print(input_file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:

# Unpack each competition archive into its own extraction directory.
for archive_path, extract_dir in (
    ("/kaggle/input/sberbank-russian-housing-market/train.csv.zip", "/train"),
    ("/kaggle/input/sberbank-russian-housing-market/test.csv.zip", "/test"),
):
    with zipfile.ZipFile(archive_path, 'r') as zip_ref:
        zip_ref.extractall(extract_dir)

In [None]:
# Read the extracted CSV files. df_train / df_test keep referring to the frames
# as loaded; train / test are the names the later cells rebind while transforming.
df_train, df_test = (
    pd.read_csv(csv_path)
    for csv_path in ('/train/train.csv', '/test/test.csv')
)
train, test = df_train, df_test

# Seed NumPy's global random generator.
np.random.seed(0)

# EDA

In [None]:
# Missing Values
# Fraction of missing entries per column, computed once for the whole frame.
na_fraction = train.isnull().mean()

# A column counts as "with NA" as soon as it has at least one missing entry
# (the previous `> 1` test silently skipped columns with exactly one).
features_with_na = [feature for feature in train.columns if na_fraction[feature] > 0]

for feature in features_with_na:
    print("The amount of NA in", feature, np.round(na_fraction[feature], 3))

In [None]:

# The relationship between the missing values and the target variable
for feature in features_with_na:
    # Indicator that is 1 where the observation is missing and 0 otherwise.
    # Only this indicator and the target are needed, so the whole frame is no
    # longer copied on every iteration.
    is_missing = train[feature].isnull().astype(int)

    # Mean price_doc for the rows where the information is present (0) or missing (1)
    train['price_doc'].groupby(is_missing).mean().plot.bar()
    # Show plain numbers on the y axis instead of scientific notation
    plt.ticklabel_format(style='plain', axis='y')
    plt.title(feature)
    plt.show()

In [None]:
# Numerical variables: every column whose dtype is not object
numerical_features = [
    column for column, column_dtype in train.dtypes.items() if column_dtype != 'O'
]

# How many numerical features were found
print('Number of num features:', len(numerical_features)) # shape of data (30471, 292)

In [None]:
# List of categorical variables (columns stored with object dtype)
categorical_features = [feature for feature in train.columns if train[feature].dtype == 'O']

# Number of categorical features (the label previously said "num features",
# copied from the numerical cell)
print('Number of categorical features:', len(categorical_features)) # shape of data (30471, 292)

In [None]:
# Number of distinct values in each categorical variable
# (a missing value is counted as one more distinct value)
category_counts = train[categorical_features].nunique(dropna=False)
for feature, n_categories in category_counts.items():
    print('The feature is {} and number of categories are {}'.format(feature, n_categories))


In [None]:
# Median price_doc for every level of each categorical variable
for feature in categorical_features:
    # groupby does not modify train, so no per-iteration copy of the frame is needed
    train.groupby(feature)['price_doc'].median().plot.bar()
    plt.xlabel(feature)
    plt.ylabel('price_doc')
    plt.title(feature)
    plt.show()

# Feature Engineering

In [None]:
# Replace missing value with the label "Missing"
def replace_missing_value(train, features_with_na, value):
    """Return a copy of ``train`` whose listed columns have missing entries filled.

    Parameters
    ----------
    train : DataFrame to read from; it is not modified.
    features_with_na : list of column names whose missing entries are replaced.
    value : replacement written wherever one of those columns is missing.

    Returns
    -------
    A new DataFrame; columns outside ``features_with_na`` are carried over as-is.
    """
    filled_columns = train[features_with_na].fillna(value)
    result = train.copy()
    result[features_with_na] = filled_columns
    return result

# Fill the categorical columns of both frames with the literal label 'Missing'
train, test = (
    replace_missing_value(frame, categorical_features, 'Missing')
    for frame in (train, test)
)

# Remaining null count per categorical column, train first and then test
for frame in (train, test):
    print(frame[categorical_features].isnull().sum())

In [None]:
# price_doc is the target, so it is taken out of the list of features to impute.
# Filtering (instead of list.remove) keeps this cell safe to re-run: remove()
# raises ValueError once 'price_doc' is already gone.
numerical_features = [feature for feature in numerical_features if feature != 'price_doc']
# Replace NA in numerical features with median
def replace_missing_median(data, features):
    """Fill missing entries of the given columns with each column's median.

    Parameters
    ----------
    data : DataFrame to impute. It is modified in place and also returned.
    features : iterable of column names to impute.

    Returns
    -------
    The same DataFrame object that was passed in, with the listed columns filled.
    """
    for feature in features:
        # The median is used (rather than the mean) since there are outliers
        median_value = data[feature].median()
        # Assign the filled column back to the frame. Calling
        # fillna(inplace=True) on the intermediate data[feature] is chained
        # assignment and does not update `data` under pandas Copy-on-Write.
        data[feature] = data[feature].fillna(median_value)
    return data
# Impute train first, then test; each frame is filled from its own column medians.
train, test = (
    replace_missing_median(frame, numerical_features)
    for frame in (train, test)
)


In [None]:
# Dealing with timestamp to get the year sold
def from_timestamp(train):
    """Add the parsed sale date and its year / month / day parts to ``train``.

    The 'timestamp' column is parsed with the format %Y-%m-%d (e.g. 2011-08-20).
    Four columns are written onto the frame that was passed in:
    'timestamp_parsed', 'sold_year', 'sold_month' and 'sold_day'.

    Returns the same DataFrame object, modified in place.
    """
    sold_on = pd.to_datetime(train['timestamp'], format='%Y-%m-%d')
    train['timestamp_parsed'] = sold_on

    date_parts = sold_on.dt
    train['sold_year'] = date_parts.year
    train['sold_month'] = date_parts.month
    train['sold_day'] = date_parts.day
    return train

# Add the sale-date columns to train first, then to test
train, test = from_timestamp(train), from_timestamp(test)

In [None]:
# The raw and parsed timestamps are dropped; the sold_year / sold_month /
# sold_day columns derived from them stay in the frames.
date_columns = ['timestamp', 'timestamp_parsed']
train = train.drop(columns=date_columns)
test = test.drop(columns=date_columns)

# Histogram of the sale year in the training data
sns.histplot(train['sold_year'], bins=5, kde=False)


In [None]:
# 'timestamp' is no longer a column to encode. Filtering (instead of
# list.remove) keeps this cell safe to re-run: remove() raises ValueError once
# the name is already gone.
categorical_features = [feature for feature in categorical_features if feature != 'timestamp']


def feature_scaling(train, categorical_features, encoders=None):
    """Label-encode the categorical columns of a frame.

    Despite its name, this function does no scaling: every listed column is
    replaced by the integer codes produced by a ``LabelEncoder``.

    Parameters
    ----------
    train : DataFrame to encode. It is modified in place and also returned.
    categorical_features : iterable of column names to encode.
    encoders : optional dict mapping a column name to an already fitted
        ``LabelEncoder``. A column found in the dict is transformed with that
        shared encoder, so several frames receive identical codes for the same
        category. A column missing from the dict (or every column when
        ``encoders`` is None) is fitted on ``train`` alone, which is the
        original behaviour.

    Returns
    -------
    The same DataFrame object that was passed in.
    """
    for feature in categorical_features:
        lb = None if encoders is None else encoders.get(feature)
        if lb is None:
            lb = LabelEncoder().fit(train[feature])
        train[feature] = lb.transform(train[feature])
    return train


# Fit ONE encoder per column on the values of train and test together.
# Fitting a separate encoder on each frame (as before) can give the same
# category different integer codes in train and in test.
label_encoders = {
    feature: LabelEncoder().fit(
        pd.concat([train[feature], test[feature]], ignore_index=True)
    )
    for feature in categorical_features
}

train = feature_scaling(train, categorical_features, label_encoders)
test = feature_scaling(test, categorical_features, label_encoders)

In [None]:
# Check the correlation between features
# 'id' is left out of the correlation analysis (presumably a row identifier —
# TODO confirm). Filtering instead of list.remove keeps the cell safe to re-run.
numerical_features = [feature for feature in numerical_features if feature != 'id']

corrmat = train[numerical_features].corr()
fig, ax = plt.subplots()
# Draw on the axes created above rather than relying on the implicit current axes
sns.heatmap(corrmat, ax=ax)

In [None]:
def correlation(dataset, threshold):
    """Return the names of columns that are strongly correlated with an earlier column.

    Parameters
    ----------
    dataset : DataFrame whose pairwise correlations (``dataset.corr()``) are examined.
    threshold : a column is flagged when the absolute correlation with at least
        one column positioned before it is strictly greater than this value.

    Returns
    -------
    set of column names. For each correlated pair only the later column is
    flagged, so the earlier one is never added because of that pair.
    """
    corr_matrix = dataset.corr()
    strength = corr_matrix.abs().to_numpy()

    # Boolean mask of the strictly-lower triangle: pairs (i, j) with j < i
    earlier_pairs = np.tril(np.ones(strength.shape, dtype=bool), k=-1)

    # Row i is flagged when any earlier column j exceeds the threshold
    flagged_rows = (earlier_pairs & (strength > threshold)).any(axis=1)
    return set(corr_matrix.columns[flagged_rows])

# Columns whose absolute correlation with an earlier numerical column exceeds 0.8
corr_features = correlation(train[numerical_features], threshold=0.8)
print('Number of correlated features:', len(corr_features))

In [None]:
# Drop the columns flagged as highly correlated from both frames
correlated_columns = sorted(corr_features)
train = train.drop(columns=correlated_columns)
test = test.drop(columns=correlated_columns)

X_train = train.drop(columns=['price_doc'])
y_train = train['price_doc']
# Select the test columns by name so they line up with the training features
# one-to-one and in the same order.
# NOTE(review): 'id' is still among the feature columns here — confirm that is intended.
X_test = test[X_train.columns]

# Feature Scaling
# The scaler learns its mean/variance from the training data only; the test
# data is transformed with those same statistics. Calling fit_transform on the
# test set (as before) standardises it on a different scale than the model saw.
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

# Target Engineering

In [None]:
# Distribution of the target (50 bins, with a KDE curve overlaid)
sns.histplot(y_train, kde=True, bins=50)

In [None]:
# Distribution of the base-10 logarithm of the target, for comparison
log_target = np.log10(y_train)
sns.histplot(log_target, kde=True, bins=50)

In [None]:
# Train on log10(price_doc). The log is taken from the price_doc column of
# train rather than from y_train itself, so re-running this cell cannot apply
# the logarithm twice.
y_train = np.log10(train['price_doc'])

# LGBM Model

On the first attempt I trained LightGBM with the first set of parameters below; after submission it scored an RMSE of 0.48122. I then tried to improve the model with a random grid search, and the second set of parameters scored an RMSE of 0.40039.

In [None]:
# The first set of params
params = {
    "objective": "regression",
    "metric": "rmse",
    # num_leaves used to be listed twice (64, then 150). Python keeps the last
    # value of a duplicated dict key, so 150 was always the effective setting;
    # the overridden entry has been removed.
    "num_leaves": 150,
    "learning_rate": 0.01,
    'max_depth': -1,
    'colsample_bytree': 0.9,
    "bagging_seed": 42,
    "verbosity": 1,
    "seed": 42,
}

# Train for 5000 boosting rounds on the scaled features and the log10 target
lgtrain = lgb.Dataset(X_train, label=y_train)
model = lgb.train(params, lgtrain, 5000)
# NOTE(review): no validation set or early stopping is passed to lgb.train
# here — confirm what model.best_iteration holds in that case.
y_pred = model.predict(X_test, num_iteration=model.best_iteration)

print("LightGBM Training Completed...")
# The Submission File => RMSE = 0.48122

In [None]:
# The second set of params
# NOTE(review): the LightGBM parameter docs list both 'sub_feature' and
# 'colsample_bytree' as aliases of feature_fraction, so only one of the two
# values below takes effect — confirm which one is intended.
params = dict([
    ("objective", "regression"),
    ("metric", "rmse"),
    ('learning_rate', 0.3777518392924809),
    ('sub_feature', 0.5424987750103974),
    ('max_depth', 94),
    ('colsample_bytree', 0.9),
    ('num_leaves', 194),
    ("bagging_seed", 42),
    ('min_data', 31),
    ("verbosity", 1),
    ("seed", 42),
    ('boosting_type', 'dart'),
])

# Rebuild the dataset and retrain; params, model and y_pred from the previous
# run are replaced by this run's values.
lgtrain = lgb.Dataset(data=X_train, label=y_train)
model = lgb.train(params, lgtrain, num_boost_round=5000)
y_pred = model.predict(X_test, num_iteration=model.best_iteration)

print("LightGBM Training Completed...")
# The Submission File => RMSE = 0.40039

In [None]:

# Raise 10 to the predictions to map them back from the log10 scale
transformed_y_pred = np.power(10, y_pred)

# Build the submission table: the ids of the test file as loaded, next to the
# back-transformed predictions
my_submission = pd.DataFrame({
    'id': df_test['id'],
    'price_doc': transformed_y_pred,
})

# Any filename could be used; 'submission.csv' is chosen here
my_submission.to_csv('submission.csv', index=False)
