In [None]:
%matplotlib inline

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Lift the column-count and column-width limits pandas applies when rendering frames.
for display_option in ('display.max_columns', 'display.max_colwidth'):
    pd.set_option(display_option, None)
#pd.set_option('display.max_rows', None)

from tqdm import tqdm, tqdm_notebook

from glob import glob

import re
import gc

from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.utils import class_weight
from sklearn.metrics import classification_report, roc_auc_score
from sklearn.preprocessing import StandardScaler, LabelEncoder

from sklearn.manifold import TSNE
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA

import matplotlib.pyplot as plt
import seaborn as sns
from IPython.display import display

import plotly.express as px #Plotly Express

from plotly.offline import iplot
#to link plotly to pandas
import cufflinks as cf
# Put cufflinks into offline mode (the `.iplot()` calls below go through cufflinks).
cf.go_offline()
# NOTE(review): this passes offline = False right after go_offline(); confirm which mode is intended.
cf.set_config_file(offline = False, world_readable = True)

# Global plotting defaults: figure size, title size, grid style and colour palette.
plt.rcParams["figure.figsize"] = (12, 8)
plt.rcParams['axes.titlesize'] = 16
# matplotlib 3.6 renamed the bundled seaborn styles to 'seaborn-v0_8-*' and
# later releases removed the old names. Try the new name first and fall back
# to the legacy one so the cell runs on both old and new installations.
try:
    plt.style.use('seaborn-v0_8-whitegrid')
except OSError:
    plt.style.use('seaborn-whitegrid')
sns.set_palette('Set3')

import os
# Sanity check: list the files available in the competition input directory.
print(os.listdir('../input/secondhand-apartment-price-forecasting-2021/'))

import warnings
# Silence all warnings for the rest of the notebook (this also hides deprecation notices).
warnings.simplefilter('ignore')

from time import time, strftime, gmtime
# Wall-clock start; the final cell subtracts it to report the total run time.
start = time()
import datetime
# Timestamp of this run.
print(str(datetime.datetime.now()))

In [None]:
# English column names for the data frames. Train and test share the same
# feature columns; the training list additionally ends with the log-price target.
test_column_names = [
    'ID', 'Type', 'Region', 'City Code', 'Prefecture Name', 'City Name', 'District Name',
    'Nearest Station: Name', 'Nearest station: Distance (minutes)', 'Floor plan', 'Area (㎡)',
    'Land shape', 'Frontage', 'Total floor area (㎡)', 'Building year', 'Building structure', 'Use',
    'Purpose of future use', 'Front road: Direction', 'Front road: Type', 'Front road: Width (m)',
    'City planning', 'Building coverage ratio (%)', 'Floor area ratio (%)',
    'At the time of transaction', 'Refurbishment', 'Transaction circumstances, etc.',
]

train_column_names = test_column_names + ['Transaction price (total amount) _log']

In [None]:
# Directory holding the competition files (training CSVs, test.csv, sample_submission.csv).
base_dir = '../input/secondhand-apartment-price-forecasting-2021/'

In [None]:
# Load the test set and label its columns with the English names, reduced to
# the characters [A-Za-z0-9_] so they are plain identifiers.
test = pd.read_csv(base_dir + 'test.csv')
print(test.shape)
test.columns = [re.sub('[^A-Za-z0-9_]+', '', name) for name in test_column_names]
test.head()

In [None]:
# Submission template; its prediction column is overwritten by the model cells below.
sub = pd.read_csv(f'{base_dir}sample_submission.csv')
sub

__Combine train dataframes into a single train dataframe__

In [None]:
# The training data is spread over several CSV files named with two digits
# ('NN.csv'). Read them in sorted order (glob returns paths in arbitrary
# order) and concatenate once: DataFrame.append inside a loop re-copies the
# growing frame on every iteration and no longer exists in pandas 2.0+.
train_paths = sorted(glob(base_dir + '[0-9][0-9].csv'))
if not train_paths:
    raise FileNotFoundError(f"No training CSV files matching '[0-9][0-9].csv' in {base_dir}")
train = pd.concat(
    [pd.read_csv(path) for path in tqdm_notebook(train_paths)],
    ignore_index = True,
)
train.columns = train_column_names
# Reduce the names to the characters [A-Za-z0-9_], matching the test frame.
train = train.rename(columns = lambda x:re.sub('[^A-Za-z0-9_]+', '', x))
print(train.shape)
train.head()

In [None]:
# Summary statistics of the numeric columns, one feature per row.
train.describe().transpose()

In [None]:
# Column dtypes and non-null counts for both frames. info() prints its report
# and returns None, so call it as two statements instead of building a tuple
# that makes the cell echo a meaningless `(None, None)`.
train.info()
test.info()

In [None]:
# Per-feature count and percentage of missing values in the training set,
# indexed by feature name.
null_counts = train.isna().sum()
missing = pd.DataFrame({
    'total_missing': null_counts,
    'percent': (null_counts / len(train)) * 100,
}).rename_axis('features')

missing['total_missing'].iplot(
    kind = 'bar',
    title = 'Missing Values Plot in Trainset',
    xTitle = 'Features',
    yTitle = 'Count',
)
missing.T

In [None]:
# Per-feature count and percentage of missing values in the test set.
test_missing = test.isna().sum().reset_index()
test_missing.columns = ['features', 'total_missing']
# The percentage must be relative to the test set's own row count
# (dividing by len(train) under-reports it whenever train is larger).
test_missing['percent'] = (test_missing['total_missing'] / len(test)) * 100
test_missing.index = test_missing['features']
del test_missing['features']

test_missing['total_missing'].iplot(kind = 'bar', 
                               title = 'Missing Values Plot in Testset',
                               xTitle = 'Features',
                               yTitle = 'Count')
test_missing.T

In [None]:
# Remove, from both frames, every feature that is missing in more than 95% of
# the training rows.
mostly_missing = missing['percent'] > 95
drop_cols = missing.loc[mostly_missing].index.tolist()
print(f"Number of features to drop: {len(drop_cols)}")
for frame in (train, test):
    frame.drop(columns = drop_cols, inplace = True)
train.shape, test.shape

- We should also drop 'Type', since it has only one value in train

In [None]:
# 'Type' is removed from both frames.
for frame in (train, test):
    frame.drop(columns = 'Type', inplace = True)
train.shape, test.shape

In [None]:
# The missing-value tables are no longer needed; release them.
del missing
del test_missing
gc.collect()

In [None]:
# Distribution of the target column (log transaction price).
# NOTE(review): distplot is deprecated in recent seaborn releases in favour of
# histplot/displot — consider migrating once the seaborn version is confirmed.
sns.distplot(train['Transactionpricetotalamount_log']);

In [None]:
# Horizontal bar chart of how often each floor plan occurs. The labels name
# the floor plan (room layout), which is what the 'Floorplan' column holds —
# not the floor of the apartment.
train['Floorplan'].value_counts().sort_values(ascending = True).iplot(kind = 'bar', 
                                                                  orientation = 'h',
                                                                  xTitle = 'Count',
                                                                  yTitle = 'Floor plan',
                                                                  title = 'Countplot of Apartment Floor Plans'
                                                                 )

In [None]:
# Count of rows per 'Refurbishment' value, with each bar annotated by its height.
ax = sns.countplot(x = 'Refurbishment', data = train)
for bar in ax.patches:
    bar_height = bar.get_height()
    label_position = (bar.get_x() * 1.005, bar_height * 1.005)
    ax.annotate(str(bar_height), label_position)

__Cleaning the Data__

In [None]:
# Columns remaining in the training frame at this point.
train.columns

In [None]:
# Inspect the distinct raw 'Area' values before cleaning them.
pd.unique(train['Area'])

In [None]:
# Strip every non-digit character from 'Area' in both frames, then store the
# result as int32.
for frame in (train, test):
    digits_only = frame['Area'].astype(str).str.replace(r'\D+', '', regex = True)
    frame['Area'] = digits_only.astype('int32')

sns.boxplot(x = 'Area', data = train);

In [None]:
# Scatter of apartment area against the target. The target column is the
# log-transformed transaction price, so the axis label says so.
plt.scatter(data = train, x = 'Area', y = 'Transactionpricetotalamount_log')
plt.title('Area vs Transaction Price')
plt.xlabel('Area')
plt.ylabel('Transaction Price (log)');

- LDK is Living, Dining and Kitchen
- 1 LDK means 1 bedroom plus a living room, a dining room and a kitchen
- S is Storage room

In [None]:
# Inspect the distinct floor-plan labels before deriving features from them.
pd.unique(train['Floorplan'])

In [None]:
# These three labels are mapped to missing instead of being kept as floor-plan
# categories. The same mapping is applied to the test frame so that train and
# test go through identical preprocessing.
non_room_plans = ['オープンフロア', 'メゾネット', 'スタジオ']
train['Floorplan'] = train['Floorplan'].replace(non_room_plans, np.nan)
test['Floorplan'] = test['Floorplan'].replace(non_room_plans, np.nan)

In [None]:
def _leading_room_count(floor_plan):
    """Return the leading digit of a floor-plan label as an int, or NaN when there is none.

    The label is stringified first, so missing values (NaN) and empty strings
    yield NaN. `isdecimal` is used because it accepts exactly the characters
    that `int()` can convert (including full-width digits).
    """
    first_char = str(floor_plan)[:1]
    return int(first_char) if first_char.isdecimal() else np.nan


train['num_bedrooms'] = train['Floorplan'].apply(_leading_room_count)
test['num_bedrooms'] = test['Floorplan'].apply(_leading_room_count)

# Impute both frames with the TRAIN median: the test set must be transformed
# with statistics learned from the training data, not with its own.
# The result is assigned back instead of calling fillna(inplace = True) on a
# column selection, which is chained assignment and is not guaranteed to
# modify the parent frame.
bedrooms_median = train['num_bedrooms'].median()
train['num_bedrooms'] = train['num_bedrooms'].fillna(bedrooms_median)
test['num_bedrooms'] = test['num_bedrooms'].fillna(bedrooms_median)
train['num_bedrooms'].value_counts(dropna = False), test['num_bedrooms'].value_counts(dropna = False)

In [None]:
# One 0/1 indicator per room letter found in the floor-plan label
# (the letters are the full-width forms used in the data).
room_markers = {'living': 'Ｌ', 'dining': 'Ｄ', 'kitchen': 'Ｋ', 'storage': 'Ｓ'}

for frame in (train, test):
    for flag_name, marker in room_markers.items():
        frame[flag_name] = frame['Floorplan'].apply(
            lambda plan, marker = marker: int(marker in str(plan))
        )

train['living'].unique(), train['storage'].unique()

In [None]:
# The raw floor-plan label has been turned into features above; remove it from both frames.
for frame in (train, test):
    frame.drop(columns = 'Floorplan', inplace = True)

In [None]:
# Object-dtype columns are treated as categorical, everything else as numerical.
categorical_features = train.select_dtypes(include = 'object').columns.tolist()
numerical_features = [col for col in train.columns if col not in categorical_features]
(numerical_features, categorical_features,
 len(numerical_features), len(categorical_features))

In [None]:
# Work on a sample of at most 100000 rows for the exploratory statistics below.
# - The size is capped at len(train) because sample() raises when asked for
#   more rows than exist (sampling without replacement).
# - The seed makes the sample, and everything derived from it, reproducible;
#   2021 is the seed used for the train/validation split.
df = train.sample(n = min(100000, len(train)), random_state = 2021)
df.shape

In [None]:
# Correlation of every numerical feature with the target, strongest positive first.
corr = df[numerical_features].corr()
target_corr = corr['Transactionpricetotalamount_log']
print(target_corr.sort_values(ascending = False))

In [None]:
# Table of the training columns that still contain NaNs: column name ('index'),
# NaN count (0) and the column's dtype.
null_counts = train.isna().sum()
missing = pd.DataFrame({
    'index': null_counts.index,
    0: null_counts.values,
    'dtype': [train[col].dtype for col in null_counts.index],
})
missing = missing.loc[missing[0] > 0]
missing

In [None]:
# Impute the columns listed in `missing`. Every fill value is computed once
# from the TRAINING data and applied to both frames, so the test set is
# transformed with training statistics only.
print('Imputing NaNs of object dtype by most occurances')
for c in missing['index'][missing['dtype'] == 'object']:
    # value_counts() sorts by frequency, so the first index entry is the mode.
    most_frequent = train[c].value_counts().index[0]
    train[c] = train[c].fillna(most_frequent)
    test[c] = test[c].fillna(most_frequent)
    
print('Imputing NaNs of float dtype by mean value')
for c in missing['index'][missing['dtype'] == 'float64']:
    # Use the train mean for the test frame too (the test mean was used before,
    # unlike the object columns above).
    train_mean = train[c].mean()
    train[c] = train[c].fillna(train_mean)
    test[c] = test[c].fillna(train_mean)

In [None]:
from scipy.stats import skew

# Skewness of every non-object column (NaNs ignored), largest first, as a bar chart.
num_feats = train.dtypes[train.dtypes != 'object'].index

skewed_feats = (
    train[num_feats]
    .apply(lambda column: skew(column.dropna()))
    .sort_values(ascending = False)
)

fig, ax = plt.subplots(figsize = (30, 20))
skewed_feats.plot(kind = 'bar', rot = 0, ax = ax);

__Standardize Numerical Features and Label Encode Categorical Features__

In [None]:
# The identifier and the target are not features to scale; the identifier is
# also removed from both frames.
for excluded in ('ID', 'Transactionpricetotalamount_log'):
    numerical_features.remove(excluded)
for frame in (train, test):
    frame.drop(columns = ['ID'], inplace = True)

In [None]:
# Fit the scaler on the training features only, then apply it to both frames.
scl = StandardScaler()
scl.fit(train[numerical_features])
train[numerical_features] = scl.transform(train[numerical_features])
test[numerical_features] = scl.transform(test[numerical_features])

In [None]:
# Integer-encode each categorical column. The encoder is re-fitted per column
# on the union of train and test values so both frames share one mapping.
lbl = LabelEncoder()
for col in categorical_features:
    train_labels = train[col].astype(str).tolist()
    test_labels = test[col].astype(str).tolist()
    lbl.fit(train_labels + test_labels)
    train[col] = lbl.transform(train_labels)
    test[col] = lbl.transform(test_labels)
print('Label Encoding Categorical Features done..')

- Check whether the most frequent value of a feature accounts for almost all instances. If it does, drop the feature: its values are nearly the same for every instance and will not help the learning process.

In [None]:
# Share of rows (in percent) above which one value is considered to dominate a feature.
REPEATED_VALUE_THRESHOLD = 99.94
TARGET_COLUMN = 'Transactionpricetotalamount_log'

# Scan the current training frame rather than the earlier `df` sample: that
# sample was taken before imputation and before 'ID' was dropped, so it can
# name columns that no longer exist in train/test (drop() would then raise),
# and being a random sample it could change the outcome between runs.
repeated = []
for c in train.columns:
    # Never drop the target, and only consider columns present in both frames.
    if c == TARGET_COLUMN or c not in test.columns:
        continue
    counts = train[c].value_counts(dropna = False)
    if counts.empty:
        continue
    # value_counts() is sorted by frequency, so the first entry is the most common value.
    top_share = counts.iloc[0] / len(train) * 100
    if top_share > REPEATED_VALUE_THRESHOLD:
        repeated.append(c)

if repeated:
    print(f"These are the features with same repeated value: {repeated}")
    train = train.drop(repeated, axis = 1)
    test = test.drop(repeated, axis = 1)
else:
    print('No repeated values in columns')

In [None]:
# Separate the target from the features: pop() returns the column and removes it from `train`.
target = train.pop('Transactionpricetotalamount_log')

In [None]:
# Hold out 20% of the training rows for validation (fixed seed).
Xtrain, Xvalid, ytrain, yvalid = train_test_split(
    train,
    target,
    test_size = 0.2,
    random_state = 2021,
)
print(*(part.shape for part in (Xtrain, ytrain, Xvalid, yvalid)))

In [None]:
from sklearn.metrics import mean_absolute_error

In [None]:
from lightgbm import LGBMRegressor

# LightGBM regressor: many very small trees with a low learning rate, using
# row bagging and feature sub-sampling with fixed seeds.
lgbm_params = {
    'objective': 'regression',
    'num_leaves': 4,
    'learning_rate': 0.01,
    'n_estimators': 10000,
    'max_bin': 200,
    'bagging_fraction': 0.75,
    'bagging_freq': 5,
    'bagging_seed': 7,
    'feature_fraction': 0.2,
    'feature_fraction_seed': 7,
    'verbose': 1,
}
lgbm = LGBMRegressor(**lgbm_params)

# Fit on the training split and score on the held-out split.
lgbm_model = lgbm.fit(Xtrain, ytrain)
lg_vpreds = lgbm_model.predict(Xvalid)
lgbm_mae = mean_absolute_error(yvalid, lg_vpreds)
print(f"LGBM MAE: {lgbm_mae}")

In [None]:
# Predict the test set with LightGBM and write the submission file.
lg_preds = lgbm_model.predict(test)
sub = sub.assign(**{'取引価格（総額）_log': lg_preds})
sub.to_csv('sub_lgbm.csv', index = False)
sub.head()

In [None]:
from xgboost import XGBRegressor

# XGBoost regressor: shallow trees with a low learning rate and row/column
# sub-sampling, using the squared-error objective.
xgb_params = {
    'learning_rate': 0.01,
    'n_estimators': 10000,
    'max_depth': 3,
    'min_child_weight': 0,
    'gamma': 0,
    'subsample': 0.7,
    'colsample_bytree': 0.7,
    'objective': 'reg:squarederror',
    'nthread': 1,
    'scale_pos_weight': 1,
    'seed': 27,
    'reg_alpha': 0.00006,
}
xgb = XGBRegressor(**xgb_params)

# Fit on the training split and score on the held-out split.
xgb_model = xgb.fit(Xtrain, ytrain)
xg_vpreds = xgb_model.predict(Xvalid)
xgb_mae = mean_absolute_error(yvalid, xg_vpreds)
print(f"XGBOOST MAE: {xgb_mae}")

In [None]:
# Predict the test set with XGBoost and write the submission file.
xg_preds = xgb_model.predict(test)
sub = sub.assign(**{'取引価格（総額）_log': xg_preds})
sub.to_csv('sub_xg.csv', index = False)
sub.head()

In [None]:
# Equal-weight blend of the LightGBM and XGBoost predictions.
two_model_sum = lg_preds + xg_preds
sub['取引価格（総額）_log'] = two_model_sum / 2
sub.to_csv('sub_en.csv', index = False)
sub.head()

In [None]:
from catboost import CatBoostRegressor

# CatBoost regressor trained and evaluated with MAE.
cat_params = {
    'iterations': 10000,
    'learning_rate': 0.01,
    'depth': 6,
    'l2_leaf_reg': 3,
    'loss_function': 'MAE',
    'eval_metric': 'MAE',
    'random_seed': 2021,
}
cat = CatBoostRegressor(**cat_params)

# Fit on the training split and predict the held-out split.
cat_model = cat.fit(Xtrain, ytrain)
cat_vpreds = cat_model.predict(Xvalid)

In [None]:
# Report the CatBoost validation error, then predict the test set and write the submission file.
cat_mae = mean_absolute_error(yvalid, cat_vpreds)
print(f"CatBoost MAE: {cat_mae}")
cat_preds = cat_model.predict(test)
sub = sub.assign(**{'取引価格（総額）_log': cat_preds})
sub.to_csv('sub_cat.csv', index = False)
sub.head()

In [None]:
# Equal-weight blend of the LightGBM, XGBoost and CatBoost predictions.
three_model_sum = lg_preds + xg_preds + cat_preds
sub['取引価格（総額）_log'] = three_model_sum / 3
sub.to_csv('sub_en2.csv', index = False)
sub.head()

In [None]:
# Total wall-clock time since `start`, printed as HH:MM:SS.
finish = time()
elapsed_seconds = finish - start
print(strftime("%H:%M:%S", gmtime(elapsed_seconds)))