In [1]:
# for text processing
!pip -q install hazm
!pip -q install clean-text[gpl]
# eda
!pip install -U -q dataprep

[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
hazm 0.7.0 requires nltk==3.3, but you have nltk 3.7 which is incompatible.[0m


In [2]:
# basic imports
import os
import gc, json
import numpy as np
import pandas as pd
from tqdm import tqdm
import plotly.graph_objects as go
from dataprep.eda import create_report
from IPython.display import display, HTML

# sklearn
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.impute import KNNImputer, SimpleImputer

# for text cleaning
from hazm import Normalizer
from hazm import WordTokenizer
from cleantext import clean
import re

In [3]:
# concent to Google Drive
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [4]:
# unzip dataset
!unzip /content/drive/MyDrive/review-datasets.zip -d /content
data_path = '/content/datasets/'

Archive:  /content/drive/MyDrive/review-datasets.zip
replace /content/datasets/reject_reasons_info.csv? [y]es, [n]o, [A]ll, [N]one, [r]ename: n
replace /content/datasets/future_test.parquet? [y]es, [n]o, [A]ll, [N]one, [r]ename: n
replace /content/datasets/train.parquet? [y]es, [n]o, [A]ll, [N]one, [r]ename: n


In [5]:
# load training data
train_df = pd.read_parquet(f'{data_path}train.parquet', engine='pyarrow')
print('number of examples in trainset: ', len(train_df))
train_df.head()

number of examples in trainset:  102371


Unnamed: 0,interview_post,reject_reason_id,reject_reason_category,review_label
0,"{""category"": ""apartment-sell"", ""description"": ...",0,accept,accept
1,"{""category"": ""apartment-sell"", ""description"": ...",153,wrong_price,reject
2,"{""category"": ""apartment-sell"", ""description"": ...",150,one_on_one,reject
3,"{""category"": ""apartment-sell"", ""description"": ...",150,one_on_one,reject
4,"{""category"": ""apartment-sell"", ""description"": ...",0,accept,accept


In [6]:
# load test data: two compare avaliable features
# - we only see train_df but apply all transformation on both train_df nad test_df
test_df = pd.read_parquet(f'{data_path}future_test.parquet', engine='pyarrow')
print('number of examples in trainset: ', len(test_df))
test_df.head()

number of examples in trainset:  48877


Unnamed: 0,interview_post,reject_reason_id,reject_reason_category,review_label
0,"{""category"": ""apartment-sell"", ""description"": ...",0,accept,accept
1,"{""category"": ""apartment-sell"", ""description"": ...",0,accept,accept
2,"{""category"": ""apartment-sell"", ""description"": ...",0,accept,accept
3,"{""category"": ""apartment-sell"", ""description"": ...",0,accept,accept
4,"{""category"": ""apartment-sell"", ""description"": ...",0,accept,accept


In [7]:
# healper funcion
def plot_distro(column, title='calsses'):
    fig = go.Figure()
    freq = column.value_counts()
    fig.add_trace(go.Bar(y=freq, x=freq.index.to_numpy().astype('str')))
    fig.update_layout(title_text=f'Distribution of {title}',
                      xaxis_title_text=title,
                      yaxis_title_text='Frequency',
                      bargap=0.2,
                      bargroupgap=0.2)
    return fig

# Plot Distribution of target variable
plot_distro(train_df.review_label).show()

In [8]:
plot_distro(train_df.reject_reason_category, title='reject reasons').show()

In [9]:
# take a look at reject reason descriptions
reason_df = pd.read_csv(f'{data_path}reject_reasons_info.csv', index_col=0)
print('# unique category: ', len(train_df.reject_reason_category.unique()))
print('# unique id: ', len(train_df.reject_reason_id.unique()))
# observation: 150 and 4664631 are the same (one_on_one)
# - after chechking some examples of both 
reason_df

# unique category:  7
# unique id:  8


Unnamed: 0,reject_reason_id,reject_reason_category,reject_reason_desc
0,0,accept,
1,146,wrong_category,دسته‌بندی اشتباه است
2,150,one_on_one,لطفا در هر آگهی مشخصات یک ملک را وارد کنید. هم...
3,151,wrong_city,شهر انتخاب شده اشتباه است؛ لطفا با جست‌وجو در ...
4,153,wrong_price,قیمت وارد شده اشتباه است. فیلد قیمت باید با قی...
5,161,critics_and_suggestions,در آگهی شما ملک یا خدماتی مشخص نشده است؛ لطفاً...
6,4664631,one_on_one,عنوان و توضیحات آگهی باید به طور مشخص به یک مل...
7,10521828,buy_add,امکان انتشار آگهی درخواست ملک وجود ندارد. با ا...


In [10]:
# parse the interview_post column to a pd.DataFrame 
train_feature_df = pd.json_normalize(train_df.interview_post.apply(lambda x: json.loads(x)))
test_feature_df = pd.json_normalize(test_df.interview_post.apply(lambda x: json.loads(x)))
print('# number of features: ', len(train_feature_df.columns))
# add parsed features to dataframes and remove interview_post column (unparsed features)
train_df = pd.concat([train_feature_df, train_df.drop(['interview_post'], axis=1)], axis=1)
test_df = pd.concat([test_feature_df, test_df.drop(['interview_post'], axis=1)], axis=1)

# number of features:  31


In [11]:
# check if any of the columns has the same value across all examples
redundant_columns = []
for col in train_df.columns:
    col_varibility = train_df[col].unique()
    if len(col_varibility) == 1: 
        print(f'{col} column only has {col_varibility}')
        redundant_columns.append(col)

# delete those columns
train_df = train_df.drop(redundant_columns, axis=1)
test_df = test_df.drop(redundant_columns, axis=1)

category column only has ['apartment-sell']


In [12]:
# we remove features that aren't available at test time
no_test_time = set(train_df.columns) - set(test_df.columns)
print("These features aren't available at inference:", no_test_time)

train_df = train_df.drop(no_test_time, axis=1)

These features aren't available at inference: {'exchange', 'post_type'}


In [13]:
# let's take a look at new columns
# obserevation: description and title are free-form text
train_df.head()

Unnamed: 0,description,elevator,floor,parking,rooms,size,title,user_type,warehouse,year,...,price.mode,new_price,price.value,other_options_and_attributes.toilet,zoonkan_enabled,national_id,desc,reject_reason_id,reject_reason_category,review_label
0,سلام\nواحد تک خواب تمیز\nسند تک برگ آماده وام ...,True,0,True,یک,60,آپارتمان نقلی، سند ششدانگ,شخصی,True,۱۳۸۸,...,توافقی,,,,,,,0,accept,accept
1,⭐⭐⭐بسم الله الرحمن الرحیم ⭐⭐⭐\n\n⚜املاک فردیس ...,True,2,True,دو,69,۶۹متر واحد قریشی شمالی طبقه دوم,مشاور املاک,True,۱۳۹۱,...,مقطوع,300000000.0,300000000.0,,,,,153,wrong_price,reject
2,با سلام \nتعدادی واحد در متراژ های ۹۰ و ۱۲۰ مت...,True,5,True,دو,120,آپارتمان ۱۲۰ متری مسکن خیابان مطهری,شخصی,True,۱۳۹۹,...,مقطوع,1000000000.0,1000000000.0,,,,,150,one_on_one,reject
3,پیش فروش ۲ واحد آپارتمان تک واحدی \n\n۳ خواب \...,True,2,True,سه,135,۱۳۵ متر پیش فروش فول امکانات تک واحدی,مشاور املاک,True,۱۳۹۹,...,مقطوع,2497500000.0,2497500000.0,squat_seat,,,,150,one_on_one,reject
4,فروش فوری آپارتمان در گلسار بلوار سمیه \n\n⭕️...,True,5,True,دو,92,فروش آپارتمان گلسار بلوار سمیه ٩٢ متری,مشاور املاک,True,۱۳۹۲,...,مقطوع,1900000000.0,1900000000.0,,,,,0,accept,accept


Automatic EDA: The following creates an automatic EDA report 
- w/o considering 'description' and 'title' columns
- the conclusions in the next cells are based on observation of this report

In [45]:
display(HTML('https://raw.githubusercontent.com/sajjjadayobi/utils/main/src/divar_report.html'))

0,1
Number of Variables,29
Number of Rows,102371
Missing Cells,1.1742e+06
Missing Cells (%),39.6%
Duplicate Rows,12736
Duplicate Rows (%),12.4%
Total Size in Memory,133.1 MB
Average Row Size in Memory,1.3 KB
Variable Types,Categorical: 25  Numerical: 4

0,1
new_price and price.value have similar distributions,Similar Distribution
elevator has 3948 (3.86%) missing values,Missing
floor has 3948 (3.86%) missing values,Missing
parking has 3948 (3.86%) missing values,Missing
warehouse has 3948 (3.86%) missing values,Missing
other_options_and_attributes.balcony has 65882 (64.36%) missing values,Missing
other_options_and_attributes.building_direction has 76843 (75.06%) missing values,Missing
other_options_and_attributes.cooling_system has 76204 (74.44%) missing values,Missing
other_options_and_attributes.deed_type has 73595 (71.89%) missing values,Missing
other_options_and_attributes.floor_type has 67639 (66.07%) missing values,Missing

0,1
other_options_and_attributes.floors_count has 67668 (66.1%) missing values,Missing
other_options_and_attributes.heating_system has 73452 (71.75%) missing values,Missing
other_options_and_attributes.rebuilt has 92078 (89.95%) missing values,Missing
other_options_and_attributes.unit_per_floor has 68101 (66.52%) missing values,Missing
other_options_and_attributes.warm_water_provider has 72452 (70.77%) missing values,Missing
new_price has 30475 (29.77%) missing values,Missing
price.value has 30475 (29.77%) missing values,Missing
other_options_and_attributes.toilet has 68940 (67.34%) missing values,Missing
zoonkan_enabled has 96237 (94.01%) missing values,Missing
national_id has 95993 (93.77%) missing values,Missing

0,1
desc has 102362 (99.99%) missing values,Missing
size is skewed,Skewed
location.city is skewed,Skewed
new_price is skewed,Skewed
price.value is skewed,Skewed
Dataset has 12736 (12.44%) duplicate rows,Duplicates
national_id has a high cardinality: 4784 distinct values,High Cardinality
national_id has constant length 10,Constant Length

0,1
Approximate Distinct Count,2
Approximate Unique (%),0.0%
Missing,3948
Missing (%),3.9%
Memory Size,6.5 MB

0,1
Mean,4.2633
Standard Deviation,0.4404
Median,4.0
Minimum,4.0
Maximum,5.0

0,1
1st row,True
2nd row,True
3rd row,True
4th row,True
5th row,True

0,1
Count,419602
Lowercase Letter,419602
Space Separator,0
Uppercase Letter,0
Dash Punctuation,0
Decimal Number,0

0,1
Approximate Distinct Count,33
Approximate Unique (%),0.0%
Missing,3948
Missing (%),3.9%
Memory Size,6.2 MB

0,1
Mean,1.0377
Standard Deviation,0.2028
Median,1.0
Minimum,1.0
Maximum,3.0

0,1
1st row,0
2nd row,2
3rd row,5
4th row,2
5th row,5

0,1
Count,0
Lowercase Letter,0
Space Separator,0
Uppercase Letter,0
Dash Punctuation,307
Decimal Number,101582

0,1
Approximate Distinct Count,2
Approximate Unique (%),0.0%
Missing,3948
Missing (%),3.9%
Memory Size,6.5 MB

0,1
Mean,4.1565
Standard Deviation,0.3634
Median,4.0
Minimum,4.0
Maximum,5.0

0,1
1st row,True
2nd row,True
3rd row,True
4th row,True
5th row,True

0,1
Count,409099
Lowercase Letter,409099
Space Separator,0
Uppercase Letter,0
Dash Punctuation,0
Decimal Number,0

0,1
Approximate Distinct Count,6
Approximate Unique (%),0.0%
Missing,0
Missing (%),0.0%
Memory Size,9.7 MB

0,1
Mean,2.2253
Standard Deviation,1.2946
Median,2.0
Minimum,2.0
Maximum,12.0

0,1
1st row,یک
2nd row,دو
3rd row,دو
4th row,سه
5th row,دو

0,1
Count,0
Lowercase Letter,0
Space Separator,3385
Uppercase Letter,0
Dash Punctuation,0
Decimal Number,0

0,1
Approximate Distinct Count,761
Approximate Unique (%),0.7%
Missing,0
Missing (%),0.0%
Infinite,0
Infinite (%),0.0%
Memory Size,1.6 MB
Mean,850.4538
Minimum,1
Maximum,10000000

0,1
Minimum,1.0
5-th Percentile,50.0
Q1,75.0
Median,100.0
Q3,130.0
95-th Percentile,230.75
Maximum,10000000.0
Range,9999999.0
IQR,55.0

0,1
Mean,850.4538
Standard Deviation,58860.5674
Variance,3464600000.0
Sum,87062000.0
Skewness,138.5702
Kurtosis,21202.4428
Coefficient of Variation,69.2108

0,1
Approximate Distinct Count,2
Approximate Unique (%),0.0%
Missing,16
Missing (%),0.0%
Memory Size,11.8 MB

0,1
Mean,7.5692
Standard Deviation,3.4993
Median,11.0
Minimum,4.0
Maximum,11.0

0,1
1st row,شخصی
2nd row,مشاور املاک
3rd row,شخصی
4th row,مشاور املاک
5th row,مشاور املاک

0,1
Count,0
Lowercase Letter,0
Space Separator,52189
Uppercase Letter,0
Dash Punctuation,0
Decimal Number,0

0,1
Approximate Distinct Count,2
Approximate Unique (%),0.0%
Missing,3948
Missing (%),3.9%
Memory Size,6.5 MB

0,1
Mean,4.1336
Standard Deviation,0.3402
Median,4.0
Minimum,4.0
Maximum,5.0

0,1
1st row,True
2nd row,True
3rd row,True
4th row,True
5th row,True

0,1
Count,406841
Lowercase Letter,406841
Space Separator,0
Uppercase Letter,0
Dash Punctuation,0
Decimal Number,0

0,1
Approximate Distinct Count,30
Approximate Unique (%),0.0%
Missing,4
Missing (%),0.0%
Memory Size,10.5 MB

0,1
Mean,4.0492
Standard Deviation,0.585
Median,4.0
Minimum,4.0
Maximum,11.0

0,1
1st row,۱۳۸۸
2nd row,۱۳۹۱
3rd row,۱۳۹۹
4th row,۱۳۹۹
5th row,۱۳۹۲

0,1
Count,0
Lowercase Letter,0
Space Separator,1440
Uppercase Letter,0
Dash Punctuation,0
Decimal Number,0

0,1
Approximate Distinct Count,225
Approximate Unique (%),0.2%
Missing,0
Missing (%),0.0%
Infinite,0
Infinite (%),0.0%
Memory Size,1.6 MB
Mean,75.9235
Minimum,1
Maximum,1738

0,1
Minimum,1
5-th Percentile,1
Q1,1
Median,3
Q3,12
95-th Percentile,746
Maximum,1738
Range,1737
IQR,11

0,1
Mean,75.9235
Standard Deviation,234.556
Variance,55016.5214
Sum,7772400.0
Skewness,3.643
Kurtosis,14.5055
Coefficient of Variation,3.0894

0,1
Approximate Distinct Count,2
Approximate Unique (%),0.0%
Missing,65882
Missing (%),64.4%
Memory Size,2.4 MB

0,1
Mean,4.0708
Standard Deviation,0.2564
Median,4.0
Minimum,4.0
Maximum,5.0

0,1
1st row,True
2nd row,True
3rd row,True
4th row,True
5th row,True

0,1
Count,148538
Lowercase Letter,148538
Space Separator,0
Uppercase Letter,0
Dash Punctuation,0
Decimal Number,0

0,1
Approximate Distinct Count,4
Approximate Unique (%),0.0%
Missing,76843
Missing (%),75.1%
Memory Size,1.7 MB

0,1
Mean,4.9354
Standard Deviation,0.2457
Median,5.0
Minimum,4.0
Maximum,5.0

0,1
1st row,south
2nd row,south
3rd row,south
4th row,south
5th row,north

0,1
Count,125992
Lowercase Letter,125992
Space Separator,0
Uppercase Letter,0
Dash Punctuation,0
Decimal Number,0

0,1
Approximate Distinct Count,5
Approximate Unique (%),0.0%
Missing,76204
Missing (%),74.4%
Memory Size,1.9 MB

0,1
Mean,11.2964
Standard Deviation,2.7095
Median,12.0
Minimum,5.0
Maximum,15.0

0,1
1st row,water_cooler
2nd row,water_cooler
3rd row,split
4th row,water_cooler
5th row,air_conditioner

0,1
Count,272504
Lowercase Letter,272504
Space Separator,0
Uppercase Letter,0
Dash Punctuation,0
Decimal Number,0

0,1
Approximate Distinct Count,3
Approximate Unique (%),0.0%
Missing,73595
Missing (%),71.9%
Memory Size,2.3 MB

0,1
Mean,19.2547
Standard Deviation,2.4417
Median,20.0
Minimum,10.0
Maximum,20.0

0,1
1st row,official_single_pa...
2nd row,official_single_pa...
3rd row,official_single_pa...
4th row,official_single_pa...
5th row,official_single_pa...

0,1
Count,500042
Lowercase Letter,500042
Space Separator,0
Uppercase Letter,0
Dash Punctuation,0
Decimal Number,0

0,1
Approximate Distinct Count,7
Approximate Unique (%),0.0%
Missing,67639
Missing (%),66.1%
Memory Size,2.4 MB

0,1
Mean,7.2786
Standard Deviation,2.0099
Median,7.0
Minimum,5.0
Maximum,16.0

0,1
1st row,carpet
2nd row,ceramic
3rd row,ceramic
4th row,ceramic
5th row,ceramic

0,1
Count,250152
Lowercase Letter,250152
Space Separator,0
Uppercase Letter,0
Dash Punctuation,0
Decimal Number,0

0,1
Approximate Distinct Count,30
Approximate Unique (%),0.1%
Missing,67668
Missing (%),66.1%
Memory Size,2.2 MB

0,1
Mean,1.062
Standard Deviation,0.2481
Median,1.0
Minimum,1.0
Maximum,3.0

0,1
1st row,3
2nd row,4
3rd row,4
4th row,3
5th row,4

0,1
Count,0
Lowercase Letter,0
Space Separator,0
Uppercase Letter,0
Dash Punctuation,0
Decimal Number,36793

0,1
Approximate Distinct Count,7
Approximate Unique (%),0.0%
Missing,73452
Missing (%),71.8%
Memory Size,2.0 MB

0,1
Mean,7.1503
Standard Deviation,1.4106
Median,7.0
Minimum,5.0
Maximum,13.0

0,1
1st row,heater
2nd row,shoofaj
3rd row,shoofaj
4th row,shoofaj
5th row,fireplace

0,1
Count,203427
Lowercase Letter,203427
Space Separator,0
Uppercase Letter,0
Dash Punctuation,0
Decimal Number,0

0,1
Approximate Distinct Count,2
Approximate Unique (%),0.0%
Missing,92078
Missing (%),90.0%
Memory Size,693.6 KB

0,1
Mean,4.0058
Standard Deviation,0.07613
Median,4.0
Minimum,4.0
Maximum,5.0

0,1
1st row,True
2nd row,True
3rd row,True
4th row,True
5th row,True

0,1
Count,41232
Lowercase Letter,30939
Space Separator,0
Uppercase Letter,10293
Dash Punctuation,0
Decimal Number,0

0,1
Approximate Distinct Count,9
Approximate Unique (%),0.0%
Missing,68101
Missing (%),66.5%
Memory Size,2.2 MB

0,1
Mean,1.1144
Standard Deviation,1.0634
Median,1.0
Minimum,1.0
Maximum,11.0

0,1
1st row,3
2nd row,1
3rd row,4
4th row,2
5th row,2

0,1
Count,3136
Lowercase Letter,3136
Space Separator,0
Uppercase Letter,0
Dash Punctuation,0
Decimal Number,34270

0,1
Approximate Distinct Count,3
Approximate Unique (%),0.0%
Missing,72452
Missing (%),70.8%
Memory Size,2.1 MB

0,1
Mean,8.5823
Standard Deviation,2.2273
Median,7.0
Minimum,7.0
Maximum,12.0

0,1
1st row,water_heater
2nd row,package
3rd row,package
4th row,package
5th row,package

0,1
Count,248643
Lowercase Letter,248643
Space Separator,0
Uppercase Letter,0
Dash Punctuation,0
Decimal Number,0

0,1
Approximate Distinct Count,2
Approximate Unique (%),0.0%
Missing,0
Missing (%),0.0%
Memory Size,11.0 MB

0,1
Mean,5.2977
Standard Deviation,0.4572
Median,5.0
Minimum,5.0
Maximum,6.0

0,1
1st row,توافقی
2nd row,مقطوع
3rd row,مقطوع
4th row,مقطوع
5th row,مقطوع

0,1
Count,0
Lowercase Letter,0
Space Separator,0
Uppercase Letter,0
Dash Punctuation,0
Decimal Number,0

0,1
Approximate Distinct Count,4139
Approximate Unique (%),5.8%
Missing,30475
Missing (%),29.8%
Infinite,0
Infinite (%),0.0%
Memory Size,1.1 MB
Mean,3.0007e+09
Minimum,200000
Maximum,5e+11

0,1
Minimum,200000.0
5-th Percentile,250000000.0
Q1,600000000.0
Median,1122000000.0
Q3,2290000000.0
95-th Percentile,8000000000.0
Maximum,500000000000.0
Range,500000000000.0
IQR,1690000000.0

0,1
Mean,3000700000.0
Standard Deviation,14545000000.0
Variance,2.1154e+20
Sum,215740000000000.0
Skewness,22.604
Kurtosis,646.8588
Coefficient of Variation,4.847

0,1
Approximate Distinct Count,4139
Approximate Unique (%),5.8%
Missing,30475
Missing (%),29.8%
Infinite,0
Infinite (%),0.0%
Memory Size,1.1 MB
Mean,3.0007e+09
Minimum,200000
Maximum,5e+11

0,1
Minimum,200000.0
5-th Percentile,250000000.0
Q1,600000000.0
Median,1122000000.0
Q3,2290000000.0
95-th Percentile,8000000000.0
Maximum,500000000000.0
Range,500000000000.0
IQR,1690000000.0

0,1
Mean,3000700000.0
Standard Deviation,14545000000.0
Variance,2.1154e+20
Sum,215740000000000.0
Skewness,22.604
Kurtosis,646.8588
Coefficient of Variation,4.847

0,1
Approximate Distinct Count,3
Approximate Unique (%),0.0%
Missing,68940
Missing (%),67.3%
Memory Size,2.3 MB

0,1
Mean,7.8929
Standard Deviation,2.4949
Median,10.0
Minimum,4.0
Maximum,10.0

0,1
1st row,squat_seat
2nd row,squat_seat
3rd row,squat
4th row,squat_seat
5th row,squat_seat

0,1
Count,244381
Lowercase Letter,244381
Space Separator,0
Uppercase Letter,0
Dash Punctuation,0
Decimal Number,0

0,1
Approximate Distinct Count,2
Approximate Unique (%),0.0%
Missing,96237
Missing (%),94.0%
Memory Size,417.2 KB

0,1
Mean,4.6506
Standard Deviation,0.4768
Median,5.0
Minimum,4.0
Maximum,5.0

0,1
1st row,True
2nd row,False
3rd row,False
4th row,True
5th row,False

0,1
Count,28527
Lowercase Letter,22393
Space Separator,0
Uppercase Letter,6134
Dash Punctuation,0
Decimal Number,0

0,1
Approximate Distinct Count,4784
Approximate Unique (%),75.0%
Missing,95993
Missing (%),93.8%
Memory Size,595.5 KB

0,1
Mean,10
Standard Deviation,0
Median,10
Minimum,10
Maximum,10

0,1
1st row,0374567835
2nd row,۰۰۶۴۴۱۶۸۱۱
3rd row,2909811182
4th row,0013232312
5th row,2500118649

0,1
Count,0
Lowercase Letter,0
Space Separator,0
Uppercase Letter,0
Dash Punctuation,0
Decimal Number,40304

0,1
Approximate Distinct Count,8
Approximate Unique (%),88.9%
Missing,102362
Missing (%),100.0%
Memory Size,21.7 KB

0,1
Mean,636.8889
Standard Deviation,270.3518
Median,837.0
Minimum,222.0
Maximum,930.0

0,1
1st row,به نام خدا آپار...
2nd row,سلام. یک واحدآپارت...
3rd row,بهترین فرصت برای ا...
4th row,سلام. یک واحدآپارت...
5th row,یکواحد آپارتمان دو...

0,1
Count,26
Lowercase Letter,18
Space Separator,1075
Uppercase Letter,8
Dash Punctuation,3
Decimal Number,63

0,1
Approximate Distinct Count,8
Approximate Unique (%),0.0%
Missing,0
Missing (%),0.0%
Memory Size,6.6 MB

0,1
Mean,2.7592
Standard Deviation,2.3872
Median,1.0
Minimum,1.0
Maximum,8.0

0,1
1st row,0
2nd row,153
3rd row,150
4th row,150
5th row,0

0,1
Count,0
Lowercase Letter,0
Space Separator,0
Uppercase Letter,0
Dash Punctuation,0
Decimal Number,282457

0,1
Approximate Distinct Count,7
Approximate Unique (%),0.0%
Missing,0
Missing (%),0.0%
Memory Size,7.1 MB

0,1
Mean,7.9702
Standard Deviation,2.6571
Median,6.0
Minimum,6.0
Maximum,23.0

0,1
1st row,accept
2nd row,wrong_price
3rd row,one_on_one
4th row,one_on_one
5th row,accept

0,1
Count,743985
Lowercase Letter,743985
Space Separator,0
Uppercase Letter,0
Dash Punctuation,0
Decimal Number,0

0,1
Approximate Distinct Count,3
Approximate Unique (%),0.0%
Missing,0
Missing (%),0.0%
Memory Size,7.0 MB

0,1
Mean,6.2289
Standard Deviation,1.045
Median,6.0
Minimum,6.0
Maximum,11.0

0,1
1st row,accept
2nd row,reject
3rd row,reject
4th row,reject
5th row,accept

0,1
Count,632970
Lowercase Letter,632970
Space Separator,0
Uppercase Letter,0
Dash Punctuation,4686
Decimal Number,0


In [15]:
report = create_report(train_df.drop(['description', 'title'], axis=1))
report.save('divar_report.html')
report.show()

ERROR:bokeh.core.validation.check:E-1019 (DUPLICATE_FACTORS): FactorRange must specify a unique list of categorical factors for an axis: duplicate factors found: 'other_options...tem', 'other_options...ype'
ERROR:bokeh.core.validation.check:E-1019 (DUPLICATE_FACTORS): FactorRange must specify a unique list of categorical factors for an axis: duplicate factors found: 'other_options...tem', 'other_options...ype'
ERROR:bokeh.core.validation.check:E-1019 (DUPLICATE_FACTORS): FactorRange must specify a unique list of categorical factors for an axis: duplicate factors found: 'other_options...tem', 'other_options...ype'
The plot will not show in a notebook environment, please try 'show_browser' if you want to open it in browser


Report has been saved to divar_report.html!


0,1
Number of Variables,29
Number of Rows,102371
Missing Cells,1.1742e+06
Missing Cells (%),39.6%
Duplicate Rows,12736
Duplicate Rows (%),12.4%
Total Size in Memory,133.1 MB
Average Row Size in Memory,1.3 KB
Variable Types,Categorical: 25  Numerical: 4

0,1
new_price and price.value have similar distributions,Similar Distribution
elevator has 3948 (3.86%) missing values,Missing
floor has 3948 (3.86%) missing values,Missing
parking has 3948 (3.86%) missing values,Missing
warehouse has 3948 (3.86%) missing values,Missing
other_options_and_attributes.balcony has 65882 (64.36%) missing values,Missing
other_options_and_attributes.building_direction has 76843 (75.06%) missing values,Missing
other_options_and_attributes.cooling_system has 76204 (74.44%) missing values,Missing
other_options_and_attributes.deed_type has 73595 (71.89%) missing values,Missing
other_options_and_attributes.floor_type has 67639 (66.07%) missing values,Missing

0,1
other_options_and_attributes.floors_count has 67668 (66.1%) missing values,Missing
other_options_and_attributes.heating_system has 73452 (71.75%) missing values,Missing
other_options_and_attributes.rebuilt has 92078 (89.95%) missing values,Missing
other_options_and_attributes.unit_per_floor has 68101 (66.52%) missing values,Missing
other_options_and_attributes.warm_water_provider has 72452 (70.77%) missing values,Missing
new_price has 30475 (29.77%) missing values,Missing
price.value has 30475 (29.77%) missing values,Missing
other_options_and_attributes.toilet has 68940 (67.34%) missing values,Missing
zoonkan_enabled has 96237 (94.01%) missing values,Missing
national_id has 95993 (93.77%) missing values,Missing

0,1
desc has 102362 (99.99%) missing values,Missing
size is skewed,Skewed
location.city is skewed,Skewed
new_price is skewed,Skewed
price.value is skewed,Skewed
Dataset has 12736 (12.44%) duplicate rows,Duplicates
national_id has a high cardinality: 4784 distinct values,High Cardinality
national_id has constant length 10,Constant Length

0,1
Approximate Distinct Count,2
Approximate Unique (%),0.0%
Missing,3948
Missing (%),3.9%
Memory Size,6.5 MB

0,1
Mean,4.2633
Standard Deviation,0.4404
Median,4.0
Minimum,4.0
Maximum,5.0

0,1
1st row,True
2nd row,True
3rd row,True
4th row,True
5th row,True

0,1
Count,419602
Lowercase Letter,419602
Space Separator,0
Uppercase Letter,0
Dash Punctuation,0
Decimal Number,0

0,1
Approximate Distinct Count,33
Approximate Unique (%),0.0%
Missing,3948
Missing (%),3.9%
Memory Size,6.2 MB

0,1
Mean,1.0377
Standard Deviation,0.2028
Median,1.0
Minimum,1.0
Maximum,3.0

0,1
1st row,0
2nd row,2
3rd row,5
4th row,2
5th row,5

0,1
Count,0
Lowercase Letter,0
Space Separator,0
Uppercase Letter,0
Dash Punctuation,307
Decimal Number,101582

0,1
Approximate Distinct Count,2
Approximate Unique (%),0.0%
Missing,3948
Missing (%),3.9%
Memory Size,6.5 MB

0,1
Mean,4.1565
Standard Deviation,0.3634
Median,4.0
Minimum,4.0
Maximum,5.0

0,1
1st row,True
2nd row,True
3rd row,True
4th row,True
5th row,True

0,1
Count,409099
Lowercase Letter,409099
Space Separator,0
Uppercase Letter,0
Dash Punctuation,0
Decimal Number,0

0,1
Approximate Distinct Count,6
Approximate Unique (%),0.0%
Missing,0
Missing (%),0.0%
Memory Size,9.7 MB

0,1
Mean,2.2253
Standard Deviation,1.2946
Median,2.0
Minimum,2.0
Maximum,12.0

0,1
1st row,یک
2nd row,دو
3rd row,دو
4th row,سه
5th row,دو

0,1
Count,0
Lowercase Letter,0
Space Separator,3385
Uppercase Letter,0
Dash Punctuation,0
Decimal Number,0

0,1
Approximate Distinct Count,761
Approximate Unique (%),0.7%
Missing,0
Missing (%),0.0%
Infinite,0
Infinite (%),0.0%
Memory Size,1.6 MB
Mean,850.4538
Minimum,1
Maximum,10000000

0,1
Minimum,1.0
5-th Percentile,50.0
Q1,75.0
Median,100.0
Q3,130.0
95-th Percentile,230.75
Maximum,10000000.0
Range,9999999.0
IQR,55.0

0,1
Mean,850.4538
Standard Deviation,58860.5674
Variance,3464600000.0
Sum,87062000.0
Skewness,138.5702
Kurtosis,21202.4428
Coefficient of Variation,69.2108

0,1
Approximate Distinct Count,2
Approximate Unique (%),0.0%
Missing,16
Missing (%),0.0%
Memory Size,11.8 MB

0,1
Mean,7.5692
Standard Deviation,3.4993
Median,11.0
Minimum,4.0
Maximum,11.0

0,1
1st row,شخصی
2nd row,مشاور املاک
3rd row,شخصی
4th row,مشاور املاک
5th row,مشاور املاک

0,1
Count,0
Lowercase Letter,0
Space Separator,52189
Uppercase Letter,0
Dash Punctuation,0
Decimal Number,0

0,1
Approximate Distinct Count,2
Approximate Unique (%),0.0%
Missing,3948
Missing (%),3.9%
Memory Size,6.5 MB

0,1
Mean,4.1336
Standard Deviation,0.3402
Median,4.0
Minimum,4.0
Maximum,5.0

0,1
1st row,True
2nd row,True
3rd row,True
4th row,True
5th row,True

0,1
Count,406841
Lowercase Letter,406841
Space Separator,0
Uppercase Letter,0
Dash Punctuation,0
Decimal Number,0

0,1
Approximate Distinct Count,30
Approximate Unique (%),0.0%
Missing,4
Missing (%),0.0%
Memory Size,10.5 MB

0,1
Mean,4.0492
Standard Deviation,0.585
Median,4.0
Minimum,4.0
Maximum,11.0

0,1
1st row,۱۳۸۸
2nd row,۱۳۹۱
3rd row,۱۳۹۹
4th row,۱۳۹۹
5th row,۱۳۹۲

0,1
Count,0
Lowercase Letter,0
Space Separator,1440
Uppercase Letter,0
Dash Punctuation,0
Decimal Number,0

0,1
Approximate Distinct Count,225
Approximate Unique (%),0.2%
Missing,0
Missing (%),0.0%
Infinite,0
Infinite (%),0.0%
Memory Size,1.6 MB
Mean,75.9235
Minimum,1
Maximum,1738

0,1
Minimum,1
5-th Percentile,1
Q1,1
Median,3
Q3,12
95-th Percentile,746
Maximum,1738
Range,1737
IQR,11

0,1
Mean,75.9235
Standard Deviation,234.556
Variance,55016.5214
Sum,7772400.0
Skewness,3.643
Kurtosis,14.5055
Coefficient of Variation,3.0894

0,1
Approximate Distinct Count,2
Approximate Unique (%),0.0%
Missing,65882
Missing (%),64.4%
Memory Size,2.4 MB

0,1
Mean,4.0708
Standard Deviation,0.2564
Median,4.0
Minimum,4.0
Maximum,5.0

0,1
1st row,True
2nd row,True
3rd row,True
4th row,True
5th row,True

0,1
Count,148538
Lowercase Letter,148538
Space Separator,0
Uppercase Letter,0
Dash Punctuation,0
Decimal Number,0

0,1
Approximate Distinct Count,4
Approximate Unique (%),0.0%
Missing,76843
Missing (%),75.1%
Memory Size,1.7 MB

0,1
Mean,4.9354
Standard Deviation,0.2457
Median,5.0
Minimum,4.0
Maximum,5.0

0,1
1st row,south
2nd row,south
3rd row,south
4th row,south
5th row,north

0,1
Count,125992
Lowercase Letter,125992
Space Separator,0
Uppercase Letter,0
Dash Punctuation,0
Decimal Number,0

0,1
Approximate Distinct Count,5
Approximate Unique (%),0.0%
Missing,76204
Missing (%),74.4%
Memory Size,1.9 MB

0,1
Mean,11.2964
Standard Deviation,2.7095
Median,12.0
Minimum,5.0
Maximum,15.0

0,1
1st row,water_cooler
2nd row,water_cooler
3rd row,split
4th row,water_cooler
5th row,air_conditioner

0,1
Count,272504
Lowercase Letter,272504
Space Separator,0
Uppercase Letter,0
Dash Punctuation,0
Decimal Number,0

0,1
Approximate Distinct Count,3
Approximate Unique (%),0.0%
Missing,73595
Missing (%),71.9%
Memory Size,2.3 MB

0,1
Mean,19.2547
Standard Deviation,2.4417
Median,20.0
Minimum,10.0
Maximum,20.0

0,1
1st row,official_single_pa...
2nd row,official_single_pa...
3rd row,official_single_pa...
4th row,official_single_pa...
5th row,official_single_pa...

0,1
Count,500042
Lowercase Letter,500042
Space Separator,0
Uppercase Letter,0
Dash Punctuation,0
Decimal Number,0

0,1
Approximate Distinct Count,7
Approximate Unique (%),0.0%
Missing,67639
Missing (%),66.1%
Memory Size,2.4 MB

0,1
Mean,7.2786
Standard Deviation,2.0099
Median,7.0
Minimum,5.0
Maximum,16.0

0,1
1st row,carpet
2nd row,ceramic
3rd row,ceramic
4th row,ceramic
5th row,ceramic

0,1
Count,250152
Lowercase Letter,250152
Space Separator,0
Uppercase Letter,0
Dash Punctuation,0
Decimal Number,0

0,1
Approximate Distinct Count,30
Approximate Unique (%),0.1%
Missing,67668
Missing (%),66.1%
Memory Size,2.2 MB

0,1
Mean,1.062
Standard Deviation,0.2481
Median,1.0
Minimum,1.0
Maximum,3.0

0,1
1st row,3
2nd row,4
3rd row,4
4th row,3
5th row,4

0,1
Count,0
Lowercase Letter,0
Space Separator,0
Uppercase Letter,0
Dash Punctuation,0
Decimal Number,36793

0,1
Approximate Distinct Count,7
Approximate Unique (%),0.0%
Missing,73452
Missing (%),71.8%
Memory Size,2.0 MB

0,1
Mean,7.1503
Standard Deviation,1.4106
Median,7.0
Minimum,5.0
Maximum,13.0

0,1
1st row,heater
2nd row,shoofaj
3rd row,shoofaj
4th row,shoofaj
5th row,fireplace

0,1
Count,203427
Lowercase Letter,203427
Space Separator,0
Uppercase Letter,0
Dash Punctuation,0
Decimal Number,0

0,1
Approximate Distinct Count,2
Approximate Unique (%),0.0%
Missing,92078
Missing (%),90.0%
Memory Size,693.6 KB

0,1
Mean,4.0058
Standard Deviation,0.07613
Median,4.0
Minimum,4.0
Maximum,5.0

0,1
1st row,True
2nd row,True
3rd row,True
4th row,True
5th row,True

0,1
Count,41232
Lowercase Letter,30939
Space Separator,0
Uppercase Letter,10293
Dash Punctuation,0
Decimal Number,0

0,1
Approximate Distinct Count,9
Approximate Unique (%),0.0%
Missing,68101
Missing (%),66.5%
Memory Size,2.2 MB

0,1
Mean,1.1144
Standard Deviation,1.0634
Median,1.0
Minimum,1.0
Maximum,11.0

0,1
1st row,3
2nd row,1
3rd row,4
4th row,2
5th row,2

0,1
Count,3136
Lowercase Letter,3136
Space Separator,0
Uppercase Letter,0
Dash Punctuation,0
Decimal Number,34270

0,1
Approximate Distinct Count,3
Approximate Unique (%),0.0%
Missing,72452
Missing (%),70.8%
Memory Size,2.1 MB

0,1
Mean,8.5823
Standard Deviation,2.2273
Median,7.0
Minimum,7.0
Maximum,12.0

0,1
1st row,water_heater
2nd row,package
3rd row,package
4th row,package
5th row,package

0,1
Count,248643
Lowercase Letter,248643
Space Separator,0
Uppercase Letter,0
Dash Punctuation,0
Decimal Number,0

0,1
Approximate Distinct Count,2
Approximate Unique (%),0.0%
Missing,0
Missing (%),0.0%
Memory Size,11.0 MB

0,1
Mean,5.2977
Standard Deviation,0.4572
Median,5.0
Minimum,5.0
Maximum,6.0

0,1
1st row,توافقی
2nd row,مقطوع
3rd row,مقطوع
4th row,مقطوع
5th row,مقطوع

0,1
Count,0
Lowercase Letter,0
Space Separator,0
Uppercase Letter,0
Dash Punctuation,0
Decimal Number,0

0,1
Approximate Distinct Count,4139
Approximate Unique (%),5.8%
Missing,30475
Missing (%),29.8%
Infinite,0
Infinite (%),0.0%
Memory Size,1.1 MB
Mean,3.0007e+09
Minimum,200000
Maximum,5e+11

0,1
Minimum,200000.0
5-th Percentile,250000000.0
Q1,600000000.0
Median,1122000000.0
Q3,2290000000.0
95-th Percentile,8000000000.0
Maximum,500000000000.0
Range,500000000000.0
IQR,1690000000.0

0,1
Mean,3000700000.0
Standard Deviation,14545000000.0
Variance,2.1154e+20
Sum,215740000000000.0
Skewness,22.604
Kurtosis,646.8588
Coefficient of Variation,4.847

0,1
Approximate Distinct Count,4139
Approximate Unique (%),5.8%
Missing,30475
Missing (%),29.8%
Infinite,0
Infinite (%),0.0%
Memory Size,1.1 MB
Mean,3.0007e+09
Minimum,200000
Maximum,5e+11

0,1
Minimum,200000.0
5-th Percentile,250000000.0
Q1,600000000.0
Median,1122000000.0
Q3,2290000000.0
95-th Percentile,8000000000.0
Maximum,500000000000.0
Range,500000000000.0
IQR,1690000000.0

0,1
Mean,3000700000.0
Standard Deviation,14545000000.0
Variance,2.1154e+20
Sum,215740000000000.0
Skewness,22.604
Kurtosis,646.8588
Coefficient of Variation,4.847

0,1
Approximate Distinct Count,3
Approximate Unique (%),0.0%
Missing,68940
Missing (%),67.3%
Memory Size,2.3 MB

0,1
Mean,7.8929
Standard Deviation,2.4949
Median,10.0
Minimum,4.0
Maximum,10.0

0,1
1st row,squat_seat
2nd row,squat_seat
3rd row,squat
4th row,squat_seat
5th row,squat_seat

0,1
Count,244381
Lowercase Letter,244381
Space Separator,0
Uppercase Letter,0
Dash Punctuation,0
Decimal Number,0

0,1
Approximate Distinct Count,2
Approximate Unique (%),0.0%
Missing,96237
Missing (%),94.0%
Memory Size,417.2 KB

0,1
Mean,4.6506
Standard Deviation,0.4768
Median,5.0
Minimum,4.0
Maximum,5.0

0,1
1st row,True
2nd row,False
3rd row,False
4th row,True
5th row,False

0,1
Count,28527
Lowercase Letter,22393
Space Separator,0
Uppercase Letter,6134
Dash Punctuation,0
Decimal Number,0

0,1
Approximate Distinct Count,4784
Approximate Unique (%),75.0%
Missing,95993
Missing (%),93.8%
Memory Size,595.5 KB

0,1
Mean,10
Standard Deviation,0
Median,10
Minimum,10
Maximum,10

0,1
1st row,0374567835
2nd row,۰۰۶۴۴۱۶۸۱۱
3rd row,2909811182
4th row,0013232312
5th row,2500118649

0,1
Count,0
Lowercase Letter,0
Space Separator,0
Uppercase Letter,0
Dash Punctuation,0
Decimal Number,40304

0,1
Approximate Distinct Count,8
Approximate Unique (%),88.9%
Missing,102362
Missing (%),100.0%
Memory Size,21.7 KB

0,1
Mean,636.8889
Standard Deviation,270.3518
Median,837.0
Minimum,222.0
Maximum,930.0

0,1
1st row,به نام خدا آپار...
2nd row,سلام. یک واحدآپارت...
3rd row,بهترین فرصت برای ا...
4th row,سلام. یک واحدآپارت...
5th row,یکواحد آپارتمان دو...

0,1
Count,26
Lowercase Letter,18
Space Separator,1075
Uppercase Letter,8
Dash Punctuation,3
Decimal Number,63

0,1
Approximate Distinct Count,8
Approximate Unique (%),0.0%
Missing,0
Missing (%),0.0%
Memory Size,6.6 MB

0,1
Mean,2.7592
Standard Deviation,2.3872
Median,1.0
Minimum,1.0
Maximum,8.0

0,1
1st row,0
2nd row,153
3rd row,150
4th row,150
5th row,0

0,1
Count,0
Lowercase Letter,0
Space Separator,0
Uppercase Letter,0
Dash Punctuation,0
Decimal Number,282457

0,1
Approximate Distinct Count,7
Approximate Unique (%),0.0%
Missing,0
Missing (%),0.0%
Memory Size,7.1 MB

0,1
Mean,7.9702
Standard Deviation,2.6571
Median,6.0
Minimum,6.0
Maximum,23.0

0,1
1st row,accept
2nd row,wrong_price
3rd row,one_on_one
4th row,one_on_one
5th row,accept

0,1
Count,743985
Lowercase Letter,743985
Space Separator,0
Uppercase Letter,0
Dash Punctuation,0
Decimal Number,0

0,1
Approximate Distinct Count,3
Approximate Unique (%),0.0%
Missing,0
Missing (%),0.0%
Memory Size,7.0 MB

0,1
Mean,6.2289
Standard Deviation,1.045
Median,6.0
Minimum,6.0
Maximum,11.0

0,1
1st row,accept
2nd row,reject
3rd row,reject
4th row,reject
5th row,accept

0,1
Count,632970
Lowercase Letter,632970
Space Separator,0
Uppercase Letter,0
Dash Punctuation,4686
Decimal Number,0


In [16]:
# observation based on EDA and Divar app: these coumns are 
# - optional in Divar app
# - more than 90% times missing (null)
# - don't correlate with ad acceptence
optional_columns = ['other_options_and_attributes.balcony', 'other_options_and_attributes.building_direction',
                      'other_options_and_attributes.cooling_system', 'other_options_and_attributes.deed_type',
                    'other_options_and_attributes.floor_type', 'other_options_and_attributes.floors_count',
                     'other_options_and_attributes.heating_system', 'other_options_and_attributes.rebuilt',
                     'other_options_and_attributes.unit_per_floor', 'other_options_and_attributes.warm_water_provider',
                    'other_options_and_attributes.toilet']
# so we drop all of them
train_df = train_df.drop(optional_columns, axis=1)
test_df = test_df.drop(optional_columns, axis=1)

In [17]:
# are price.value and new_price the same?
# observation: based on the correlation coefficient in EDA they seemed the same
new_price = train_df['new_price'].fillna(False).values
price_value = train_df['price.value'].fillna(False).values
print('new_price is price_value?', np.sum(new_price == price_value) == len(train_df))
# drop one of them (redundant)
train_df = train_df.drop('price.value', axis=1)
test_df = test_df.drop('price.value', axis=1)

new_price is price_value? True


In [18]:
# observation: most of the follwing varibles are missing +90% times and are bool not text or numbers 
# - people only have a limited chose, where none of them can effect acceptence
# - they have no correlation with ad acceptence

# national_id (10 digits): doesn't matter for ad acceptence +96% missiing
# - high cardinality, in fact unique for each person
train_df = train_df.drop('national_id', axis=1)
test_df = test_df.drop('national_id', axis=1)

# zoonkan_enabled (bool): it's 94% missing and optional
# - it seems it's for real state egnecies and doesn't have anything to do with accpetence
train_df = train_df.drop('zoonkan_enabled', axis=1)
test_df = test_df.drop('zoonkan_enabled', axis=1)

# desc (text): it's 99.9% missing and when they are present it's equal to description
df = train_df[['desc', 'description']].dropna() # 0.1% when desc isn't missing
train_df = train_df.drop('desc', axis=1)
test_df = test_df.drop('desc', axis=1)
df # to show they are the same

Unnamed: 0,desc,description
4436,به نام خدا \n\n\nآپارتمان ۱۰۶ متری دره بهشت پر...,به نام خدا \n\n\nآپارتمان ۱۰۶ متری دره بهشت پر...
33058,سلام.\nیک واحدآپارتمان ۸۵ متری واقع در اصفهان،...,سلام.\nیک واحدآپارتمان ۸۵ متری واقع در اصفهان،...
35144,بهترین فرصت برای اینکه در زیباترین شه...,بهترین فرصت برای اینکه در زیباترین شه...
56626,سلام.\nیک واحدآپارتمان ۸۵ متری واقع در اصفهان،...,سلام.\nیک واحدآپارتمان ۸۵ متری واقع در اصفهان،...
63122,یکواحد آپارتمان دوخوابه نوساز کلید نخورده واقع...,یکواحد آپارتمان دوخوابه نوساز کلید نخورده واقع...
75359,✅واحد فوق ۶۷متر ، طبقه ی چهارم ، در بهترین منط...,✅واحد فوق ۶۷متر ، طبقه ی چهارم ، در بهترین منط...
79629,بسم الله الرحمن الرحیم\n———————————-\n\n ☑️ا...,بسم الله الرحمن الرحیم\n———————————-\n\n ☑️ا...
82658,⚜️⚜️مشاورین املاک البرز⚜️⚜️\nبا سلام \n ...,⚜️⚜️مشاورین املاک البرز⚜️⚜️\nبا سلام \n ...
99615,واحد واسه خرید میخوای یا میخوای بفروشی؟ اجاره ...,واحد واسه خرید میخوای یا میخوای بفروشی؟ اجاره ...


In this section we will look at example for each reject_reason to find related features
- Which features are discriminative for which reason?
- I have checked more 100 examples for each reason

In [19]:
# buy_add (?%): people ask for house in title or description
train_df[train_df.reject_reason_category == 'buy_add'].drop(['reject_reason_id', 'review_label', 'reject_reason_category'], axis=1)
# observation: 

Unnamed: 0,description,elevator,floor,parking,rooms,size,title,user_type,warehouse,year,location.city,price.mode,new_price
22,باسلام\nخریدار یک واحد در گلشهر هستم تا حداکثر...,true,2,true,دو,75,خریدار آپارتمان در گلشهر,شخصی,false,۱۳۹۰,18,مقطوع,700000000.0
27,خریدارامتیاز آپارتمان روبروی جاده سلامت دسته ا...,false,0,true,دو,100,خریدار آپارتمان روبروی جاده سلامت,شخصی,true,۱۳۹۹,874,توافقی,
33,با سلام\nمتقاضی خرید یک واحد آپارتمان با شرایط...,true,1,true,دو,75,متقاضی خرید آپارتمان,مشاور املاک,true,۱۳۸۵,15,توافقی,
35,باسلام\nدقت فرمایید لطفا\n\nخریدار یک واحد مسک...,true,1,true,دو,105,پردیس فاز ۸,شخصی,true,۱۳۹۹,1,توافقی,
46,باسلام آپارتمان میخام ۹۰ متری۱۵۰میلیون نقد دار...,false,3,true,دو,90,آپارتمان,شخصی,false,۱۳۹۵,27,مقطوع,150000000.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
102289,سلام خریدارخانه سندار۵۰یاکمترهستم,false,0,true,یک,50,اپارتمان,شخصی,true,۱۳۹۱,14,توافقی,
102318,خریدارمهرگان,true,4,false,یک,60,مهرگان۲۵۰,شخصی,true,۱۳۹۰,873,توافقی,
102329,خریدار اپارتمان نقلی تمیز,true,1,true,یک,40,خرید اپارتمان نقلی,شخصی,false,۱۳۸۶,1,مقطوع,600000000.0
102339,خریدار واحد نو ساز در مهر سعداباد هستم لطفا فق...,true,4,true,دو,85,نیازمند اپارتمان در مهر سعداباد,شخصی,true,۱۳۹۹,21,توافقی,


In [20]:
# wrong_price
# - in the whole dataset new_price 29.8% is missing
test = train_df[train_df.reject_reason_category == 'wrong_price'].drop(['reject_reason_id', 'review_label', 'reject_reason_category'], axis=1)
print('% wrong_price comes with price.mode=fix', np.sum(test['price.mode'] == 'مقطوع') / len(test))
print('% wrong_price comes with not null new_price', len(test['new_price'].dropna()) / len(test))
test

% wrong_price comes with price.mode=fix 0.9980821299638989
% wrong_price comes with not null new_price 0.9980821299638989


Unnamed: 0,description,elevator,floor,parking,rooms,size,title,user_type,warehouse,year,location.city,price.mode,new_price
1,⭐⭐⭐بسم الله الرحمن الرحیم ⭐⭐⭐\n\n⚜املاک فردیس ...,true,2,true,دو,69,۶۹متر واحد قریشی شمالی طبقه دوم,مشاور املاک,true,۱۳۹۱,2,مقطوع,3.000000e+08
11,♦️♦️سازه ایی متفاوت ♦️♦️\n ♦️♦️♦️...,true,2,true,دو,110,110متری نوساز * فاز۴ مهرشهر*,مشاور املاک,true,۱۳۹۹,2,مقطوع,1.980000e+09
24,باسلام واحد 80 متری دوخوابه پنجره ها دوجداره و...,true,2,false,دو,80,آپارتمان 80 متری دوخوابه,شخصی,false,۱۳۹۹,1,مقطوع,4.000000e+08
28,فروش واحد بسیار عالی در دل شهر پردیسان بسیار ت...,true,5,false,یک,65,اپارتمان 65متری تک خواب,شخصی,false,۱۳۹۱,8,مقطوع,4.850000e+08
42,1/100 میلیارد \n60 متر \nطبقه3\n2 خواب\n5 طبقه...,true,5,true,دو,60,60متر 2خواب فول امکانات10 ساله,مشاور املاک,false,۱۳۸۹,1,مقطوع,1.200000e+09
...,...,...,...,...,...,...,...,...,...,...,...,...,...
102347,⭐املاک مدیران مرکز فروش اپارتمان های وام دار⭐\...,false,2,true,دو,75,اپارتمان ۷۵ متر دو خواب,مشاور املاک,true,۱۳۹۱,2,مقطوع,2.900000e+08
102348,واحد بروز دوخوابه بزرگ\nدارای سند تکبرگ\nمیدان...,true,1,false,دو,60,واحد 60 متری میدان فرهنگ،خیابان آزادگان،کوچه حاتم,مشاور املاک,true,۱۳۸۶,12,مقطوع,6.200000e+08
102362,واحد و مشاعات فول بازسازی \nبسیار تمیز \nمناسب...,false,3,true,یک,40,آپارتمان،۴۰ متری،فول بازسازی,مشاور املاک,false,۱۳۸۳,1,مقطوع,3.300000e+08
102368,⭐⭐⭐⭐بسم الله الرحمن الرحیم⭐⭐⭐⭐ \n⚜⚜املاک فردیس...,false,1,true,دو,63,آپارتمان ۶۳متری با تسهیلات بانکی,مشاور املاک,true,۱۳۸۹,2,مقطوع,3.100000e+08


In [21]:
# location.city:  needs another check ??? 
train_df[train_df.reject_reason_category == 'wrong_city'].drop(['reject_reason_id', 'review_label', 'reject_reason_category'], axis=1)
# the most common location.city is 1 (it seems it's Tehran)
# things that I observed might be relevent
# test[['description', 'title', 'location.city']].iloc[:10]

Unnamed: 0,description,elevator,floor,parking,rooms,size,title,user_type,warehouse,year,location.city,price.mode,new_price
7,این ملک در گیلان در شهر کلاچای میباشد فاصله تا...,false,1,true,دو,73,اپارتمان ۷۳متری گیلان کلاچای,شخصی,true,۱۳۹۴,1,مقطوع,1.022000e+09
16,اپارتمان 81 متری دو خواب مسکن مهر مهر اباد پار...,false,2,true,دو,89,در منطقه خوش اب و هوایی مهر اباد,شخصی,false,۱۳۹۷,1,مقطوع,6.000000e+08
17,خانه فوق در شهریار تهران می باشد و ۲۰ دقیقه با...,true,2,false,دو,71,آپارتمان ۷۱ متری معاوضه با زمین,شخصی,true,۱۳۹۴,709,مقطوع,5.100000e+08
26,یک واحد آپارتمان ٨٢ متری سازه بتنی کف سرامیک د...,,,,دو,82,آپارتمان پرند تهران,شخصی,,۱۳۹۷,9,توافقی,
31,سند تک برگ طبقه چهارم ۱۲واحد\n#با۵۳۰میلیون صاح...,false,4,true,دو,76,آپارتمان ۷۶مترژ واقع در کرج,شخصی,true,۱۳۸۲,1,مقطوع,8.300000e+08
...,...,...,...,...,...,...,...,...,...,...,...,...,...
102309,⛔مسکن مهر را وکالتی نخرید!!!⛔\n⁦☑️⁩ما واحد را ...,true,0,true,دو,78,۷۸متر،مهراباد رودهن فول .,شخصی,true,۱۳۹۸,1,مقطوع,5.850000e+08
102319,مژده⭕ مژده⭕ ...,false,2,false,یک,52,آپارتمان ۵۲ متری اسلامشهر,شخصی,true,۱۳۹۰,1,مقطوع,4.400000e+08
102324,⚜️املاک لیدوما⚜️\n\n☆آپارتمان: شیراز بلوار طلا...,true,2,true,یک,80,آپارتمان یک خواب شیراز,مشاور املاک,false,۱۳۹۷,780,توافقی,
102336,یک واحد آپارتمان واقع در شهرک مخابرات شهر اردب...,false,0,true,دو,86,معاوضه یک آپارتمان در اردبیل با مغازه در تهران,شخصی,true,۱۳۹۲,1,مقطوع,6.500000e+08


In [22]:
# wrong_category: 
test = train_df[train_df.reject_reason_category == 'wrong_category'].drop(['reject_reason_id', 'review_label', 'reject_reason_category'], axis=1)
# things that I observed might be relevent
# it seems people write description and titles that are irrelvent to 'apartman-sell'
# test[['description', 'title', 'new_price']] 
test

Unnamed: 0,description,elevator,floor,parking,rooms,size,title,user_type,warehouse,year,location.city,price.mode,new_price
130,فروش ویلا ۲۸۰متری بازسازی شده در بازکیاگوراب ل...,false,0,true,دو,280,فروش ویلا 280متری بازسازی شده در ایشکا بازکیاگ...,مشاور املاک,true,۱۳۸۵,746,مقطوع,1.000000e+09
133,عرض ادب و احترام \n\nدر صورت پاسخگو نبودن خط ا...,false,2,true,سه,200,200 متر ویلایی دوبلکس سرافرازان,مشاور املاک,true,۱۳۸۲,3,مقطوع,3.500000e+09
177,به نام خدا\nواحد۱۰۰متر مفید ویلایی،۲طبقه انتها...,false,1,true,دو,100,مجتمع نخل,مشاور املاک,false,۱۳۹۹,18,مقطوع,8.800000e+08
272,اپارتمان ویلایی دو طبقه در بهترین منطقه فازچها...,false,2,true,دو,95,اپارتمان ویلایی دوطبقه فاز۴ پردیس,مشاور املاک,true,۱۳۹۰,1,مقطوع,1.200000e+09
394,معاوضه تراکتور ۶سلنددبل مدل ۹۶ با اد‌وات با آپ...,true,2,true,دو,80,معاوضه تراکتور ۶سلند دبل با آپارتمان,شخصی,true,۱۳۹۵,27,مقطوع,6.500000e+08
...,...,...,...,...,...,...,...,...,...,...,...,...,...
102163,طبقه اول آب برق جدا گاز مشترک\n20میلیون ماهی ی...,false,1,false,دو,140,رحیم اباد رهن و اجاره واحد140متری,مشاور املاک,false,قبل از ۱۳۷۰,826,مقطوع,7.000000e+07
102220,با سلام و وقت بخیر \nاینجانب تازه ازدواج کردم ...,true,1,true,دو,75,معاوضه ماشین و وجه با آپارتمان در سهند,شخصی,true,۱۳۹۲,761,مقطوع,1.800000e+08
102334,دارای کمد دیواری،هود،پمپ آب، کابینت خونه کاملا...,true,1,true,دو,75,خانه 75متری مسکن مهر.طبقه اول,شخصی,true,۱۳۹۵,24,مقطوع,3.500000e+07
102337,فقط و فقط چت دیوار پاسخ می دم.\nاملاکی ها تماس...,false,0,true,دو,91,۹۱متری.زیتون۱.همکف.تکواحدی.نیمه ویلایی,شخصی,true,۱۳۹۴,8,مقطوع,7.900000e+08


In [23]:
# critics_and_suggestions: 
train_df[train_df.reject_reason_category == 'critics_and_suggestions'].drop(['reject_reason_id', 'review_label', 'reject_reason_category'], axis=1)

Unnamed: 0,description,elevator,floor,parking,rooms,size,title,user_type,warehouse,year,location.city,price.mode,new_price
371,زینیننی\nینینیجضکویمیث\nنییم,,,,پنج یا بیشتر,25,واپان,شخصی,,۱۳۷۳,29,توافقی,
668,ظططظطظطظطظطظ,true,-1,true,بدون اتاق,1,ظطظط,شخصی,true,۱۳۹۹,6,توافقی,
678,تووروودرلانکوو,false,1,false,دو,588088,اددررلردوتککدزلاک باتکو,شخصی,false,۱۳۹۶,4,توافقی,
724,,false,1,true,یک,111,1111111111111111111111,شخصی,false,۱۳۸۰,663,مقطوع,1.000000e+08
839,نباتنمیسبانمسلانمسلتمپ,true,5,true,سه,100,تلییلاتتل,شخصی,true,۱۳۹۶,20,مقطوع,1.000000e+08
...,...,...,...,...,...,...,...,...,...,...,...,...,...
101887,نلزمنقسدمپوبثخپو سبونهز,true,3,true,سه,130,جمبرنچگوقطوگلسدمپ,شخصی,true,۱۳۹۷,7,مقطوع,1.300000e+11
102153,عادییثبلابیل زبلا,true,1,true,سه,1,قباافقق۳,شخصی,false,۱۳۹۵,7,مقطوع,5.898795e+06
102168,لطفا تایید نفرمایید \nتکراری است\nبا تشکر از د...,true,5,true,سه,251,251 متر با 90 متر روف گاردن اختصاصی,مشاور املاک,true,۱۳۹۹,3,توافقی,
102260,غاذددذ,false,3,true,بدون اتاق,2,ذرل,شخصی,true,۱۳۹۷,8,توافقی,


In [24]:
# one_on_one: 
test = train_df[train_df.reject_reason_category == 'one_on_one'].drop(['reject_reason_id', 'review_label', 'reject_reason_category'], axis=1)
# things that I observed might be relevent
test[['description', 'title', 'new_price']].iloc[:10]
test

Unnamed: 0,description,elevator,floor,parking,rooms,size,title,user_type,warehouse,year,location.city,price.mode,new_price
2,با سلام \nتعدادی واحد در متراژ های ۹۰ و ۱۲۰ مت...,true,5,true,دو,120,آپارتمان ۱۲۰ متری مسکن خیابان مطهری,شخصی,true,۱۳۹۹,9,مقطوع,1.000000e+09
3,پیش فروش ۲ واحد آپارتمان تک واحدی \n\n۳ خواب \...,true,2,true,سه,135,۱۳۵ متر پیش فروش فول امکانات تک واحدی,مشاور املاک,true,۱۳۹۹,2,مقطوع,2.497500e+09
8,٥طبقه تک واحدی ساختمان بهشت ساخت سازنده بنام م...,true,3,true,سه,165,تک واحدی دوکله فول,شخصی,true,۱۳۹۹,1,مقطوع,1.100000e+10
9,بدون واسطه ... باهمکار کار نمی کنیم\n180 متر ...,true,1,true,سه,180,اپارتمان ۱۰۰ الی ۴۵۰متری.ولنجک,مشاور املاک,true,۱۳۹۹,1,توافقی,
23,بهترین نقطه شهر دو واحد ۸۰ و ۶۰ متری شخصی,false,2,true,چهار,80,اپارتمان ۸۰ و ۶۰ متری,شخصی,true,۱۳۸۸,665,مقطوع,1.350000e+09
...,...,...,...,...,...,...,...,...,...,...,...,...,...
102357,تماس فقط زارع\n\n\nفقط متری ۱۱/۲۰۰\n۱۲۵ متر \...,true,3,true,سه,125,۱۲۵ متری صفر اقدسیه ۵۳,مشاور املاک,true,۱۳۹۹,3,مقطوع,1.400000e+09
102360,با سلام\nاطلاعیه آگهی فروش آپارتمانهای 125 و 1...,true,2,true,دو,140,فروش آپارتمان فول کلید نخورده شهرک ارم,مشاور املاک,true,۱۳۹۹,2,مقطوع,8.400000e+08
102361,(( عمارتی فاخر از سازنده معتبر منطقه ))\n\nمدر...,true,8,true,یک,60,برج مجلل کلاسیک رویال/ مشاعات کامل,مشاور املاک,true,۱۳۹۹,1,توافقی,
102364,✅۵طبقه ۹ واحد ( دارای نسق معتبر)\n✅طبقات موجو...,true,2,true,دو,100,پیش فروش واحد۱۰۰متری نسق خیابان ارشاد,مشاور املاک,true,۱۳۹۹,12,مقطوع,1.100000e+09


In [25]:
# As I have seen: 
# iloc 95499 9540, 42363 53060

We will correct a few ordinal and binary variables (while preserving nan values)
- convert ordinals from string to number (rooms, floor, year)
- convert binaries from string to 0 and 1 (evevator, parking, warehouse)
  - we will address nans later on
- what to do with numericals ???

In [26]:
# year (ordinal) : 0.1% missing
# convert year to int less than 1370 to 1370 
# - it preserces nan
def year_convert(text):
    if pd.isna(text) == False:
        try: return int(text)
        except: return 1370
    return text

train_df.year = train_df.year.apply(year_convert)
test_df.year = test_df.year.apply(year_convert)

In [27]:
# floor (ordinal) : 3.9% missing
def floor_convert(text):
    if pd.isna(text) == False:
        try: return int(text) + 1 
        except: return 31
    return text

train_df.floor = train_df.floor.apply(floor_convert)
test_df.floor = test_df.floor.apply(floor_convert)

In [28]:
# rooms (ordinal) : 0.0% missing
# convert year to int
def room_convert(text):
    if text == 'بدون اتاق': return 0
    if text == 'یک': return 1
    if text == 'دو': return 2
    if text == 'سه': return 3
    if text == 'چهار': return 4
    return 5

train_df.rooms = train_df.rooms.apply(room_convert)
test_df.rooms = test_df.rooms.apply(room_convert)

In [29]:
# convert binary variables to (0, 1)
def label_encoding(series):
    return pd.Series(LabelEncoder().fit_transform(series[series.notnull()]), index=series[series.notnull()].index)

train_df[['warehouse', 'elevator', 'parking']] = train_df[['warehouse', 'elevator', 'parking']].apply(label_encoding)
test_df[['warehouse', 'elevator', 'parking']] = test_df[['warehouse', 'elevator', 'parking']].apply(label_encoding)

In [30]:
train_df # let's check the output

Unnamed: 0,description,elevator,floor,parking,rooms,size,title,user_type,warehouse,year,location.city,price.mode,new_price,reject_reason_id,reject_reason_category,review_label
0,سلام\nواحد تک خواب تمیز\nسند تک برگ آماده وام ...,1.0,1.0,1.0,1,60,آپارتمان نقلی، سند ششدانگ,شخصی,1.0,1388.0,8,توافقی,,0,accept,accept
1,⭐⭐⭐بسم الله الرحمن الرحیم ⭐⭐⭐\n\n⚜املاک فردیس ...,1.0,3.0,1.0,2,69,۶۹متر واحد قریشی شمالی طبقه دوم,مشاور املاک,1.0,1391.0,2,مقطوع,3.000000e+08,153,wrong_price,reject
2,با سلام \nتعدادی واحد در متراژ های ۹۰ و ۱۲۰ مت...,1.0,6.0,1.0,2,120,آپارتمان ۱۲۰ متری مسکن خیابان مطهری,شخصی,1.0,1399.0,9,مقطوع,1.000000e+09,150,one_on_one,reject
3,پیش فروش ۲ واحد آپارتمان تک واحدی \n\n۳ خواب \...,1.0,3.0,1.0,3,135,۱۳۵ متر پیش فروش فول امکانات تک واحدی,مشاور املاک,1.0,1399.0,2,مقطوع,2.497500e+09,150,one_on_one,reject
4,فروش فوری آپارتمان در گلسار بلوار سمیه \n\n⭕️...,1.0,6.0,1.0,2,92,فروش آپارتمان گلسار بلوار سمیه ٩٢ متری,مشاور املاک,1.0,1392.0,12,مقطوع,1.900000e+09,0,accept,accept
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
102366,✅به نام خدا✔️\n✅املاک (آ ب ث )✔️\n✅لذ...,0.0,3.0,1.0,2,80,اپارتمان ۸۰ متر دو خوابه,مشاور املاک,1.0,1397.0,2,مقطوع,2.990000e+08,0,accept,accept
102367,❌شماره اگهی خاموش میباشد❌\n لطفا با شما...,1.0,3.0,1.0,3,148,*فروش آپارتمان ۱۴۸متری فاز۴مهرشهر*,مشاور املاک,1.0,1399.0,2,توافقی,,0,accept,accept
102368,⭐⭐⭐⭐بسم الله الرحمن الرحیم⭐⭐⭐⭐ \n⚜⚜املاک فردیس...,0.0,2.0,1.0,2,63,آپارتمان ۶۳متری با تسهیلات بانکی,مشاور املاک,1.0,1389.0,2,مقطوع,3.100000e+08,153,wrong_price,reject
102369,سه واحد ۸۵ متری و۱۰۰ متر تجاری درچهارباب مغازه...,0.0,3.0,1.0,5,240,مساحت کلی ۲۴۰ متر بصورت چهارواحدی,شخصی,0.0,1387.0,7,توافقی,,150,one_on_one,reject


Text Fiels: Observe, EDA, Clean
- I have repeated the following process and changed the cleaning script to reach good results

In [31]:
# this cleaning code is based on my past experince with Persian text
# - plus seening more than 500 examples from this dataset 
# - bert-based models and BPETokenizers handle the rest for us :)
#   - we don't need lematization, removing stop words , etc.

wierd_pattern = re.compile("["
                            u"\U0001F600-\U0001F64F"  # emoticons
                            u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                            u"\U0001F680-\U0001F6FF"  # transport & map symbols
                            u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                            u"\U00002702-\U000027B0"
                            u"\U000024C2-\U0001F251"
                            u"\U0001f926-\U0001f937"
                            u'\U00010000-\U0010ffff'
                            u"\u200d"
                            u"\u2640-\u2642"
                            u"\u2600-\u2B55"
                            u"\u23cf"
                            u"\u23e9"
                            u"\u231a"
                            u"\u3030"
                            u"\ufe0f"
                            u"\u2069"
                            u"\u2066"
                            u"\u2068"
                            u"\u2067"
                            u"\u200c"  # half space
                            "]+", flags=re.UNICODE)

def text_cleaner(x):
    x = clean(x,
              fix_unicode=True,
              to_ascii=False,
              lower=True,
              no_line_breaks=True,
              no_urls=True,
              no_emails=True,
              no_phone_numbers=True,
              no_emoji=True,
              no_numbers=False,
              no_digits=False,
              no_currency_symbols=True,
              no_punct=False,
              replace_with_url=" ",
              replace_with_email=" ایمیل ",
              replace_with_phone_number=" شماره تماس ",
              replace_with_currency_symbol=" ")

    x = wierd_pattern.sub(r'', x)
    x = x.replace('\n', ' ، ')
    # char+3 -> char
    x = re.sub(r'(.)\1{3,}', r'\1', x)
    # add spcae before and after numbers
    x = re.sub(r"([0-9]+(\.[0-9]+)?)",r" \1 ", x)
    x = re.sub(r"([۰-۹]+(\.[۰-۹]+)?)",r" \1 ", x)
    # based on observation
    x = re.sub('_', r' ، ', x) 
    x = re.sub('-', r' ، ', x) 
    x = re.sub('/', ' ، ', x)
    x = re.sub('…', ' . ', x)
    x = re.sub('\+', ' ، ', x)
    x = re.sub('—', ' ، ', x)
    x = re.sub('#', ' ، ', x)
    x = re.sub('=', ' = ', x)
    x = re.sub('@', ' @ ', x)
    x = re.sub('•', ' • ', x)
    x = re.sub("\s+", " ", x)
    x = x.replace('*', ' ، ')
    return x.strip()

tokenizer = WordTokenizer(join_verb_parts=False)
normalizer = Normalizer(remove_extra_spaces=True, persian_numbers=False, persian_style=True,
                        punctuation_spacing=True, remove_diacritics=True, token_based=True)

def text_preprocessor(t):
    tokens = tokenizer.tokenize(text_cleaner(normalizer.normalize(t)))
    return ' '.join(tokens) # return to string

In [32]:
# apply cleaning to text fields : it might take a few mintues
train_df.title = train_df.title.apply(text_preprocessor)
train_df.description = train_df.description.apply(text_preprocessor)
print('trainset done!')
test_df.title = test_df.title.apply(text_preprocessor)
test_df.description = test_df.description.apply(text_preprocessor)
print('testset done!')

trainset done!
testset done!


In [33]:
# compute word len in each title
words_lens = train_df.title.apply(lambda t: len(tokenizer.tokenize(t)))
fig = go.Figure()
fig.add_trace(go.Histogram(x=words_lens))
fig.update_layout(
    title_text='Distribution of word counts within titles',
    xaxis_title_text='Word Count',
    yaxis_title_text='Frequency',
    bargap=0.2,
    bargroupgap=0.2)
fig.show()

In [34]:
# compute word len in each description
words_lens = train_df.description.apply(lambda t: len(tokenizer.tokenize(t)))
fig = go.Figure()
fig.add_trace(go.Histogram(x=words_lens))
fig.update_layout(
    title_text='Distribution of word counts within description',
    xaxis_title_text='Word Count',
    yaxis_title_text='Frequency',
    bargap=0.2,
    bargroupgap=0.2)
fig.show()

In [35]:
# extracting unigrams in description 
# - we could explore bigrams as well
word_list = []
for desc in tqdm(train_df.description):
    word_list.extend(tokenizer.tokenize(desc))
words = pd.DataFrame(word_list, columns=['words'])
words = words.groupby(['words']).size()

100%|██████████| 102371/102371 [00:05<00:00, 18025.96it/s]


In [36]:
# show 200 most frequent words in description
top = words.sort_values(ascending=False)
pd.DataFrame(top.index.to_numpy()[:200].reshape(-1, 20))

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
0,،,و,.,با,در,:,به,طبقه,واحد,متر,از,پارکینگ,تماس,متری,),(,املاک,قیمت,خواب,عالی
1,فروش,میلیون,دو,شما,منطقه,دارای,یک,آپارتمان,انباری,تا,امکانات,برای,بهترین,میباشد,بزرگ,۲,کف,مشاور,نقشه,فول
2,بدون,سند,آسانسور,خرید,ملک,فقط,نور,کابینت,دیواری,خیابان,واحدی,سرامیک,تک,خوش,وام,رهن,۳,بسیار,زیر,سالن
3,جهت,شده,بگیرید,دسترسی,بازدید,ساختمان,ما,اول,لطفا,کمد,شماره,مسکن,۴,است,2,تراس,که,گاز,سرمایه,هر
4,پکیج,درب,ساخت,مناسب,موجود,دارد,اختصاصی,را,آدرس,موقعیت,این,نما,سنگ,هود,بالکن,۵,گذاری,سه,3,سال
5,شیک,۱,کامل,سلام,هم,واقعی,فوری,بیشتر,تحویل,رو,واقع,متراژ,نورگیر,کوچه,کل,معاوضه,اطلاعات,تومان,اپارتمان,متریال
6,لوکیشن,=,لابی,مشاعات,دفتر,یا,دوم,جنوبی,مستر,نوساز,خریدار,خوابه,مبلغ,دی,تمام,سرویس,مشابه,بلوار,قابلیت,4
7,اف,تخفیف,کلید,زمین,1,میشود,پرده,»,سقف,اسانسور,کاغذ,کارشناس,نبش,شهر,شمالی,۶,برگ,5,سوم,صورت
8,مشاورین,کد,سندی,قابل,شهرک,فوقالعاده,طبقات,ویو,برج,حیاط,شخصی,«,فروشنده,فاز,خور,دار,۱۰۰,برند,لوکس,دیوار
9,پرتی,خانه,•,درجه,کنید,آشپزخانه,سیستم,م,کولر,ملکی,خانم,تعداد,مجتمع,اصلی,فرنگی,ضد,آماده,٫,واحدهای,تمیز


In [37]:
# 100 least frequent words: looks good
down = words.sort_values(ascending=True)
pd.DataFrame(down.index.to_numpy()[:100].reshape(-1, 10))

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,ششصدوسیمیلیون,خریدند,میرسددوواحدهمکف,خریدمیتونیدهماهنگ,خریدمیتونیدمشارکت,خریدمیتونه,خریدمورد,میرسدسه,خریدمنشستم,میرسدضمنامعاوضه
1,خریدملکی,میرسدفاصله,خریدمل,خریدمطمین,خریدمطمعن,میرسدفعلادرمرحله,میرسدفقط,خریدمستقیمازخودسازنده,میرسدلطفا,خریدمستاجرخودش
2,خریدماندگار,میرسدمبلغ,خریدلوکس,خریدلرانی,خریدقطعی,خریدفوق,میرسدمتراژکل,خریدفروشسرمایه,میرسدمحله,میرسدمفید
3,میرسددرصورت,خریدطبقه,میرسدخواهشن,میرسدجهت,خریدوکسب,خریدومراکزتفریحی,خریدوفروشی,خریدوفروشتان,"میرسد,به",خریدوسودآورشهریارفاز
4,میرسداطلاعات,خریدورهن,خریدودرمانی,خریدوتفکیک,خریدوتفریحی,خریدوتردد,خریدوبازدیدهماهنگی,میرسداملاک,خریدوبازدیدتماس,خریدواسطه
5,میرسداین,میرسدبانقشه,میرسدباهرگونه,میرسدبرای,میرسدبه,میرسدتا,خریدهمرابادسته,خریدهستم,خریدهایم,میرسدتخفیف
6,خریدهاو,خریدهاراانجام,میرسدتوجه,خریدنمایید,میرزنده,خریدصددرصد,میرسدمنزل,خریدخلیج,میرسونیم,خریدتونو
7,میرسی,خریدتو,خریدتهران,خریدتر,خریدتخفیفهای,میرشد,میرشعبانی,خریدتاششصدملیون,خریدبی,خریدبهترینهادرکف
8,میرشکار,خریدبدونواسطه,میرصانع,خریدباصاحب,خریدبادوربین,خریدباتک,میرصد,خریدایده,میرضایی,خریدانشعاب
9,خریدانجام,خریداقدام,خریداستثنائی,خریداسان,میرطاهری,خریدازدفترتعاونی,میرسعید,خریدشیرالات,میرسدیه,میرسدیاعلی


What does it means when label is 'edit-accept'?

In [38]:
# reject_reason_id and reject_reason_category show that edit-accept has been accepted
# - they look like accept, but some of them have severe problems ???
edit_train = train_df[train_df.review_label == 'edit-accept']
edit_test = test_df[train_df.review_label == 'edit-accept']

print('label is edit-accept in train set %', len(edit_train) * 100 / len(train_df))
print('label is edit-accept in test set %', len(edit_test) * 100 / len(test_df))
print('are all edit-accepts accepted?', len(edit_train[edit_train.reject_reason_id == 0]) == len(edit_train))
# let's display meaningful features
edit_train[['description', 'title', 'size', 'rooms', 'location.city', 'new_price', 'review_label']] 
# observation: I don't understand the logic behind this class (it's only 4% of data)

label is edit-accept in train set % 4.577468228306845
label is edit-accept in test set % 4.650449086482395
are all edit-accepts accepted? True



Boolean Series key will be reindexed to match DataFrame index.



Unnamed: 0,description,title,size,rooms,location.city,new_price,review_label
12,خانه همان جایی است که آرامش روز شب را در کنار ...,آپارتمان 155 متری 3 خواب خوش نقشه سعادتآباد,155,3,1,1.035000e+10,edit-accept
20,لاهیجان ، خیابان خرمشهر ۱۰۰ متر ۲ خوابه طبقه س...,فروش آپارتمان ۱۰۰ متری نوساز,100,2,746,1.850000e+09,edit-accept
41,نقشه عالی دارای نورگیر ۵ متری قابلاستفاده,شیشه بری,51,1,29,,edit-accept
58,سقف طبقه دوم سقف بتن ستونا ریخته شده ادرس حمید...,فروش سقف خونه,70,2,12,1.200000e+08,edit-accept
67,مسکن مهر . بلوک 45 . تمیز و رنگآمیزی . . فقط ع...,پیک مهر,86,2,660,4.800000e+08,edit-accept
...,...,...,...,...,...,...,...
102228,آپارتمان بسیار لوکس در مرحله پایان سفتکاری با ...,پیشفروش آپارتمان بر ابنسینا,145,2,707,2.100000e+09,edit-accept
102244,آپارتمان خشک ، همکف ، در حال تکمیل ، دارای موق...,آپارتمان 104 متر مربع ، چشمه سفید ، پروژه 400 ...,104,2,9,5.500000e+08,edit-accept
102250,مشاورین املاک پرسپولیس پیامک پاسخگونیستم ساعت ...,گرگانجدید 13,55,1,21,,edit-accept
102262,نقدی دسترسی آسان,۳ عدد کولر گازی,94,2,1,,edit-accept


Duplicates: In structured data, Duplicate inputs result in some distribution across your output and thus you need to retain that distribution.  \\
In this case removing examples is highly destructive and must be avoided.
- Based on **iid** assumption, duplicate tells you that this particular value has a higher probability.



In [39]:
# thus I don't remove duplicates neither in train nor in test
# test = train_df.drop(['review_label', 'reject_reason_id'], axis=1)
# # dataset has many duplicates basedon on description and title
# # dataset has some annotation problems where 
# test[test.isin(test[test.duplicated()])].sort_values(["description", 'title']).dropna(thresh=1)

Dataset Prepration

In [40]:
# Now we can remove columns related to them reject reasons (they are supervision)
train_df = train_df.drop(['reject_reason_id', 'reject_reason_category'], axis=1)
test_df = test_df.drop(['reject_reason_id', 'reject_reason_category'], axis=1)

In [41]:
# convert labels to {'reject': 0, 'accept': 1, 'edit-accept': 2}
# - we consider 1 as accept (positive class) for considering recall as an important metric
def label2number(label):
    if label == 'reject': return 0
    if label == 'accept': return 1
    return 2

train_df.review_label = train_df.review_label.apply(label2number)
test_df.review_label = test_df.review_label.apply(label2number)

In [42]:
# we don't deal with nan values yet. Because some of our models can benefit from them
# - we'll address them when it's necessary

# We need a unique validation dataset accross all models
train_df, valid_df = train_test_split(train_df, test_size=0.1, 
                                      random_state=0, stratify=train_df.review_label)

# save and download
train_df.to_csv('train.csv')
valid_df.to_csv('valid.csv')
test_df.to_csv('test.csv')
# save test as well

In [43]:
# add dataset to my drive
!cp /content/train.csv  /content/drive/MyDrive
!cp /content/valid.csv  /content/drive/MyDrive
!cp /content/test.csv  /content/drive/MyDrive

After doing one round of EDA and dataset prepration I've defined my **metrics**
- It doesn't matter if we accept a few bad ones, but never reject correct adds
- so, we assinged 1 to accept and look for models with high **recall**

In [44]:
# add f1_score later if you considerd edit-accept seperatly
from sklearn.metrics import recall_score, precision_score, roc_auc_score
 
def metric_report(proba, labels):
    preds = np.argmax(proba, axis=1)
    return {'roc_auc': roc_auc_score(labels, proba, multi_class='ovr', average='macro'),
            'recall': recall_score(labels, preds),
            'percision': precision_score(labels, preds)}

In [46]:
train_df

Unnamed: 0,description,elevator,floor,parking,rooms,size,title,user_type,warehouse,year,location.city,price.mode,new_price,review_label
88238,خریدار واحد آپارتمان پرند فاز یک از پروژه پرنی...,0.0,2.0,0.0,1,70,خرید آپارتمان در پرند فازیک,شخصی,1.0,1390.0,1,توافقی,,0
86403,یا حق ۹۸ متر دو خواب پنج طبقه چهار واحدی طبقه ...,1.0,2.0,1.0,2,98,۹۸ متر ، فول امکانات ، صادقیه,مشاور املاک,1.0,1381.0,1,مقطوع,3.000000e+09,1
8460,٥١ متر یکخوابه ، طبقه سوم رویتى دوم سندى ، رو ...,0.0,4.0,0.0,1,51,٥١ متر رو به أفتاب شرق بلوار ١٣٥,مشاور املاک,1.0,1383.0,1,مقطوع,9.800000e+08,1
42268,اپارتمان دو کله جنوبی و شمالی نور گیری از هر د...,1.0,3.0,1.0,2,116,اپارتمان پاستور جدید جدیری کوچه شمس,شخصی,0.0,1397.0,5,مقطوع,2.000000e+09,1
27227,سلام آپارتمان ۸۷ متری متری ۱۱ میلیون ۵۰۰ درفوم...,1.0,8.0,0.0,2,87,آپارتمان ۸۷ متری فومن,شخصی,0.0,1389.0,12,مقطوع,1.000500e+09,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
72375,، بهترین رویای شما ، ۶۵ متر ، ۲ خواب ، طبقه ۳ ...,0.0,4.0,1.0,2,65,۶۵ متر استادمعین ۲ خواب پارکینگ دار,مشاور املاک,1.0,1391.0,1,مقطوع,1.100000e+09,1
18513,نگینی دیگر از دپارتمان گپ ( بدون مشابه ) ۵۴ مت...,0.0,3.0,0.0,1,53,53 متر بدون مشابه سرمایه گذاری امام محمد باقر,مشاور املاک,1.0,1389.0,1,مقطوع,8.200000e+08,1
98600,ساختمان دارای نورگیری از دو طرف میباشد دارای د...,0.0,2.0,0.0,1,72,آپارتمان ، 72 متر ، هفده شهریور جدید,شخصی,0.0,1385.0,5,توافقی,,0
63922,سلام یک واحد آپارتمان 85 متری دو خواب طبقه سوم...,1.0,4.0,1.0,2,85,85 متر فول شفا هدایت هنرور صفر,مشاور املاک,1.0,1399.0,3,مقطوع,1.500000e+09,0
