<a href="https://colab.research.google.com/github/sindbad771/developer-roadmap/blob/master/CIAN_Cleaner.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
#0. Install libraries
# Installs the efficientnet package straight from the qubvel/efficientnet GitHub repository
# (-U upgrades a copy that is already installed). The model cell imports it as `efficientnet.tfkeras`.
# NOTE(review): the install is not pinned to a tag or commit, so re-runs may pick up a different revision.
!pip install -U git+https://github.com/qubvel/efficientnet

Collecting git+https://github.com/qubvel/efficientnet
  Cloning https://github.com/qubvel/efficientnet to /tmp/pip-req-build-97lzhk_n
  Running command git clone -q https://github.com/qubvel/efficientnet /tmp/pip-req-build-97lzhk_n
Building wheels for collected packages: efficientnet
  Building wheel for efficientnet (setup.py) ... [?25l[?25hdone
  Created wheel for efficientnet: filename=efficientnet-1.1.0-cp36-none-any.whl size=18327 sha256=41b9c4ad91bcde051c89e86cef794f9f30590390ff87c7d29013df0fc3b0ceb2
  Stored in directory: /tmp/pip-ephem-wheel-cache-wblao7go/wheels/64/60/2e/30ebaa76ed1626e86bfb0cc0579b737fdb7d9ff8cb9522663a
Successfully built efficientnet
Installing collected packages: efficientnet
  Found existing installation: efficientnet 1.1.0
    Uninstalling efficientnet-1.1.0:
      Successfully uninstalled efficientnet-1.1.0
Successfully installed efficientnet-1.1.0


In [None]:
#1. Loading BeautifulSoup and test request
from bs4 import BeautifulSoup
import requests


def _is_flat_offer_link(href):
    """Return True when ``href`` points at a single rent/sale flat page on www.cian.ru.

    Search-result links (those containing ``/cat.php?``) are rejected even if
    they embed a flat URL in their query string.
    """
    if '/cat.php?' in href:
        return False
    # NOTE(review): only the www.cian.ru host is matched, exactly as the original
    # literals spelled it; confirm whether other hosts need to be accepted too.
    return ('https://www.cian.ru/rent/flat/' in href
            or 'https://www.cian.ru/sale/flat/' in href)


def load_flat_dicts(url):
    """Scrape a cian.ru search results page into ``{flat_url: [image_url, ...]}``.

    Every ``TopOfferCard`` / ``OfferCard`` div contributes at most one entry.
    The image list starts with the first ``<img>`` of the card followed by all
    ``GalleryImage`` images (so the first picture may appear twice, as before).
    Cards without any usable image or without a flat link are skipped.

    Args:
        url: address of a cian.ru search results page.

    Returns:
        dict mapping the flat page URL to the list of its image URLs.

    Raises:
        requests.HTTPError: if the server answers with an error status.
        requests.RequestException: on network failures or timeout.
    """
    # A timeout keeps the notebook from hanging forever on a stalled connection.
    response = requests.get(url, timeout=30)
    response.raise_for_status()
    # Name the parser explicitly so the result does not depend on what is installed.
    soup = BeautifulSoup(response.text, 'html.parser')

    flats_dict = {}
    cards = soup.find_all('div', {'data-name': 'TopOfferCard'})
    cards += soup.find_all('div', {'data-name': 'OfferCard'})
    for card in cards:
        first_img = card.find('img')
        gallery_imgs = card.find_all('img', {'data-name': 'GalleryImage'})

        # Thumbnail first, then the gallery; ignore tags that carry no src.
        candidates = ([first_img] if first_img is not None else []) + list(gallery_imgs)
        flat_imgs = [img['src'] for img in candidates if img.get('src')]
        if not flat_imgs:
            continue

        for link in card.find_all('a', {'target': '_blank'}):
            href = link.get('href', '')
            if _is_flat_offer_link(href):
                flats_dict[href] = flat_imgs
                break

    return flats_dict
# Smoke test: scrape one fixed search page to confirm the parser runs end to end.
flats_dict = load_flat_dicts(
    'https://www.cian.ru/cat.php?currency=2&deal_type=rent'
    '&engine_version=2&maxprice=25000&minprice=15000&offer_type=flat'
    '&region=1&room1=1&room2=1&type=4'
)
print('Testing done!')

Testing done!


In [None]:
%%time
#2. Building model and predict one image

from tensorflow.keras.models import load_model, Model
from tensorflow.keras.utils import get_file
from tensorflow.keras.layers import GlobalAveragePooling2D, BatchNormalization, Activation, Dense, Dropout
from tensorflow.keras.preprocessing import image
import cv2
import numpy as np
import efficientnet.tfkeras as efn
import io
from urllib.request import urlopen

#0.96
def build_model_efficientnet():
    """Build and compile the two-class classifier on a frozen EfficientNetB0 backbone.

    The backbone is created with ImageNet weights and without its top; its output
    goes through global average pooling, a 512-unit ReLU dense layer, batch
    normalisation, another ReLU, dropout (0.5) and a 2-way softmax layer.
    The model summary is printed as a side effect.

    Returns:
        The compiled Keras ``Model``.
    """
    backbone = efn.EfficientNetB0(weights='imagenet', include_top=False)
    backbone.trainable = False

    # Layers are created in the same order as they are applied, so the
    # automatically generated layer names stay stable.
    head_layers = [
        GlobalAveragePooling2D(),
        Dense(512, activation='relu'),
        BatchNormalization(),
        Activation('relu'),
        Dropout(0.5),
        Dense(2, activation='softmax'),
    ]
    tensor = backbone.output
    for layer in head_layers:
        tensor = layer(tensor)

    model = Model(inputs=backbone.input, outputs=tensor)
    model.compile(
        loss='categorical_crossentropy',
        optimizer='adam',
        metrics=['accuracy'],
    )
    model.summary()
    return model


### LOADING MODEL
# Build the architecture first, then fetch the trained weights file with
# get_file and load it into the freshly built network.
model = build_model_efficientnet()
weights_path = get_file(
    origin='https://fancyshot.com/wp-content/uploads/model/cian_model_01_B0.h5',
    fname='cian_model_01_B0.h5',
)
model.load_weights(weights_path)
print('Model Loaded!')


def _fast_expand(img):
    """Turn an image into a batch of one: array conversion, division by 255.0, new leading axis."""
    return np.expand_dims(image.img_to_array(img) / 255.0, axis=0)


def pseudo_download_image(url):
    """Download an image into memory and decode it with OpenCV (nothing is written to disk).

    Args:
        url: address of the image to fetch.

    Returns:
        The array produced by ``cv2.imdecode`` with ``cv2.IMREAD_COLOR``.

    Raises:
        ValueError: if the response body is empty or cannot be decoded as an image.
        urllib.error.URLError: if the request itself fails.
    """
    # The context manager closes the HTTP response even when reading fails.
    with urlopen(url) as resp:
        payload = resp.read()
    if not payload:
        raise ValueError(f'Empty response while downloading {url}')

    encoded = np.asarray(bytearray(payload), dtype="uint8")
    decoded = cv2.imdecode(encoded, cv2.IMREAD_COLOR)
    # imdecode signals failure by returning None; fail here with a clear message
    # instead of letting a later OpenCV call choke on None.
    if decoded is None:
        raise ValueError(f'Could not decode an image from {url}')

    return decoded



def predict_image(url):
    """Score one image URL with the global ``model``.

    The image is downloaded, converted from BGR to RGB, resized to 320x320,
    turned into a one-element batch and passed to ``model.predict``.

    Args:
        url: address of the image to score.

    Returns:
        The first row of the prediction as a list.
    """
    side = 320
    bgr_pixels = pseudo_download_image(url)
    rgb_pixels = cv2.cvtColor(bgr_pixels, cv2.COLOR_BGR2RGB)
    batch = _fast_expand(cv2.resize(rgb_pixels, (side, side)))
    probabilities = model.predict(batch, batch_size=1)
    return [value for value in probabilities[0]]

# Smoke test: score one fixed listing photo to confirm the model pipeline runs.
testing_res = predict_image(
    'https://cdn-p.cian.site/images/8/766/147/'
    'kvartira-moskva-lomonosovskiy-prospekt-741667887-4.jpg'
)
print('Tested value: {}'.format(testing_res))

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            [(None, None, None,  0                                            
__________________________________________________________________________________________________
stem_conv (Conv2D)              (None, None, None, 3 864         input_1[0][0]                    
__________________________________________________________________________________________________
stem_bn (BatchNormalization)    (None, None, None, 3 128         stem_conv[0][0]                  
__________________________________________________________________________________________________
stem_activation (Activation)    (None, None, None, 3 0           stem_bn[0][0]                    
______________________________________________________________________________________________

In [None]:
#3. Predict for all flats from result page
from IPython.core.display import display, HTML

def show_results(scores, threshold=0.5):
    """Render the flats that passed the filter as an HTML table in the notebook.

    A flat is listed only when index 1 of its score list holds the highest
    value and that value is at least ``threshold``.

    Args:
        scores: dict mapping a flat URL to ``{'thumb': image_url, 'score': [s0, s1]}``.
        threshold: minimum winning score required for a flat to be shown.
    """
    # Local import keeps this function self-contained within its cell.
    from html import escape

    out_html = ['''<table border="1" cellpadding="5" width="100%" class="table table-striped">
                  <tr>
                  <th>Фотография</th>
                  <th>Ссылка</th>
                  <th>Оценка</th>
                  </tr>
                  <tbody>
                ''']

    for flat_url, entry in scores.items():
        f_score = entry['score']
        class_idx = f_score.index(max(f_score))
        if class_idx == 1 and max(f_score) >= threshold:
            # Both URLs come from a scraped page: escape them so quotes, angle
            # brackets or ampersands cannot break out of the attribute or markup.
            href = escape(str(flat_url), quote=True)
            img_thumb = escape(str(entry['thumb']), quote=True)
            out_html.append(f'''<tr>
            <td><a href="{href}" target="_blank">
            <img src="{img_thumb}" width=250 /></a>
            </td>
            <td><a href="{href}" target="_blank">{href}</a></td>
            <td>{f_score}</td></tr>''')

    out_html.append('</tbody></table>')
    print('->>> RESULT STARTS HERE <<<-')
    display(HTML(' '.join(out_html)))

    
### MAIN PART

def FilterAllShitPlease(cian_url, threshold=0.5):
    """Score every flat on a cian.ru results page and display the ones that pass.

    For each flat returned by ``load_flat_dicts`` the per-image scores from
    ``predict_image`` are averaged, a one-line verdict is printed, and the
    collected results are handed to ``show_results``.

    Args:
        cian_url: address of the cian.ru search results page.
        threshold: quality cut-off between 0 and 1 forwarded to ``show_results``.
    """
    classes = ['bad', 'good']
    result_scores = {}

    for flat_url, urls in load_flat_dicts(cian_url).items():
        # Without photos there is neither a thumbnail nor anything to average.
        if not urls:
            print(f'Skipping {flat_url}: no photos found.')
            continue

        total_score = [0, 0]
        for url in urls:
            total_score = [a + b for a, b in zip(total_score, predict_image(url))]

        final_score = [x / len(urls) for x in total_score]
        result_scores[flat_url] = {'score': final_score, 'thumb': urls[0]}
        print(f'Score for {flat_url} -> {final_score}: This is a {classes[final_score.index(max(final_score))]} flat.')

    # Show only the flats that passed the filter; threshold is the quality cut-off from 0 to 1.
    show_results(result_scores, threshold=threshold)
    

### MAGIC HAPPENS HERE

In [None]:
### MAGIC HAPPENS HERE
### Just replace the link below with the URL of your own search results.
### Open it in a browser first to make sure it really is a cian.ru results page.

CIAN_URL = 'https://www.cian.ru/cat.php?currency=2&deal_type=rent&engine_version=2&maxprice=55000&minprice=40000&offer_type=flat&region=1&room1=1&room2=1&room3=1&type=4'
# Minimum winning score a flat needs in order to be listed in the result table.
THRESHOLD = 0.66


# Scrape the page, score every flat and render the table of flats that passed.
FilterAllShitPlease(CIAN_URL, threshold=THRESHOLD)

Score for https://www.cian.ru/rent/flat/220860609/ -> [0.7360602642099062, 0.2639397239157309]: This is a bad flat.
Score for https://www.cian.ru/rent/flat/235619356/ -> [0.37351334777971107, 0.6264866789182028]: This is a good flat.
Score for https://www.cian.ru/rent/flat/235624373/ -> [0.7360017461081346, 0.26399821951054037]: This is a bad flat.
Score for https://www.cian.ru/rent/flat/180821260/ -> [0.57158595820268, 0.4284140144785245]: This is a bad flat.
Score for https://www.cian.ru/rent/flat/235113022/ -> [0.6475733295083046, 0.35242667064691585]: This is a bad flat.
Score for https://www.cian.ru/rent/flat/234678040/ -> [0.09020513373897769, 0.9097948571046194]: This is a good flat.
Score for https://www.cian.ru/rent/flat/235161665/ -> [0.5733294313152631, 0.42667058570077643]: This is a bad flat.
Score for https://www.cian.ru/rent/flat/234682438/ -> [0.484760251827538, 0.5152397487933437]: This is a good flat.
Score for https://www.cian.ru/rent/flat/227759804/ -> [0.4623637050

0,1,2
,https://www.cian.ru/rent/flat/234678040/,"[0.09020513373897769, 0.9097948571046194]"
,https://www.cian.ru/rent/flat/234146738/,"[0.11300702773709219, 0.8869929711023966]"
,https://www.cian.ru/rent/flat/234828601/,"[0.13064512889832258, 0.8693548738956451]"
,https://www.cian.ru/rent/flat/225380222/,"[0.14270060798965764, 0.8572994296749433]"
,https://www.cian.ru/rent/flat/235495514/,"[0.21441595877210298, 0.7855840822060903]"
,https://www.cian.ru/rent/flat/228317527/,"[0.020315752330740604, 0.9796842435995737]"
Фотография,Ссылка,Оценка
