* Сбор данных
 *    Краулинг
 *    Парсинг
* Очистка данных
 *    Обработка пропусков
 *    Приведение к единому формату
* Разведочный анализ
* Моделирование и интерпретация результатов

In [162]:
import math
import pickle
import re
import time

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import requests
import seaborn as sns
from bs4 import BeautifulSoup
from fake_useragent import UserAgent
from ftfy import fix_text
from tqdm import tqdm_notebook

# Краулинг

In [2]:
# Listing URL for the crawl. It ends with "page=" so that a page number can be
# appended directly (see get_link, which concatenates str(page_number)).
main_link = "https://krisha.kz/prodazha/kvartiry/astana-esilskij/?das[_sys.hasphoto]=1&das[checked]=1&das[novostroiki]=1&das[who]=1&page="

In [3]:
# Exploratory request against the listing URL with a Chrome User-Agent header.
# NOTE(review): this module-level `responce` is not referenced again in the
# visible cells (get_soup uses its own local) — confirm it can be removed.
responce = requests.get(main_link, headers={"User-Agent": UserAgent().chrome})

In [4]:
def get_soup(page_link, timeout=30):
    """Download a page and parse it into a BeautifulSoup tree.

    Args:
        page_link: Absolute URL of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` can block indefinitely, which would stall
            the whole crawl on a single unresponsive page.

    Returns:
        A ``BeautifulSoup`` object built from the raw response body with the
        ``"html.parser"`` backend.

    Raises:
        requests.exceptions.Timeout: If the server does not answer in time.
    """
    response = requests.get(
        page_link,
        headers={"User-Agent": UserAgent().chrome},
        timeout=timeout,
    )
    return BeautifulSoup(response.content, "html.parser")

In [5]:
# Fetch listing page 2 as a sample to explore the markup.
soup = get_soup("{}{}".format(main_link, 2))

In [6]:
# Anchors of the offer cards on the sample listing page. The class filter is
# written as an explicit {"class": ...} mapping; the one-element set passed
# before presumably only worked through Beautiful Soup's shorthand that treats
# a non-dict `attrs` value as a CSS class.
link_raw = soup.find_all("a", attrs={"class": "a-card__title"})

In [7]:
# Keep only the href of every offer anchor.
link = [anchor["href"] for anchor in link_raw]

In [8]:
def get_link(page_number):
    """Return the offer hrefs found on one listing page.

    Args:
        page_number: Page index appended to ``main_link``.

    Returns:
        A list with the ``href`` attribute of every ``<a>`` element whose CSS
        class is ``a-card__title``; empty when the page has no such anchors.
    """
    listing = get_soup(main_link + str(page_number))
    # Explicit {"class": ...} mapping instead of the set literal used before.
    anchors = listing.find_all("a", attrs={"class": "a-card__title"})
    return [anchor["href"] for anchor in anchors]
    

In [213]:
get_link(5)

In [12]:
# Flat list of offer hrefs gathered from the listing pages.
all_links = []
for i in range(1000):
    # extend, not append: get_link returns a list, and the parsing loop later
    # concatenates every element of all_links to the site root as a string.
    all_links.extend(get_link(i))

# all_links

In [13]:
# Load a previously stored list of links from disk.
# NOTE(review): no visible cell writes all_links.pkl — confirm where it is produced.
# pickle.load can execute arbitrary code; only load files you created yourself.
with open('all_links.pkl', 'rb') as f:
    all_links = pickle.load(f)

In [214]:
len(all_links)

# Парсинг

In [16]:
# Site root; offer hrefs are appended to it to form absolute URLs.
page_link = "https://krisha.kz"
# Sample offer page used to try out the parsers below.
main_soup = get_soup(page_link + "/a/show/57857347")

In [215]:
main_soup.find("div", attrs={"class": "offer__price"}).text

## _get_price

In [216]:
def _get_price(main_soup):
    """Extract the asking price from an offer page.

    Args:
        main_soup: Parsed offer page.

    Returns:
        The price as an ``int`` with the currency sign and spaces removed, or
        ``np.nan`` when the price block is missing or is not a plain number.
    """
    try:
        price = main_soup.find("div", attrs={"class": "offer__price"}).text
        # Normalise first so that the plain-space replace below can drop the
        # digit separators (presumably non-breaking spaces on the live page).
        price = fix_text(price, normalization="NFKC")
        price = price.replace("〒", "").replace(" ", "").strip()
        return int(price)
    except (AttributeError, ValueError):
        # AttributeError: find() returned None; ValueError: non-numeric text.
        # Anything else (including Ctrl-C during a long crawl) propagates.
        return np.nan

_get_price(main_soup)

## _get_rooms

In [217]:
def _get_rooms(main_soup):
    """Extract the number of rooms from the offer title.

    The first comma-separated part of the ``<h1>`` text is expected to look
    like ``"2-комнатная квартира"``.

    Args:
        main_soup: Parsed offer page.

    Returns:
        The room count as an ``int``, or ``np.nan`` when there is no ``<h1>``
        or its first part is not in the expected form.
    """
    try:
        title = main_soup.find("h1").text.strip()
        rooms = title.split(",")[0].replace("-комнатная квартира", "")
        return int(rooms)
    except (AttributeError, ValueError):
        # AttributeError: no <h1>; ValueError: leftover text is not a number.
        return np.nan

_get_rooms(main_soup)

## _get_house_name

In [218]:
def _get_house_name(main_soup):
    """Extract the residential complex name from an offer page.

    Args:
        main_soup: Parsed offer page.

    Returns:
        The text of the ``data-name="map.complex"`` block without the
        ``"Жилой комплекс"`` label and surrounding whitespace, or ``np.nan``
        when the block is absent.
    """
    try:
        raw_text = main_soup.find("div", attrs={"data-name": "map.complex"}).text
    except AttributeError:
        # find() returned None: the page has no complex block.
        return np.nan
    return raw_text.replace("Жилой комплекс", "").strip()

_get_house_name(main_soup)

## _get_square

In [219]:
def _get_square(main_soup):
    """Extract the numeric area values from an offer page.

    Args:
        main_soup: Parsed offer page.

    Returns:
        A list with every number found in the ``data-name="live.square"``
        block, in page order (whole numbers as ``int``, fractional ones as
        ``float``); an empty list when the block holds no numbers; ``np.nan``
        when the block is absent.
    """
    try:
        text = main_soup.find("div", {"data-name": "live.square"}).text
    except AttributeError:
        # find() returned None: the page has no area block.
        return np.nan

    values = []
    for token in text.split():
        # Drop list punctuation glued to the number ("45.5," -> "45.5").
        token = token.strip(",;")
        # str.isdigit() rejected fractional areas, which silently shifted the
        # remaining values one position to the left; accept "45.5" / "45,5".
        if not re.fullmatch(r"\d+(?:[.,]\d+)?", token):
            continue
        number = float(token.replace(",", "."))
        values.append(int(number) if number.is_integer() else number)
    return values

_get_square(main_soup)

## _get_ceiling

In [220]:
def _get_ceiling(main_soup):
    """Extract the ceiling height from an offer page.

    Args:
        main_soup: Parsed offer page.

    Returns:
        The number left in the ``data-name="ceiling"`` block after removing
        the ``"Потолки"`` label and the ``"м"`` unit, as a ``float``; ``np.nan``
        when the block is absent or the remainder is not a number.
    """
    try:
        text = main_soup.find("div", {"data-name": "ceiling"}).text
        text = text.replace("Потолки", "").replace("м", "").strip()
        # float, not int: int("2.7") raises, so every fractional height used
        # to be reported as missing. A decimal comma is accepted as well.
        return float(text.replace(",", "."))
    except (AttributeError, ValueError):
        # AttributeError: no ceiling block; ValueError: non-numeric text.
        return np.nan

_get_ceiling(main_soup)

## _get_year_built

In [221]:
def _get_year_built(main_soup):
    """Extract the construction year from an offer page.

    Args:
        main_soup: Parsed offer page.

    Returns:
        The first whitespace-separated all-digit token of the
        ``data-name="flat.building"`` block as an ``int``, or ``np.nan`` when
        the block is absent or contains no such token.
    """
    try:
        text = main_soup.find("div", {"data-name": "flat.building"}).text
    except AttributeError:
        # find() returned None: the page has no building block.
        return np.nan
    for token in text.split():
        # isdecimal() only accepts characters int() can parse; isdigit() also
        # accepts e.g. a lone superscript "²", which made int() fail and
        # discarded the whole result.
        if token.isdecimal():
            return int(token)
    return np.nan

_get_year_built(main_soup)

## _get_floor

In [222]:
def _get_floor(main_soup):
    """Extract the floor information from an offer page.

    Args:
        main_soup: Parsed offer page.

    Returns:
        The ``data-name="flat.floor"`` text without the ``"Этаж"`` label,
        split on ``" из "`` — i.e. a list of strings such as ``["5", "9"]`` —
        or ``np.nan`` when the block is absent.
    """
    try:
        text = main_soup.find("div", {"data-name": "flat.floor"}).text
    except AttributeError:
        # find() returned None: the page has no floor block.
        return np.nan
    return text.replace("Этаж", "").strip().split(" из ")

_get_floor(main_soup)

## _get_renovation

In [223]:
def _get_renovation(main_soup):
    """Extract the renovation state from an offer page.

    Args:
        main_soup: Parsed offer page.

    Returns:
        The text of the ``data-name="flat.renovation"`` block without the
        ``"Состояние"`` label and surrounding whitespace, or ``np.nan`` when
        the block is absent.
    """
    try:
        text = main_soup.find("div", {"data-name": "flat.renovation"}).text
    except AttributeError:
        # find() returned None: the page has no renovation block.
        return np.nan
    return text.replace("Состояние", "").strip()

_get_renovation(main_soup)

# Объединение

In [26]:
def get_page_info(link):
    """Download one offer page and collect all parsed fields.

    Args:
        link: Offer href; it is appended to the module-level ``page_link``.

    Returns:
        A dict with the keys ``price``, ``renovation``, ``floor``,
        ``year_built``, ``ceiling``, ``square``, ``rooms`` and ``house_name``,
        each holding the result of the matching ``_get_*`` parser.
    """
    offer_page = get_soup(page_link + link)
    return {
        'price': _get_price(offer_page),
        'renovation': _get_renovation(offer_page),
        'floor': _get_floor(offer_page),
        'year_built': _get_year_built(offer_page),
        'ceiling': _get_ceiling(offer_page),
        'square': _get_square(offer_page),
        'rooms': _get_rooms(offer_page),
        'house_name': _get_house_name(offer_page),
    }

In [224]:
get_page_info('/a/show/57300188')

In [29]:
# Parse every offer and build the frame once at the end: DataFrame.append was
# removed in pandas 2.0 and re-copied the whole frame on each call.
rows = []
for link in tqdm_notebook(all_links):
    rows.append(get_page_info(link))
    time.sleep(0.1)  # short pause between requests

# 'price' is listed explicitly; before, it only appeared as a side effect of
# appending dicts that carried the extra key.
news_dataset = pd.DataFrame(
    rows,
    columns=['renovation', 'floor', 'year_built', 'ceiling', 'square', 'rooms', 'house_name', 'price'],
)

# Очистка данных

In [225]:
# Load the stored dataset and show its dimensions and first rows.
# NOTE(review): no visible cell writes news_dataset.pkl — confirm where it is produced.
data = pd.read_pickle("news_dataset.pkl")
print(data.shape)
data.head()

## Обработка пропусков

In [52]:
# Keep only offers with a known price. The explicit copy makes `data` an
# independent frame, so the column assignments in the following cells do not
# operate on a slice of the original (SettingWithCopyWarning).
data = data[data.price.notna()].copy()

In [226]:
# Rescale the price column by a factor of one million.
data["price"] = data["price"] / 1000000
data.head(3)

In [54]:
# Mark missing area / floor values with the "[]" placeholder string that the
# splitting loops further down check for. Plain assignment is used instead of
# a chained `inplace=True` call, which acts on an intermediate Series and is
# not guaranteed to update the frame (and warns in recent pandas versions).
data["square"] = data["square"].fillna("[]")
data["floor"] = data["floor"].fillna("[]")

In [227]:
# Replace missing ceiling heights with the default value 2.75.
data["ceiling"] = data["ceiling"].fillna(2.75)
data.head(3)

In [228]:
# Label offers without a complex name as "Unknown". Assignment form instead
# of a chained `inplace=True` call, matching the ceiling cell.
data["house_name"] = data["house_name"].fillna("Unknown")
data.head(3)

In [229]:
# Label offers without a renovation state as "Unknown". Assignment form
# instead of a chained `inplace=True` call, matching the ceiling cell.
data["renovation"] = data["renovation"].fillna("Unknown")
data.head(3)

## Приведение к единому формату

In [230]:
data.head(3)

In [231]:
# Building age relative to the hard-coded reference year 2020.
# NOTE(review): astype(int) raises on NaN, and no visible cell fills or drops
# missing year_built values — confirm the column is complete at this point.
data["house_age"] = 2020 - data["year_built"].astype(int)
data.head(3)

In [232]:
# Split the parsed "square" values into total / living / kitchen columns,
# falling back to a per-room estimate for whatever is missing.
square_total, square_living, square_kitchen = [], [], []
for _, row in data.iterrows():
    areas = row["square"]
    # Anything that is not a real list (the "[]" placeholder string written
    # by the fillna cell, or a leftover NaN) means "no values parsed".
    if not isinstance(areas, (list, tuple)):
        areas = []

    # Fallback estimate: 30 per room. It is computed for every row; before,
    # it was only set in the non-missing branch, so a row without areas reused
    # the previous row's estimate (or raised NameError on the first row).
    estimate = int(row["rooms"]) * 30

    # Exactly one value per column and row, whatever the list length: values
    # past the third are ignored instead of leaving the lists short.
    square_total.append(areas[0] if len(areas) >= 1 else estimate)
    square_living.append(areas[1] if len(areas) >= 2 else estimate * 70 / 100)
    square_kitchen.append(areas[2] if len(areas) >= 3 else estimate * 20 / 100)

data["square_total"] = square_total
data["square_living"] = square_living
data["square_kitchen"] = square_kitchen

data.head()

In [233]:
# Split the parsed floor value into "floor of the flat" and "floors in the
# building"; whatever is not available becomes NaN.
floor_flat, floors_total = [], []
for raw in data.floor:
    parts = [] if raw == '[]' else raw  # '[]' is the missing-value placeholder
    count = len(parts)
    if count > 2:
        # Same as the if/elif chain this replaces: longer lists add nothing.
        continue
    floor_flat.append(int(parts[0]) if count >= 1 else np.nan)
    floors_total.append(int(parts[1]) if count == 2 else np.nan)

data["floor_flat"] = floor_flat
data["floors_total"] = floors_total

data.head()

In [62]:
# Cast the year and room-count columns to integers.
data["year_built"] = data["year_built"].astype(int)
data["rooms"] = data["rooms"].astype(int)

In [234]:
# Drop the raw columns that have been expanded into derived features above
# (floor -> floor_flat/floors_total, square -> square_*, year_built -> house_age).
# NOTE(review): "priv_dorm" is not created in any visible cell — presumably it
# comes from the pickled dataset; drop() raises KeyError if it is absent.
data.drop(["priv_dorm", "floor", "square", "year_built"], axis=1, inplace=True)
data.head(3)

In [235]:
data.info()

# Разведочный анализ

In [236]:
# Column overview, then list the rows whose total area is zero (the next cell
# divides by square_total).
data.info()
data[data.square_total==0]

In [80]:
# Price per unit of total area, with the price scaled back by one million,
# rounded to two decimals.
data["price_square"] = (data["price"] * 1000000 / data["square_total"]).round(2)

In [238]:
# Ten complexes with the most offers.
(
    data["house_name"]
    .value_counts()
    .to_frame("cnt")
    .sort_values(by="cnt", ascending=False)
    .head(10)
)

In [239]:
# One-room flats: the 20 complexes with the highest mean price.
table = (
    data.loc[data["rooms"] == 1]
    .groupby("house_name")
    .agg({"price": "mean"})
    .reset_index()
    .sort_values(by="price", ascending=False)
    .head(20)
)
plt.figure(figsize=(15, 7))
sns.barplot(data=table, x="price", y="house_name")
plt.show()

In [240]:
# Two-room flats: the 20 complexes with the highest mean price.
table = (
    data.loc[data["rooms"] == 2]
    .groupby("house_name")
    .agg({"price": "mean"})
    .reset_index()
    .sort_values(by="price", ascending=False)
    .head(20)
)
plt.figure(figsize=(15, 7))
sns.barplot(data=table, x="price", y="house_name")
plt.show()

In [241]:
# Three-room flats: the 20 complexes with the highest mean price.
table = (
    data.loc[data["rooms"] == 3]
    .groupby("house_name")
    .agg({"price": "mean"})
    .reset_index()
    .sort_values(by="price", ascending=False)
    .head(20)
)
plt.figure(figsize=(15, 7))
sns.barplot(data=table, x="price", y="house_name")
plt.show()

In [242]:
# Four-room flats: the 20 complexes with the highest mean price.
table = (
    data.loc[data["rooms"] == 4]
    .groupby("house_name")
    .agg({"price": "mean"})
    .reset_index()
    .sort_values(by="price", ascending=False)
    .head(20)
)
plt.figure(figsize=(15, 7))
sns.barplot(data=table, x="price", y="house_name")
plt.show()

In [243]:
# Price distribution for each room count from 1 to 6.
for i in range(1, 7):
    print("{} - комнатная".format(i))
    plt.figure(figsize=(15, i))
    # Filter by the loop variable; the filter used to be hard-coded to 4, so
    # all six plots showed the four-room flats under different headings.
    sns.boxplot(data=data[data.rooms == i][["price"]], orient="h")
    plt.show()

# Моделирование и интерпретация результатов

In [133]:
from sklearn.linear_model import LinearRegression
from catboost import CatBoostRegressor
from sklearn.preprocessing import StandardScaler

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

In [None]:
data.info()

In [150]:
# The complex name is not used as a model feature.
data.drop(columns=["house_name"], inplace=True)

In [179]:
# Discard every row that still has a missing value in any column.
data.dropna(axis=0, how="any", inplace=True)

In [180]:
# NOTE(review): `scaler` is created but never applied in the visible cells, so
# the regression below is fitted on unscaled features — confirm this is intended.
scaler = StandardScaler()
m_linear = LinearRegression()

In [182]:
# One indicator column per renovation category.
data_renovation = pd.get_dummies(data.renovation)

In [183]:
# Swap the text column for its indicator columns.
df = pd.concat([data.drop("renovation", axis=1), data_renovation], axis=1)

In [252]:
# Keep only the offers whose price is below 150.
df = df.loc[df["price"] < 150]
df.head()

In [185]:
# Target column and feature matrix (everything except the target).
y = df["price"]
X = df.drop(columns="price")

In [245]:
# Hold out 20% of the rows for evaluation. A fixed random_state makes the
# split — and therefore the scores, plots and coefficients below — the same
# on every run of the notebook.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.20, random_state=42
)
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

In [244]:
# Fit the linear regression on the training split.
m_linear.fit(X_train, y_train)

In [189]:
def model_scores(modelName):
    """Score a fitted model on the module-level test split.

    Args:
        modelName: Fitted estimator exposing ``predict``.

    Returns:
        A tuple of two formatted strings: the mean squared error and the
        root mean squared error of the predictions for ``X_test`` against
        ``y_test``, each with four decimals.
    """
    y_pred = modelName.predict(X_test)
    # Computed once; arguments in the documented (y_true, y_pred) order.
    mse_value = mean_squared_error(y_test, y_pred)
    mse = "\nMSE: {:.4f}".format(mse_value)
    # This line used to carry the "MSE" label as well.
    rmse = "\nRMSE: {:.4f}".format(math.sqrt(mse_value))
    return mse, rmse

In [246]:
model_scores(m_linear)

In [192]:
## Результат

In [194]:
y_pred = m_linear.predict(X_test)

# Long-format table of actual vs. predicted values for plotting. The columns
# are built by name: previously the frame was created as [y_pred, y_test] and
# then labelled ["actual", "predicted"], i.e. with the two labels swapped.
result = pd.DataFrame({"actual": y_test.values, "predicted": y_pred})
result = result.reset_index()
result = pd.melt(result, id_vars=["index"], value_vars=["actual", "predicted"])

In [195]:
# Order by sample index and keep the first 50 rows for the chart.
result = result.sort_values(by="index")[:50]

In [247]:
# Side-by-side bars per sample, coloured by the "variable" column.
plt.figure(figsize=(15, 7))
sns.barplot(data=result, x="index", y="value", hue="variable")

In [200]:
# Feature names and the fitted regression coefficients, in matching order.
Columns = X_train.columns
Importance = m_linear.coef_

In [248]:
# Table of coefficients per feature, largest first.
imp = pd.DataFrame([Columns, Importance]).T.rename(columns={0: "feature", 1: "coef"})
imp = imp.sort_values(by="coef", ascending=False)
imp.head()

In [251]:
# Horizontal bar chart of the coefficient of every feature.
plt.figure(figsize=(15, 6))
sns.barplot(x="coef", y="feature", data=imp)