* Сбор данных
 *    Кроллинг
 *    Парсинг
* Очистка данных
 *    Обработка пропусков
 *    Приведение к единому формату
* Разводочный анализ
* Моделирование и интерпретация результатов

In [3]:
import requests
from bs4 import BeautifulSoup

from fake_useragent import UserAgent

import pandas as pd
import time

from tqdm import tqdm_notebook
from ftfy import fix_text

import pickle
import matplotlib.pyplot as plt
import seaborn as sns
import math

# Кроулинг

In [29]:
with open('all_links.pkl', 'rb') as f:
    all_links = pickle.load(f)

In [32]:
# all_links

# Парсинг

In [49]:
page_link = "https://krisha.kz"
main_soup = get_soup("https://krisha.kz/a/show/57859710")

In [154]:
main_soup.find("div", attrs={'class': 'offer__price'}).text

## _get_price

In [153]:
def _get_price(main_soup):
    try:
        price = main_soup.find("div", attrs={'class': 'offer__price'}).text
        price = fix_text(price, normalization='NFKC')
        price = price.replace("〒", "").strip()
        price = price.replace(" ", "").strip()
        price = int(price)
    except:
        price = np.nan
    return price

_get_price(main_soup)

## _get_rooms

In [152]:
def _get_rooms(main_soup):
    try:
        tittle = main_soup.find("h1")
        tittle = tittle.text.strip()
        tittle = tittle.split(",")[0]
        tittle = tittle.replace("-комнатная квартира", "")
        tittle = int(tittle)
    except:
        tittle = np.nan
    return tittle

_get_rooms(main_soup)

## _get_house_name

In [151]:
def _get_house_name(main_soup):
    try:
        house_name = main_soup.find("div", attrs={'data-name': 'map.complex'})
        house_name = house_name.text.replace("Жилой комплекс", "").strip()
    except:
        house_name = np.nan
    return house_name

_get_house_name(main_soup)

## _get_square

In [150]:
def _get_square(main_soup):
    try:
        square = main_soup.find("div", {"data-name": "live.square"}).text
        square = [int(s) for s in square.split() if s.isdigit()]
    except:
        square = np.nan
    
    return square

_get_square(main_soup)

## _get_ceiling

In [149]:
def _get_ceiling(main_soup):
    try:
        ceiling = main_soup.find("div", {"data-name": "ceiling"})
        ceiling = ceiling.text.replace("Потолки", "").replace("м", "").strip()
        ceiling = int(ceiling)
    except:
        ceiling = np.nan
    return ceiling

_get_ceiling(main_soup)

## _get_year_built

In [148]:
def _get_year_built(main_soup):
    try:
        year_built = main_soup.find("div", {"data-name": "flat.building"}).text
        year_built = [int(s) for s in year_built.split() if s.isdigit()][0]
    except:
        year_built = np.nan
    return year_built

_get_year_built(main_soup)

## _get_floor

In [147]:
def _get_floor(main_soup):
    try:
        floor = main_soup.find("div", {"data-name": "flat.floor"})
        floor = floor.text.replace("Этаж", "").strip().split(" из ")
    except:
        floor = np.nan
        
    return floor

_get_floor(main_soup)

## _get_renovation

In [146]:
def _get_renovation(main_soup):
    try:
        renovation = main_soup.find("div", {"data-name": "flat.renovation"})
        renovation = renovation.text.replace("Состояние", "").strip()
    except:
        renovation = np.nan
    return renovation

_get_renovation(main_soup)

# Объединение

In [60]:
def get_page_info(link):
    main_soup = get_soup(page_link + link)
    
    price = _get_price(main_soup)
    renovation = _get_renovation(main_soup)
    floor = _get_floor(main_soup)
    year_built = _get_year_built(main_soup)
    ceiling= _get_ceiling(main_soup)
    square = _get_square(main_soup)
    rooms = _get_rooms(main_soup)
    house_name = _get_house_name(main_soup)
    
    
    info = {
        'price': price,
        'renovation':renovation,
        'floor':floor,
        'year_built':year_built,
        'ceiling':ceiling,
        'square':square,
        'rooms':rooms,
        'house_name':house_name,
    }
    
    return info

In [145]:
get_page_info('/a/show/57859710')

In [64]:
news_dataset = pd.DataFrame(columns=['renovation', 'floor', 'year_built', 'ceiling', 'square', 'rooms', 'house_name'])

for link in tqdm_notebook(all_links):
    news_dataset = news_dataset.append(get_page_info(link), ignore_index=True)
    time.sleep(0.1)

# Очистка данных

In [144]:
data = pd.read_pickle("news_dataset.pkl").drop("priv_dorm", axis=1)
print(data.shape)
data.head()

## Обработка пропусков

In [67]:
data = data[data.price.isna()==False]

In [143]:
data.price = data.price / 1000000
data.head(3)

In [69]:
data.square.fillna("[]", inplace=True)
data.floor.fillna("[]", inplace=True)

In [71]:
# data.ceiling.value_counts()

In [142]:
data.ceiling = data.ceiling.fillna(2.75)
data.head(3)

In [141]:
data.house_name.fillna("Unknown", inplace=True)
data.head(3)

In [140]:
data.renovation.fillna("Unknown", inplace=True)
data.head(3)

## Приведение к единому формату / Новые переменные

In [139]:
data.head(3)

In [138]:
data["house_age"] = 2020 - data["year_built"].astype(int)
data.head(7)

In [137]:
square_total, square_living, square_kitchen = [], [], []
for ind, row in data.iterrows():
    if "]" in row["square"]:
        i = []
    else:
        i = row["square"]
        j = int(row["rooms"]) * 30
    square_n = len(i)
    if square_n==0:
        square_total.append(j)
        square_living.append(j * 70 / 100)
        square_kitchen.append(j * 20 / 100)
    elif square_n==1:
        square_total.append(i[0])
        square_living.append(j * 70 / 100)
        square_kitchen.append(j * 20 / 100)
    elif square_n==2:
        square_total.append(i[0])
        square_living.append(i[1])
        square_kitchen.append(j * 20 / 100)
    elif square_n==3:
        square_total.append(i[0])
        square_living.append(i[1])
        square_kitchen.append(i[2])
        
data["square_total"] = square_total
data["square_living"] = square_living
data["square_kitchen"] = square_kitchen

data.head()

In [136]:
floor_flat, floors_total = [], []
for i in data.floor:
    if i == '[]':
        i = []
    floor_n = len(i)
    if floor_n == 0:
        floor_flat.append(np.nan)
        floors_total.append(np.nan)
    elif floor_n==1:
        floor_flat.append(int(i[0]))
        floors_total.append(np.nan)
    elif floor_n==2:
        floor_flat.append(int(i[0]))
        floors_total.append(int(i[1]))
        
data["floor_flat"] = floor_flat
data["floors_total"] = floors_total

data.head()

In [81]:
data.year_built = data.year_built.astype(int)
data.rooms = data.rooms.astype(int)

In [135]:
data.drop(["floor", "square", "year_built"], axis=1, inplace=True)
data.head(3)

# Разведочный анализ

In [134]:
data.info()
data[data.square_total==0]

In [133]:
data["price_square"] = round(data["price"] * 1000000 / data["square_total"],2)
data.head()

In [132]:
data.house_name.value_counts().to_frame("cnt").sort_values(by="cnt" , ascending=False).head(15)

In [131]:
# 1х комнатная
table = data[data.rooms==1].groupby("house_name").agg({"price": "mean"}).reset_index(
).sort_values(by="price", ascending=False).head(20)
plt.figure(figsize=(15,7))
sns.barplot(data=table, x="price", y = "house_name")
plt.show()

In [130]:
# 2х комнатная
table = data[data.rooms==2].groupby("house_name").agg({"price": "mean"}).reset_index(
).sort_values(by="price", ascending=False).head(20)
plt.figure(figsize=(15,7))
sns.barplot(data=table, x="price", y = "house_name")
plt.show()

In [129]:
#3х комнатная
table = data[data.rooms==3].groupby("house_name").agg({"price": "mean"}).reset_index(
).sort_values(by="price", ascending=False).head(20)
plt.figure(figsize=(15,7))
sns.barplot(data=table, x="price", y = "house_name")
plt.show()

In [128]:
# 4х комнатная
table = data[data.rooms==4].groupby("house_name").agg({"price": "mean"}).reset_index(
).sort_values(by="price", ascending=False).head(20)
plt.figure(figsize=(15,7))
sns.barplot(data=table, x="price", y = "house_name")
plt.show()

In [127]:
for i in range(1,7):
    print("{} - комнатная".format(i))
    plt.figure(figsize=(15,3))
    sns.boxplot(data=data[data.rooms==i][["price"]], orient="h")
    plt.show()

# Моделирование и интерпретация результатов

In [97]:
from sklearn.linear_model import LinearRegression
from catboost import CatBoostRegressor
from sklearn.preprocessing import StandardScaler

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

In [126]:
data.info()

In [99]:
data.drop("house_name", axis=1, inplace=True)

In [100]:
data.dropna(inplace=True)

In [101]:
scaler = StandardScaler()
m_linear = LinearRegression()

In [102]:
data_renovation = pd.get_dummies(data["renovation"])

In [103]:
df = pd.concat([data, data_renovation], axis=1).drop("renovation", axis=1)

In [125]:
df = df[df.price<150]
df.head()

In [105]:
X = df.drop("price", axis=1)
y = df["price"]

In [106]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20)
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

(7632, 16) (1908, 16) (7632,) (1908,)


In [121]:
m_linear.fit(X_train, y_train)

In [108]:
def model_scores(modelName):
    y_pred = modelName.predict(X_test)
    mse = "\nMSE: {:.4f}".format(mean_squared_error(y_pred, y_test))
    rmse = "\nMSE: {:.4f}".format(math.sqrt(mean_squared_error(y_pred, y_test)))
    return mse, rmse

In [122]:
model_scores(m_linear)

In [110]:
## Результат

In [111]:
y_pred = m_linear.predict(X_test)

result = pd.DataFrame([y_pred, y_test.values]).T
result.columns = ["actual", "predicted"]
result = result.reset_index()
result = pd.melt(result, id_vars=["index"], value_vars=["actual", "predicted"])

In [112]:
result = result.sort_values(by="index").head(50)

In [120]:
plt.figure(figsize=(15,7))
sns.barplot(x="index", hue="variable", y="value", data=result)

In [118]:
Importance = m_linear.coef_
Columns = X_train.columns

imp = pd.DataFrame([Columns, Importance]).T
imp.columns = ["feature", "coef"]
imp.sort_values(by="coef", inplace=True, ascending=False)
imp.head()

Unnamed: 0,feature,coef
10,евроремонт,3.1367
9,Unknown,0.754714
3,square_total,0.531046
0,ceiling,0.358674
15,черновая отделка,0.308715


In [119]:
plt.figure(figsize=(15,6))
sns.barplot(data=imp, x="coef", y="feature")