In [4]:
import vk_api
import re
from datetime import date
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import cv2
import urllib.request
import numpy as np
from matplotlib import pyplot as plt
from dotenv import load_dotenv
import os
%matplotlib inline

In [23]:
# Read variables from a local .env file (if present) into the process environment.
load_dotenv()

# Fail fast with a readable message: without this check a missing token is
# passed on as token=None and only surfaces later as an API error.
access_token = os.environ.get('VK_ACCESS_TOKEN')
if not access_token:
    raise RuntimeError(
        "VK_ACCESS_TOKEN is not set; define it in .env or in the environment"
    )

vk_session = vk_api.VkApi(token=access_token, api_version="5.131")
# `vk` is the API handle used by download_vk_data().
vk = vk_session.get_api()

In [None]:
# VK communities to analyse: display name plus the short address ("domain")
# that is passed to the API as the group id.
communities = [
    {"name": title, "domain": short_name}
    for title, short_name in (
        ("Город Орел! Орловчане ВКонтакте", "vk_orel"),
        ("«АВТОСТИКЕР» - Наклейки на авто | Аксессуары", "autosticker57"),
        ("ИНЦИДЕНТ | ОРЁЛ", "orel_onlain"),
        ("Найди меня - Орёл", "naidi_orel"),
        ("Интересный город Орел | Орловчане!", "interesting_orel"),
    )
]

def download_vk_data(communities):
    """Fetch the member lists of the given communities into one DataFrame.

    Uses the module-level ``vk`` API handle and pages through
    ``groups.get_members`` 1000 members at a time until an empty page
    is returned.

    Parameters
    ----------
    communities : iterable of dict
        Each entry must have a ``"domain"`` key, used as the group id.

    Returns
    -------
    pandas.DataFrame
        One row per (member, community) with columns ``id``, ``group_id``,
        ``bdate`` and ``photo_200``; the last two are None when the API
        response does not contain them.
    """
    page_size = 1000
    records = []
    for community in communities:
        offset = 0
        while True:
            page = vk.groups.get_members(
                group_id=community['domain'],
                offset=offset,
                fields="id,bdate,photo_200",
                count=page_size,
            )
            # Progress trace: which community and which page was just fetched.
            print(community['domain'], offset)
            if len(page['items']) <= 0:
                break
            records.extend(
                {
                    "id": member["id"],
                    "group_id": community["domain"],
                    "bdate": member.get("bdate"),
                    "photo_200": member.get("photo_200"),
                }
                for member in page["items"]
            )
            offset += page_size
    return pd.DataFrame(records, columns=['id', 'group_id', 'bdate', 'photo_200'])

# Download every community's member list (network-bound) ...
df = download_vk_data(communities=communities)
# ... and keep a raw copy on disk so later cells can start from the CSV.
df.to_csv(path_or_buf="data.csv", index=False)

In [2]:
def try_parse_date(date_str):
    """Parse a ``D.M.YYYY`` birth date string into a ``datetime.date``.

    Parameters
    ----------
    date_str : str or None
        Text such as ``"7.11.1990"``. Day and month may have one or two
        digits; the year must have exactly four.

    Returns
    -------
    datetime.date or None
        The parsed date, or None when the input is empty, does not have
        the ``D.M.YYYY`` shape (e.g. a year-less ``"7.11"``), or names a
        day that does not exist (e.g. ``"31.2.2000"``).
    """
    if not date_str:
        return None
    matched = re.match(r"^(\d{1,2})[.](\d{1,2})[.](\d{4})$", date_str)
    if matched is None:
        return None
    day, month, year = (int(part) for part in matched.groups())
    try:
        return date(year, month, day)
    except ValueError:
        # The pattern only checks the shape; date() rejects out-of-range
        # components such as month 13, 31 February or year 0.
        return None
    
def make_haar_cascade_clf():
    """Create the frontal-face Haar cascade classifier.

    Looks for ``haarcascade_frontalface_default.xml`` in the working
    directory first and then, when the installed OpenCV build exposes
    ``cv2.data.haarcascades``, in that bundled directory.

    Returns
    -------
    cv2.CascadeClassifier or None
        A loaded classifier, or None when no cascade file could be loaded.
    """
    xml_name = "haarcascade_frontalface_default.xml"
    candidates = [xml_name]
    # NOTE(review): cv2.data.haarcascades is expected on the pip OpenCV
    # wheels; the getattr guard keeps this working on builds without it.
    bundled_dir = getattr(getattr(cv2, "data", None), "haarcascades", None)
    if bundled_dir:
        candidates.append(os.path.join(bundled_dir, xml_name))

    for xml_path in candidates:
        try:
            clf = cv2.CascadeClassifier(xml_path)
        except Exception:
            continue
        # Constructing a classifier from a missing/invalid file does not
        # raise; it yields an empty classifier, so check explicitly.
        if not clf.empty():
            return clf
    return None

def has_faces(url, clf):
    """Tell whether the image at ``url`` contains at least one detected face.

    Detection is best-effort: any download, decoding or detection error
    yields False rather than an exception.

    Parameters
    ----------
    url : str
        Address of the image to download.
    clf : cv2.CascadeClassifier or None
        Classifier used for detection; a falsy value short-circuits to False.

    Returns
    -------
    bool
        True when the classifier reports one or more faces, otherwise False.
    """
    if not clf:
        return False
    try:
        # The context manager closes the HTTP response even if read() fails.
        with urllib.request.urlopen(url) as response:
            payload = response.read()
        arr = np.asarray(bytearray(payload), dtype=np.uint8)
        img = cv2.imdecode(arr, cv2.IMREAD_UNCHANGED)
        if img is None:
            # The payload was not a decodable image.
            return False
        # IMREAD_UNCHANGED keeps the stored channel layout, so a grayscale
        # file arrives 2-D; BGR2GRAY would raise on it, so use it as is.
        if img.ndim == 2:
            gray_img = img
        else:
            gray_img = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
        faces = clf.detectMultiScale(gray_img, 1.3, 4)
        return len(faces) > 0
    except Exception:
        # Exception (not a bare except) keeps the best-effort contract but
        # lets KeyboardInterrupt stop a long-running batch.
        return False

clf = make_haar_cascade_clf()
if clf is None:
    # has_faces() returns False for every row without a classifier; say so
    # instead of silently producing an all-False column.
    print("Haar cascade classifier is unavailable: has_face will be False for every user")

df = pd.read_csv("data.csv", delimiter=",")
# str() also covers cells that read_csv parsed as numbers or NaN; those do
# not match the D.M.YYYY pattern and become None.
df['bdate'] = df['bdate'].apply(lambda elem: try_parse_date(str(elem)))

# Each distinct avatar URL is downloaded once: the raw data has one row per
# (member, community), so the same URL may repeat across rows.
photo_urls = df['photo_200'].dropna().unique()
face_by_url = {url: has_faces(str(url), clf) for url in photo_urls}
# Rows without a photo URL are not in the lookup and default to False.
df['has_face'] = df['photo_200'].map(lambda url: face_by_url.get(url, False)).astype(bool)
df.to_csv("data_processed.csv", index=False)

In [None]:
# Per-community bar chart: total members, members with a full birth date,
# and members whose avatar contains a detected face.
df = pd.read_csv("data_processed.csv", delimiter=",")
groups = df.groupby("group_id")

# Tick labels (pre-wrapped with newlines) and the matching group ids, in
# the order the bars are drawn.
names = [
    "Город Орел!\nОрловчане\nВКонтакте\n(vk.com/vk_orel)",
    "«АВТОСТИКЕР»\nНаклейки на авто\n| Аксессуары\n(vk.com/autosticker57)",
    "ИНЦИДЕНТ\n| ОРЁЛ\n\n(vk.com/orel_onlain)",
    "Найди меня -\nОрёл\n\n(vk.com/naidi_orel)",
    "Интересный город\nОрел | Орловчане!\n\n(vk.com/interesting_orel)"]
domains = ["vk_orel", "autosticker57", "orel_onlain", "naidi_orel", "interesting_orel"]

counts = []
has_bdates = []
# Named face_counts rather than has_faces so the has_faces() function
# defined earlier in the notebook is not shadowed by a list.
face_counts = []

for domain in domains:
    group = groups.get_group(domain)
    counts.append(len(group))
    # count() is the number of non-missing values, i.e. users whose birth
    # date was parsed; the series is labelled "Возраст указан".
    has_bdates.append(int(group['bdate'].count()))
    # Summing the has_face flags gives the number of True rows.
    face_counts.append(int(group['has_face'].sum()))

cdf = pd.DataFrame({
    "title": names,
    "Всего": counts,
    "Возраст указан": has_bdates,
    "Есть лицо на фото": face_counts
})
# Long format: one row per (community, metric) so seaborn can use the
# metric as hue.
cdf = cdf.melt(id_vars="title", var_name='Количество пользователей', value_name='value')

fig, ax = plt.subplots(figsize=(14, 8))
fig.set(facecolor='white')
sns.barplot(data=cdf, x="title", y="value", hue="Количество пользователей", ax=ax)
ax.set_xlabel("\nСообщество VK", fontname="serif", fontsize=12)
ax.set_ylabel("Количество пользователей", fontname="serif", fontsize=12)
ax.yaxis.grid(True)
ax.xaxis.grid(True)
for tick in ax.get_xticklabels() + ax.get_yticklabels():
    tick.set_fontname("serif")
    tick.set_fontsize(12)