In [None]:
!pip install vk_api
import vk_api
import re
from datetime import datetime
from dateutil.relativedelta import relativedelta
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [None]:
# VK API access token. The value below is a placeholder: supply a real token
# before running, and avoid committing it together with the notebook.
auth_token = "..."

# Session pinned to API version 5.131; `vk` is the object the download cell
# uses to call API methods (vk.utils.*, vk.wall.*).
vk_session = vk_api.VkApi(
    token=auth_token,
    api_version="5.131",
)
vk = vk_session.get_api()

# Directory holding the input and output CSV files. File names are appended by
# plain string concatenation, so the value must end with a path separator.
folder = '[папка с csv файлами]'

In [None]:
def download_vk_data(screen_names):
    """Collect wall statistics for the last 12 months for each VK screen name.

    For every screen name the owner id is resolved through
    ``vk.utils.resolveScreenName`` and the wall is read page by page with
    ``vk.wall.get`` until a post older than the 12-month cutoff is reached or
    the wall is exhausted.

    Args:
        screen_names: iterable of VK screen names (the part of the URL after
            ``https://vk.com/``).

    Returns:
        A tuple ``(summary_df, posts_df)``:

        * ``summary_df`` has one row per screen name: ``total`` posts inside
          the window, the number of posts that contain ``text`` / ``video`` /
          ``photo`` / ``audio`` / ``poll`` / ``other`` content, and
          ``text_mean``, the integer mean text length over all counted posts
          (posts without text contribute 0).
        * ``posts_df`` has one row per counted post with ``has_*`` flags and
          the comment / repost / view / like counters (0 when the API response
          has no such counter).
    """
    summary_columns = [
        'screen_name',
        'total',
        'text',
        'video',
        'photo',
        'audio',
        'poll',
        'other',
        'text_mean'
    ]
    post_columns = [
        'screen_name',
        'post_id',
        'has_text',
        'has_video',
        'has_photo',
        'has_audio',
        'has_poll',
        'has_other',
        'comments_count',
        'reposts_count',
        'views_count',
        'likes_count'
    ]
    # Attachment type -> content category; any type not listed is 'other'.
    attachment_categories = {
        'video': 'video',
        'photo': 'photo',
        'posted_photo': 'photo',
        'album': 'photo',
        'photos_list': 'photo',
        'audio': 'audio',
        'poll': 'poll',
    }
    page_size = 100

    rows = []
    rows_posts = []
    one_year_ago = int((datetime.now() - relativedelta(months=12)).timestamp())
    for screen_name in screen_names:
        row = {
            'screen_name': screen_name,
            'total': 0,
            'text': 0,
            'video': 0,
            'photo': 0,
            'audio': 0,
            'poll': 0,
            'other': 0,
            'text_mean': 0
        }
        # Accumulated separately so the mean is computed exactly once, after
        # the whole wall has been read (dividing per page corrupts the value).
        text_length_sum = 0

        data = vk.utils.resolveScreenName(screen_name=screen_name)
        # wall.get addresses communities by a negative owner id.
        owner_id = -1 * int(data["object_id"]) if data["type"] in ["page", "group"] else int(data["object_id"])

        offset = 0
        finish = False
        while not finish:
            posts = vk.wall.get(owner_id=owner_id, offset=offset, count=page_size)
            if len(posts['items']) <= 0:
                break
            for post in posts['items']:
                if post['date'] < one_year_ago:
                    # NOTE(review): a pinned post is presumably returned ahead
                    # of newer posts and flagged with `is_pinned`; an old
                    # pinned post must not end the scan - confirm against the
                    # VK API docs.
                    if post.get('is_pinned'):
                        continue
                    finish = True
                    break

                # Count only posts inside the 12-month window.
                row['total'] += 1
                row_post = {
                    'screen_name': screen_name,
                    'post_id': post['id'],
                    'has_text': False,
                    'has_video': False,
                    'has_photo': False,
                    'has_audio': False,
                    'has_poll': False,
                    'has_other': False,
                    'comments_count': 0,
                    'reposts_count': 0,
                    'views_count': 0,
                    'likes_count': 0
                }

                text = post.get('text') or ''
                if len(text) > 0:
                    row['text'] += 1
                    text_length_sum += len(text)
                    row_post['has_text'] = True

                for counter in ('comments', 'likes', 'views', 'reposts'):
                    if counter in post:
                        row_post[counter + '_count'] = post[counter]['count']

                # A set of categories (not raw types) so a post with e.g. both
                # 'photo' and 'album' is counted once as a photo post. The
                # 'attachments' key may be absent, hence .get().
                categories = {
                    attachment_categories.get(attachment['type'], 'other')
                    for attachment in post.get('attachments', [])
                }
                for category in categories:
                    row[category] += 1
                    row_post['has_' + category] = True

                rows_posts.append(row_post)
            offset += page_size

        if row['total'] > 0:
            row['text_mean'] = int(text_length_sum / row['total'])
        rows.append(row)

    return (
        pd.DataFrame(columns=summary_columns, data=rows),
        pd.DataFrame(columns=post_columns, data=rows_posts),
    )

In [None]:
# Source table: one row per region with links to the governor's and the
# administration's VK pages.
input_df = pd.read_csv(folder + "губернаторы.csv", delimiter=",")


def _link_to_screen_name(elem):
    """Drop a leading https://vk.com/ from a link; a missing cell becomes 'nan' via str()."""
    return re.sub(r"^(https://vk\.com/)", "", str(elem))


input_df['governor_screen_name'] = input_df['link_governor'].apply(_link_to_screen_name)
input_df['administration_screen_name'] = input_df['link_administration'].apply(_link_to_screen_name)

# Rows without a link carry the literal string 'nan' and are left out.
has_governor = input_df['governor_screen_name'] != 'nan'
has_administration = input_df['administration_screen_name'] != 'nan'
governor_screen_names = input_df.loc[has_governor, 'governor_screen_name'].tolist()
administration_screen_names = input_df.loc[has_administration, 'administration_screen_name'].tolist()

# Download both groups in one pass and persist the two result tables.
data, data_posts = download_vk_data(governor_screen_names + administration_screen_names)
data.to_csv(folder + "data.csv", index=False)
data_posts.to_csv(folder + "data_post.csv", index=False)

In [None]:
def prepare_df(df, drop_columns, rename_columns):
    """Return a copy of ``df`` without ``drop_columns`` and with columns renamed.

    Args:
        df: source DataFrame; it is not modified.
        drop_columns: column labels to remove.
        rename_columns: mapping of old column label -> new column label.

    Returns:
        A new DataFrame with the remaining, renamed columns.
    """
    return df.drop(columns=drop_columns).rename(columns=rename_columns)

def plot(df, x_label, y_label):
    """Show a box plot of the columns of ``df`` with serif-styled labels.

    Args:
        df: DataFrame passed to ``sns.boxplot`` as wide-form data.
        x_label: text for the x axis label.
        y_label: text for the y axis label.
    """
    label_style = {"fontname": "serif", "fontsize": 12}

    fig, ax = plt.subplots(figsize=(10,6))
    fig.set_facecolor('white')
    sns.boxplot(data=df, ax=ax, linewidth=1)

    ax.set_xlabel(x_label, **label_style)
    ax.set_ylabel(y_label, **label_style)

    # Grid lines on both axes.
    for axis in (ax.yaxis, ax.xaxis):
        axis.grid(True)

    # Tick labels use the same serif font and size as the axis labels.
    for tick in [*ax.get_xticklabels(), *ax.get_yticklabels()]:
        tick.set_fontname("serif")
        tick.set_fontsize(12)

    plt.show()

In [None]:
# Per-account summary table produced by the download cell.
df = pd.read_csv(folder + "data.csv", sep=",")

# Axis captions for the box plot.
content_x_label = "Тип контента"
content_y_label = "Количество публикаций"

# 'text_mean' is a length, not a post count, so it is left out of this plot.
content_drop_columns = ['text_mean']

# Column name -> caption shown under each box.
content_rename_columns = {
    "total": "Всего",
    "text": "Текст",
    "video": "Видео",
    "photo": "Фото",
    "audio": "Аудио",
    "poll": "Опрос",
    "other": "Другое"
}

content_df = prepare_df(df, content_drop_columns, content_rename_columns)
plot(content_df, content_x_label, content_y_label)

In [None]:
# Per-post table produced by the download cell.
df = pd.read_csv(folder + "data_post.csv", delimiter=",")

# Subsets by attachment category. "video" / "photo" keep posts whose only
# attachment category is video / photo respectively.
no_extra_media = (df['has_audio'] == False) & (df['has_poll'] == False) & (df['has_other'] == False)
video_df = df.loc[(df['has_video'] == True) & (df['has_photo'] == False) & no_extra_media]
photo_df = df.loc[(df['has_video'] == False) & (df['has_photo'] == True) & no_extra_media]
photo_or_video_df = df.loc[(df['has_video'] == True) | (df['has_photo'] == True)]
no_photo_or_video_df = df.loc[(df['has_video'] == False) & (df['has_photo'] == False)]

print(len(df), len(video_df), len(photo_df), len(photo_or_video_df), len(no_photo_or_video_df))

# (column, y-axis label): one grid row per entry. Other available columns:
# 'comments_count', 'views_count', 'likes_count'.
# NOTE(review): the label says "(тыс.)" but the plotted values are not scaled
# by 1000 anywhere in this cell - confirm which one is intended.
values = [('reposts_count', "Количество (тыс.)")]
# (subset key, x-axis label, lower quantile, upper quantile): one grid column
# per entry. Keys 'photo_or_video' and 'no_photo_or_video' are also available.
h_values = [('all', "Всего", 0.1, 0.9), ('video', "Видео", 0.1, 0.9), ('photo', "Фото", 0.1, 0.9)]

subsets = {
    "all": df,
    "video": video_df,
    "photo": photo_df,
    "photo_or_video": photo_or_video_df,
    "no_photo_or_video": no_photo_or_video_df,
}

# One axes per (metric, subset) pair; squeeze=False keeps `axes` 2-D even for
# a single row, so axes[i, j] always works and no unused axes are created.
fig, axes = plt.subplots(nrows=len(values), ncols=len(h_values), figsize=(16, 4), squeeze=False)
fig.set(facecolor = 'white')

for i, (value, title) in enumerate(values):
    for j, (h_value, h_title, q1, q3) in enumerate(h_values):
        # Quantiles are taken over the subset itself. Aligning all subsets in
        # one frame and filling the gaps with 0 would mix zeros from unrelated
        # posts into the video/photo distributions and shift both cut-offs.
        series = subsets[h_value][value]
        Q1 = series.quantile(q1)
        Q3 = series.quantile(q3)
        trimmed = series[(series > Q1) & (series < Q3)]

        ax = axes[i, j]
        sns.histplot(x=trimmed, kde=True, bins=10, ax=ax)
        ax.set_xlabel(h_title, fontname="serif", fontsize=14)
        ax.set_ylabel(title, fontname="serif", fontsize=14)
        ax.yaxis.grid(True)
        ax.xaxis.grid(True)
        for tick in [*ax.get_xticklabels(), *ax.get_yticklabels()]:
            tick.set_fontname("serif")
            tick.set_fontsize(13)

# Layout is computed after the axes have their labels, so they are accounted for.
fig.tight_layout()
plt.show()


In [None]:
def pie_plot(s):
    """Show one donut chart per entry of ``s``, side by side.

    Args:
        s: pandas Series mapping a caption to a share; each chart is drawn
            from ``[val, 1 - val]`` without normalisation. Charts are placed
            in the first row of a 2x4 grid.
    """
    fig = plt.figure(figsize=(12,8))
    fig.set_facecolor('white')

    for column, (index, val) in enumerate(s.items()):
        ax = plt.subplot2grid((2,4),(0,column))
        ax.pie(x=[val, 1-val], normalize=False, startangle=90)
        ax.set_title(index, fontname="serif", fontsize=12)

        # The share as a percentage is shown under the chart.
        ax.set_xlabel(f"{round(val*100,2)}%", fontname="serif", fontsize=12)
        for tick in ax.get_xticklabels():
            tick.set_fontname("serif")
            tick.set_fontsize(12)

        # A white disc over the centre turns the pie into a donut.
        ax.add_artist(plt.Circle((0,0),0.70,fc='white'))

    plt.show()

# Per-account summary table produced by the download cell.
df = pd.read_csv(folder + "data.csv", delimiter=",")

# numeric_only=True: the table also holds the string column 'screen_name',
# and averaging it raises TypeError on pandas >= 2.0.
mdf = df.mean(numeric_only=True)

# Share of posts containing each kind of content, relative to the mean number
# of posts per account. Poll and audio are folded into "other".
text = mdf['text'] / mdf['total']
video = mdf['video'] / mdf['total']
photo = mdf['photo'] / mdf['total']
other = (mdf['other'] + mdf['poll'] + mdf['audio']) / mdf['total']
pie_plot(pd.Series(data=[text, video, photo, other], index=['Текст','Видео','Фото','Другое']))

In [None]:
# Per-account summary table produced by the download cell.
df = pd.read_csv(folder + "data.csv", sep=",")

fig, ax = plt.subplots(figsize=(11,6))
fig.set_facecolor('white')

# Distribution of the per-account mean text length, with a KDE overlay.
sns.histplot(x=df["text_mean"], kde=True, bins=30, ax=ax)

axis_label_style = {"fontname": "serif", "fontsize": 12}
ax.set_xlabel("Средняя длина сообщения", **axis_label_style)
ax.set_ylabel("Количество публикаций", **axis_label_style)

for axis in (ax.yaxis, ax.xaxis):
    axis.grid(True)

for tick in [*ax.get_xticklabels(), *ax.get_yticklabels()]:
    tick.set_fontname("serif")
    tick.set_fontsize(12)

plt.show()