In [2]:
import pymongo
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import cross_val_score, train_test_split, StratifiedKFold
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from dateutil.relativedelta import *
from datetime import date, datetime
import pickle
from xgboost import XGBClassifier
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt

# client = pymongo.MongoClient('mongodb://localhost:27017/')
# db = client["vk"]

# db.users.create_index([( "user.id", 1 )])
# db.posts.create_index([( "id", 1 )])
# db.posts.create_index([( "owner_id", 1 )])
# db.comments.create_index([( "id", 1 )])
# db.comments.create_index([( "from_id", 1 )])
# db.comments.create_index([( "post_id", 1 )])

In [3]:
# Labelled users to build features for. The file is semicolon-separated; the
# row-building loop further down reads its "id" and "class" columns.
input_df = pd.read_csv("ages.csv", sep=";")
# Column layout of dataset.csv. The first list holds the label and the
# per-user features; the comprehension appends a mean/min/max triple for every
# friend-level and group-post-level statistic.
columns = [
    "class",                   # class 1-5
    "has_photo",               # whether the user has a photo
    "followers_count",         # number of followers
    "has_occupation",          # whether the user has an occupation
    "friends_count",           # number of friends
    "online",                  # how many times the user was online during the observation period
    "comments_count",          # number of comments written by the user
    "comment_stickers_count",  # average number of stickers in the user's comments
    "comment_text_length",     # average length of the user's comments
] + [
    f"{stat}_{feature}"
    for feature in (
        "friend_online",           # how many times friends were online
        "friend_age",              # age of friends
        "friend_has_photo",        # friends having a photo
        "friend_followers_count",  # followers of friends
        "comments_count",          # comments on posts in the user's groups
        "has_attachments",         # posts with attachments in the user's groups
        "likes_count",             # likes on posts in the user's groups
        "reposts_count",           # reposts of posts in the user's groups
        "views_count",             # views of posts in the user's groups
    )
    for stat in ("mean", "min", "max")
]


def _mean_min_max(values):
    """Return ``[mean, min, max]`` of ``values``, or three ``None`` placeholders if it is empty."""
    if len(values) == 0:
        # np.mean([]) would emit a RuntimeWarning and yield NaN; use the same
        # missing-value marker for all three statistics instead.
        return [None, None, None]
    return [np.mean(values), min(values), max(values)]


def generate_data_for_user(user_id: int, user_class: int):
    """Build one feature row for a user, ordered exactly like ``columns``.

    Reads the user's record, comments and group posts from the module-level
    MongoDB handle ``db`` (collections ``users``, ``comments`` and ``posts``).

    Args:
        user_id: value matched against ``user.id`` in ``db.users`` and against
            ``from_id`` in ``db.comments``.
        user_class: label copied unchanged into the first position of the row.

    Returns:
        A list with one value per entry of ``columns``: the label, the per-user
        features, then a mean/min/max triple for each friend-level and
        post-level statistic. A triple is ``[None, None, None]`` when there is
        nothing to aggregate.

    Raises:
        ValueError: no ``db.users`` record for ``user_id`` has both a
            ``subscriptions`` and a ``friends`` field.
        KeyError: the record's ``user`` document lacks ``has_photo`` or
            ``followers_count``.
    """
    record = db.users.find_one({
        "user.id": user_id,
        "subscriptions": {"$exists": True},
        "friends": {"$exists": True},
    })
    if not record:
        raise ValueError(f"No such user with id {user_id}, subscriptions and friends lists")

    # Sum of the `online` values stored under `user` for this user id.
    own_online = list(db.users.aggregate([
        {"$match": {"user.id": user_id}},
        {"$unwind": '$user'},
        {"$group": {"_id": "$user.id", "sum": {"$sum": "$user.online"}}},
    ]))
    online = own_online[0]["sum"] if own_online else 0

    # One summed `online` value per distinct friend id.
    friends_online = [
        item["sum"]
        for item in db.users.aggregate([
            {"$match": {"user.id": user_id}},
            {"$unwind": '$friends'},
            {"$group": {"_id": "$friends.id", "sum": {"$sum": "$friends.online"}}},
        ])
    ]

    # Per-comment averages over everything the user wrote.
    comments_count = 0
    text_length_total = 0
    sticker_total = 0
    for comment in db.comments.find({"from_id": user_id}):
        comments_count += 1
        # A comment without text contributes length 0 (len() of a number would raise).
        text_length_total += len(comment.get("text") or "")
        for attachment in comment.get("attachments") or []:
            if attachment.get("type") == "sticker":
                sticker_total += 1
    if comments_count > 0:
        comment_text_length = text_length_total / comments_count
        comment_stickers_count = sticker_total / comments_count
    else:
        comment_text_length = 0
        comment_stickers_count = 0

    # Friend statistics.
    friends = record["friends"]
    today = date.today()
    friend_ages = []
    friend_has_photo = []
    friend_followers_count = []
    for friend in friends:
        if "bdate" in friend:
            try:
                dob = datetime.strptime(friend["bdate"], '%d.%m.%Y')
            except (TypeError, ValueError):
                # Birthday not in day.month.year form: only the age is unknown,
                # the friend still counts for the photo/follower statistics.
                dob = None
            if dob is not None:
                friend_ages.append(relativedelta(today, dob).years)
        friend_has_photo.append(friend.get("has_photo", 0))
        friend_followers_count.append(friend.get("followers_count", 0))

    # Post statistics. NOTE(review): only the FIRST subscription is used; its id
    # is negated to form the posts' owner_id -- presumably the convention for
    # group-owned posts, confirm against the data source.
    comments_count_list = []
    has_attachments_list = []
    likes_count_list = []
    reposts_count_list = []
    views_count_list = []
    try:
        group_id = -record["subscriptions"][0]['id']
        for post in db.posts.find({"owner_id": group_id}):
            comments_count_list.append(post["comments"]["count"] if "comments" in post else 0)
            # Stored as 0/1 so the column never mixes booleans and integers.
            has_attachments_list.append(1 if post.get("attachments") else 0)
            likes_count_list.append(post["likes"]["count"] if "likes" in post else 0)
            reposts_count_list.append(post["reposts"]["count"] if "reposts" in post else 0)
            views_count_list.append(post["views"]["count"] if "views" in post else 0)
    except Exception as e:
        # Best effort: a user without usable subscriptions/posts still gets a
        # row, with whatever post statistics were collected before the error.
        print(e)

    user = record["user"]
    # The order below must mirror `columns`; in particular has_occupation is the
    # fourth value, directly after followers_count.
    return [
        user_class,
        user["has_photo"],
        user["followers_count"],
        1 if "occupation" in user else 0,
        len(friends),
        online,
        comments_count,
        comment_stickers_count,
        comment_text_length,
        *_mean_min_max(friends_online),
        *_mean_min_max(friend_ages),
        *_mean_min_max(friend_has_photo),
        *_mean_min_max(friend_followers_count),
        *_mean_min_max(comments_count_list),
        *_mean_min_max(has_attachments_list),
        *_mean_min_max(likes_count_list),
        *_mean_min_max(reposts_count_list),
        *_mean_min_max(views_count_list),
    ]

# Build one feature row per labelled user and persist the table as dataset.csv.
rows = []
skipped = []
for user_id, user_class in zip(input_df["id"], input_df["class"]):
    try:
        rows.append(generate_data_for_user(user_id, user_class))
    except (ValueError, KeyError, TypeError, IndexError) as error:
        # Data problems for a single user (no matching record, missing or
        # malformed fields) only skip that user. Anything else -- NameError for
        # an undefined `db`, connection failures, KeyboardInterrupt --
        # propagates instead of silently producing an empty dataset.
        skipped.append((user_id, repr(error)))
print(f"Built {len(rows)} rows, skipped {len(skipped)} users")
df = pd.DataFrame(rows, columns=columns)
df.to_csv("dataset.csv", index=False)

In [None]:
# Visual check of class separability: standardize the features, project them
# onto two principal components and scatter-plot the two merged classes.
df = pd.read_csv("dataset.csv")
# Merge the five source classes into two groups: {1, 2} -> 1 and {3, 4, 5} -> 2.
df['class'] = df['class'].map({1: 1, 2: 1, 3: 2, 4: 2, 5: 2})
df_clear = df.fillna(0)  # missing values become zero before scaling
scaler = StandardScaler()
y = df_clear["class"].values
X = scaler.fit_transform(df_clear.drop(columns=["class"]))

pca = PCA(n_components=2)
Xn = pca.fit_transform(X)
# From here on `df` holds the 2-D projection (columns 0 and 1) plus the label.
df = pd.DataFrame(Xn).assign(**{"class": y})

young = df[df["class"] == 1]
old = df[df["class"] == 2]

fig = plt.figure(figsize=(12, 8), facecolor='white')
ax = fig.add_subplot()

# Legend labels are Russian for "Younger than 25" / "Older than 25".
ax.scatter(young[0], young[1], marker='s', label="Младше 25 лет", alpha=0.5, color="C2")
ax.scatter(old[0], old[1], marker="o", label="Старше 25 лет", alpha=0.5, color="C3")

ax.grid(color='black', linestyle=':', linewidth='0.5', which='major')
# Axis labels are Russian for "Feature 1" / "Feature 2".
ax.set_xlabel("\nПризнак 1", fontname="serif", fontsize=18)
ax.set_ylabel("Признак 2\n", fontname="serif", fontsize=18)

# Same serif 18pt styling for the tick labels of both axes.
for tick in ax.get_xticklabels() + ax.get_yticklabels():
    tick.set_fontname("serif")
    tick.set_fontsize(18)

ax.legend(loc="upper right", prop=dict(size=18, family="serif"))
plt.show()

In [None]:
# Train an XGBoost classifier on a stratified 67/33 split, report weighted
# metrics on the held-out part and save the fitted model.
df = pd.read_csv("dataset.csv")
# Switch to 2 classes (up to 24 years old and older), because the available sample is imbalanced
df['class'] = df['class'].map({1: 1, 2: 1, 3: 2, 4: 2, 5: 2})
df_clear = df.fillna(0)
scaler = StandardScaler()
le = LabelEncoder()
X = df_clear.drop(columns=["class"])
y = df_clear["class"].values

X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=.67, random_state=42, stratify=y)

# Fit the label encoder and the scaler on the training part only and merely
# apply them to the test part: refitting on the test split would scale it with
# its own statistics and could assign different codes to the same label.
y_train = le.fit_transform(y_train)
y_test = le.transform(y_test)
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

clf = XGBClassifier(n_estimators=100, max_depth=10, random_state=381)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average="weighted")
recall = recall_score(y_test, y_pred, average="weighted")
f1 = f1_score(y_test, y_pred, average="weighted")

print(accuracy, precision, recall, f1)
# NOTE: only the classifier is persisted. It was fitted on standardized
# features, so anyone loading model.bin must apply the same scaling first.
filename = 'model.bin'
with open(filename, 'wb') as model_file:  # close the handle deterministically
    pickle.dump(clf, model_file)

In [None]:
# 4-fold stratified cross-validation of the classifier configured in the
# training cell (`clf`; cross_val_score works on clones, so its fitted state
# is not reused).
df = pd.read_csv("dataset.csv")
# Switch to 2 classes (up to 24 years old and older), because the available sample is imbalanced
df['class'] = df['class'].map({1: 1, 2: 1, 3: 2, 4: 2, 5: 2})
df_clear = df.fillna(0)
scaler = StandardScaler()
le = LabelEncoder()
X = df_clear.drop(columns=["class"])
y = df_clear["class"].values
X = scaler.fit_transform(X)
y = le.fit_transform(y)
kfold = StratifiedKFold(n_splits=4, shuffle=True, random_state=381)
# Each metric needs its own `scoring`; without it every call returns the
# estimator's default score and all four numbers are identical. The weighted
# variants match the average="weighted" metrics of the hold-out evaluation.
accuracy = cross_val_score(estimator=clf, X=X, y=y, cv=kfold, scoring="accuracy", n_jobs=-1)
precision = cross_val_score(estimator=clf, X=X, y=y, cv=kfold, scoring="precision_weighted", n_jobs=-1)
recall = cross_val_score(estimator=clf, X=X, y=y, cv=kfold, scoring="recall_weighted", n_jobs=-1)
f1 = cross_val_score(estimator=clf, X=X, y=y, cv=kfold, scoring="f1_weighted", n_jobs=-1)
print(accuracy.mean(), precision.mean(), recall.mean(), f1.mean())