# Collect VK Data

In [7]:
!pip install vka

Collecting vk
Installing collected packages: vk
Successfully installed vk-2.0.2


In [1]:
import vk
session = vk.Session(access_token='TOKEN')
vk_api = vk.API(session)

In [5]:
items_ru = []
items_en = []

In [55]:
from tqdm import tqdm
import time
from datetime import datetime


for i in tqdm(range(0, results['count'], 1000)):
    results = vk_api.groups.getMembers(group_id=27895931, sort="id_desc", offset=i, count=1000, v=5.81, fields=['sex'], lang='ru')
    items_ru.append(results['items'])
    results = vk_api.groups.getMembers(group_id=27895931, sort="id_desc",  offset=i, count=1000, v=5.81, fields=['sex'], lang='en')
    items_en.append(results['items'])
    time.sleep(0.8)

# Preprocess VK Data

In [56]:
import pandas as pd

df_ru = pd.read_csv('names_ru.csv')
df_en = pd.read_csv('names_en.csv')

In [58]:
df_ru = df_ru.drop_duplicates(subset=['id'])
df_en = df_en.drop_duplicates(subset=['id'])

In [59]:
df_ru = df_ru.dropna()
df_en = df_en.dropna()

In [60]:
from tqdm import tqdm

first_names = df_ru['first_name'].values
last_names = df_ru['last_name'].values

df_ru['full_name'] = [str(first_names[i]).lower().strip() + ' '+last_names[i].lower().strip() for i in tqdm(range(len(df_ru)))]

100%|██████████| 13131232/13131232 [00:14<00:00, 933523.75it/s]


In [62]:
first_names = df_en['first_name'].values
last_names = df_en['last_name'].values

df_en['full_name'] = [str(first_names[i]).lower().strip() + ' '+last_names[i].lower().strip() 
                      for i in tqdm(range(len(df_en)))]

100%|██████████| 13129236/13129236 [00:12<00:00, 1059306.62it/s]


In [64]:
df = pd.merge(df_ru, df_en, on=['id', 'sex'], how='left', suffixes=['_ru', '_en'])

In [65]:
df = df[df['sex']!=0]
df = df.dropna()

In [66]:
len(df)

13124619

In [69]:
names = []
genders = []

temp = df[df['full_name_en']!=df['full_name_ru']]
names.extend(temp['full_name_en'].values)
genders.extend(temp['sex'].values)
names.extend(temp['full_name_ru'].values)
genders.extend(temp['sex'].values)

temp = df[df['full_name_en']==df['full_name_ru']]
names.extend(temp['full_name_ru'].values)
genders.extend(temp['sex'].values)

In [73]:
len(names)

25101673

# Train Classifier

In [74]:
df_train = pd.DataFrame({'name': names, 'sex': genders})

In [75]:
df_train_sample = df_train.sample(100000)

In [83]:

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
from sklearn.datasets import load_iris
from sklearn.linear_model import LogisticRegression

pipeline = Pipeline(
    [
        ("vect", CountVectorizer()),
        ("tfidf", TfidfTransformer()),
        ("clf", LogisticRegression(random_state=0)),
    ]
)

parameters = {
    "vect__analyzer": ['char', 'char_wb'],
    "vect__max_features": (5000, 10000, 20000),
    "vect__max_df": (0.5, 0.75, 1.0),
    "vect__min_df": (0.0, 0.01, 0.05),
    "vect__ngram_range": [(2,3), (2,5),(2, 7)] , 
    'tfidf__use_idf': (True, False),
    'tfidf__norm': ['l1', 'l2'],
}

# Find the best parameters for both the feature extraction and the
# classifier
grid_search = GridSearchCV(pipeline, parameters, n_jobs=-1, verbose=1, cv=3, scoring='f1_macro')

print("Performing grid search...")
print("pipeline:", [name for name, _ in pipeline.steps])
print("parameters:")
print(parameters)
grid_search.fit(df_train_sample['name'], df_train_sample['sex'])

print("Best score: %0.3f" % grid_search.best_score_)
print("Best parameters set:")
best_parameters = grid_search.best_estimator_.get_params()
for param_name in sorted(parameters.keys()):
    print("\t%s: %r" % (param_name, best_parameters[param_name]))

Performing grid search...
pipeline: ['vect', 'tfidf', 'clf']
parameters:
{'vect__analyzer': ['char', 'char_wb'], 'vect__max_features': (5000, 10000, 20000), 'vect__max_df': (0.5, 0.75, 1.0), 'vect__min_df': (0.0, 0.01, 0.05), 'vect__ngram_range': [(2, 3), (2, 5), (2, 7)], 'tfidf__use_idf': (True, False), 'tfidf__norm': ['l1', 'l2']}
Fitting 3 folds for each of 648 candidates, totalling 1944 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   59.0s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:  4.3min
[Parallel(n_jobs=-1)]: Done 434 tasks      | elapsed:  9.8min
[Parallel(n_jobs=-1)]: Done 784 tasks      | elapsed: 17.3min
[Parallel(n_jobs=-1)]: Done 1234 tasks      | elapsed: 26.9min
[Parallel(n_jobs=-1)]: Done 1784 tasks      | elapsed: 38.6min
[Parallel(n_jobs=-1)]: Done 1944 out of 1944 | elapsed: 42.0min finished


Best score: 0.977
Best parameters set:
	tfidf__norm: 'l2'
	tfidf__use_idf: True
	vect__analyzer: 'char_wb'
	vect__max_df: 0.5
	vect__max_features: 20000
	vect__min_df: 0.0
	vect__ngram_range: (2, 7)


In [81]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(names, genders, test_size=0.2, random_state=42)

In [87]:
LogisticRegression(random_state=0, verbose=True)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=0, solver='lbfgs', tol=0.0001, verbose=True,
                   warm_start=False)

In [98]:
pipeline2 = Pipeline(
    [
        ("vect", CountVectorizer(analyzer='char_wb', max_features=20000, ngram_range=(2,7), max_df=0.5, min_df=0.0)),
        ("tfidf", TfidfTransformer(norm='l2')),
        ("clf", LogisticRegression(random_state=0, verbose=True, n_jobs=-1)),
    ]
)

pipeline2.fit(X_train, y_train)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed: 228.9min finished


Pipeline(memory=None,
         steps=[('vect',
                 CountVectorizer(analyzer='char_wb', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.int64'>, encoding='utf-8',
                                 input='content', lowercase=True, max_df=0.5,
                                 max_features=20000, min_df=0.0,
                                 ngram_range=(2, 7), preprocessor=None,
                                 stop_words=None, strip_accents=None,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, vocabulary=None)),
                ('tfidf',
                 TfidfTransformer(norm='l2', smooth_idf=True,
                                  sublinear_tf=False, use_idf=True)),
                ('clf',
                 LogisticRegression(C=1.0, class_weight=None, dual=False,
                                    fit_intercept=True, intercept_scaling

In [99]:
predictions = pipeline2.predict(X_test)

In [100]:
from sklearn.metrics import classification_report
print(classification_report(y_test, predictions, digits=10))

              precision    recall  f1-score   support

           1  0.9868154401 0.9830110438 0.9849095682   2726948
           2  0.9798914883 0.9843833596 0.9821322880   2293387

    accuracy                      0.9836379445   5020335
   macro avg  0.9833534642 0.9836972017 0.9835209281   5020335
weighted avg  0.9836524438 0.9836379445 0.9836408524   5020335



In [103]:
from joblib import dump, load

dump(pipeline2, 'VkGenderLogit.joblib') 

['VkGenderLogit.joblib']

In [104]:
clf = load('VkGenderLogit.joblib') 

In [105]:
predictions2 = clf.predict(X_test)

In [106]:
from sklearn.metrics import classification_report
print(classification_report(y_test, predictions2, digits=10))

              precision    recall  f1-score   support

           1  0.9868154401 0.9830110438 0.9849095682   2726948
           2  0.9798914883 0.9843833596 0.9821322880   2293387

    accuracy                      0.9836379445   5020335
   macro avg  0.9833534642 0.9836972017 0.9835209281   5020335
weighted avg  0.9836524438 0.9836379445 0.9836408524   5020335

