In [81]:
import warnings
warnings.simplefilter('ignore')

In [82]:
from urllib import request as req

data_dir = '/Users/ssv/newprolab/project01'
filename = 'gender_age_dataset.txt'
file_path = '/'.join([data_dir, filename])
url = 'http://data.newprolab.com/data-newprolab-com/project01/' + filename

# req.urlretrieve(url, file_path)

In [197]:
import pandas as pd
import json
from urllib.parse import urlparse
from datetime import datetime
from sklearn import metrics
from tqdm import tqdm_notebook
from sklearn.metrics import classification_report
from sklearn.ensemble import RandomForestClassifier

In [84]:
pd.set_option('display.max_columns', 200)
pd.set_option('display.max_rows', 4000)

In [85]:
df = pd.read_csv(file_path, sep='\t')

### Оцифровка значений пола и возраста

In [86]:
ages = {'18-24': 1, '25-34': 2, '35-44': 3, '45-54': 4, '>=55': 5}
df["age"] = df["age"].apply(lambda x: ages.get(x, '-'))

In [87]:
genders = {'M': 1, 'F': 0}
df["gender"] = df["gender"].apply(lambda x: genders.get(x, '-'))


### Формирование набора данных с ключевыми словами посещенных сайтов

#### Загружаем из файлов типа df_med_keywords_0000_0999.pickle по мере поступления. Скрапинг сайтов идет очень медленно

In [88]:
import pickle

In [89]:
with open('data/keywords_999.pickle', 'rb') as file:
    df_med_keywords_0000_0999 = pickle.load(file)

In [90]:
with open('data/keywords_2000_10999.pickle', 'rb') as file:
    df_med_keywords_2000_10999 = pickle.load(file)

In [91]:
with open('data/df_med_keywords_non_age.pickle', 'rb') as file:
    df_for_pred = pickle.load(file)

#### Удаляем лишние столбцы в наборе данных после скрапинга

In [92]:
df_med_keywords_0000_0999 = df_med_keywords_0000_0999.drop('url_netloc', axis=1)

In [93]:
df_med_keywords_2000_10999 = df_med_keywords_2000_10999.drop('url_netloc', axis=1)

In [94]:
df_for_pred = df_for_pred.drop(['url_netloc', 'age'], axis=1)

#### Соединяем загруженные таблицы

In [95]:
df_k = pd.merge(df_med_keywords_0000_0999, pd.merge(df_med_keywords_2000_10999, df_for_pred, how='left', on='uid'),
              how='left', on='uid')

#### Сводим ключевые слова в один столбец

In [96]:
df['keywords'] = ''
for index, row in tqdm_notebook(df_k.iterrows()):
    if index < 1000:
        df_k['raw_keywords'].loc[index] = df_k['raw_keywords'].iloc[index]
    if index >= 2000 and index < 11000:
        df_k['raw_keywords'].loc[index] = df_k['raw_keywords_x'].iloc[index]
    if index > 36137:    
        df_k['raw_keywords'].loc[index] = df_k['raw_keywords_y'].iloc[index]
    

A Jupyter Widget




#### Левое последовательное соединение таблиц по uid. Удаляем лишние столбцы после соединения 

In [97]:
df_k = df_k.drop(['raw_keywords_x', 'raw_keywords_y'], axis=1)

In [98]:
df = pd.merge(df, df_k, how='left', on='uid')

In [99]:
df = df.drop(['user_json', 'keywords'], axis=1)

####  В результате - имеем набор df['gender', 'age', 'uid', 'raw_keywords']. Ключевые слова посещенных сайтов заполнены для строк 0 - 999; 2000 - 10999 и таргет: 36138 - 41137. В заполненных строках есть пустые значения (массив длиной 0)

1. Вытащим все ключевые слова для формирования массива стоп-слов
2. Разобьем датасет на два фолда: для тренировки (0000 - 36137) и предсказания (36138 - 41137)
3. Из фолдов удалим строки в которых нет ключевых слов сайтов
4. Сохраняем таргеты и тренируем модель

In [100]:
raw_words = list(df['raw_keywords'].copy())

In [101]:
for item in tqdm_notebook(reversed(raw_words)):
    if len(item) == 0:
        raw_words.remove(item)

A Jupyter Widget




In [102]:
import pymorphy2
m = pymorphy2.MorphAnalyzer()

In [103]:
import re
GROUPING_SPACE_REGEX = re.compile(r'([^\w]|[+])', re.UNICODE)

def simple_word_tokenize(text, _split=GROUPING_SPACE_REGEX.split):
    return [t for t in _split(text.lower()) if t and not t.isspace()]

In [104]:
def token_r(text):
    words = simple_word_tokenize(text)
    return [m.parse(x)[0].normal_form for x in words if len(x) >= 4]

In [105]:
raw_words = [' '.join((x)) for x in raw_words]

In [106]:
clear_words = [' '.join(token_r(x)) for x in tqdm_notebook(raw_words)]

A Jupyter Widget




In [107]:
# with open('data/10999_clear_words.pickle', 'wb') as file:
#     pickle.dump(clear_words, file) 
    
# with open('data/10999_clear_words.pickle', 'rb') as file:
#     clear_words = pickle.load(file)

In [108]:
from sklearn.feature_extraction.text import CountVectorizer

In [109]:
cv = CountVectorizer()
matrix = cv.fit_transform(clear_words)

In [121]:
stop_ww = sorted(cv.vocabulary_.items(), reverse=True)[:950] + sorted(cv.vocabulary_.items())[:1827]
stop_w = [x[0] for x in stop_ww]

In [115]:
# sorted(cv.vocabulary_.items())[1827:]

[('abandon', 1827),
 ('abandoned', 1828),
 ('abandonware', 1829),
 ('abar', 1830),
 ('abart', 1831),
 ('abarth', 1832),
 ('abash', 1833),
 ('abashtube', 1834),
 ('abbey', 1835),
 ('abbigliamento', 1836),
 ('abborfiske', 1837),
 ('abbott', 1838),
 ('abbreviation', 1839),
 ('abbreviations', 1840),
 ('abbyy', 1841),
 ('abcnews', 1842),
 ('abdel', 1843),
 ('abenteuer', 1844),
 ('abercrombie', 1845),
 ('abhidharma', 1846),
 ('abiding', 1847),
 ('abielu', 1848),
 ('ability', 1849),
 ('abinor', 1850),
 ('abireg', 1851),
 ('abit', 1852),
 ('abkhazia', 1853),
 ('able', 1854),
 ('abloud', 1855),
 ('abloy', 1856),
 ('abnehmen', 1857),
 ('abonement', 1858),
 ('abonnement', 1859),
 ('abonnements', 1860),
 ('abonnieren', 1861),
 ('abordables', 1862),
 ('aborigines', 1863),
 ('abortion', 1864),
 ('about', 1865),
 ('aboutpartner', 1866),
 ('above', 1867),
 ('abramovich', 1868),
 ('abrechnung', 1869),
 ('abrechnungsoftware', 1870),
 ('abrechnungssoftware', 1871),
 ('abrechnungssystem', 1872),
 ('abril'

In [120]:
# sorted(cv.vocabulary_.items(), reverse=True)[900:]

[('яшкин', 73999),
 ('яшин', 73998),
 ('ячник', 73997),
 ('ячневый', 73996),
 ('ячневик', 73995),
 ('ячейка', 73994),
 ('ячеистый', 73993),
 ('яценюк', 73992),
 ('яхтсмен', 73991),
 ('яхтинг', 73990),
 ('яхтенный', 73989),
 ('яхта', 73988),
 ('яхрома', 73987),
 ('яхие', 73986),
 ('яхатить', 73985),
 ('яхата', 73984),
 ('яуза', 73983),
 ('ятхи', 73982),
 ('ятх', 73981),
 ('ятоже', 73980),
 ('ятні', 73979),
 ('ятить', 73978),
 ('ятелевизор', 73977),
 ('ясьстроить', 73976),
 ('ясность', 73975),
 ('ясновидящий', 73974),
 ('ясновидение', 73973),
 ('ясно', 73972),
 ('ясли', 73971),
 ('яскраві', 73970),
 ('ясинский', 73969),
 ('ясень', 73968),
 ('ясенецкий', 73967),
 ('ясеневый', 73966),
 ('яс', 73965),
 ('ярык', 73964),
 ('ярый', 73963),
 ('ярцев', 73962),
 ('яртппа', 73961),
 ('ярош', 73960),
 ('ярость', 73959),
 ('яростный', 73958),
 ('ярославский', 73957),
 ('ярославнефтеоргсинтез', 73956),
 ('ярославль', 73955),
 ('ярославец', 73954),
 ('ярослава', 73953),
 ('ярослав', 73952),
 ('яросвет

## Построение модели

1. Подготовка наборов данных
    - Разобьем датасет на два фолда: для тренировки (0000 - 36137) и предсказания (36138 - 41137)
    - Из фолдов удалим строки в которых нет ключевых слов сайтов

In [122]:
# Выделение набора df_train для предсказания (c известными полом и возрастом)
df_train = df.loc[0:36137]
df_train['raw_keywords'] = df_train['raw_keywords'].apply(lambda x: ' '.join(x))
df_train = df_train[df_train['raw_keywords'] != '']

In [123]:
# Выделение набора df_predict для предсказания (без известных пола и возраста)
df_predict = df.loc[36138:]

df_predict_ALL_uid = df['uid'] # все UID до удаления тех, для которых НЕ будет выполняться прогноз !!!!

df_predict['raw_keywords'] = df_predict['raw_keywords'].apply(lambda x: ' '.join(x))
df_predict = df_predict[df_predict['raw_keywords'] != '']

df_predict_uid = df_predict['uid'] # все UID для которых будет выполняться прогноз !!!!

#### Строим матрицу слов по объединенному массиву текстов из наборов df_train и df_predict

In [124]:
ROWS_FOR_FIT = df_train.shape[0]
ROWS_FOR_PREDICT = df_predict.shape[0]

In [125]:
# Объединенный массив текстов
all_texts = pd.concat([df_train['raw_keywords'], df_predict['raw_keywords']], axis=0)

In [126]:
# with open('data/10999_all_texts.pickle', 'wb') as file:
#     pickle.dump(all_texts, file) 
    
# with open('data/10999_all_texts.pickle', 'rb') as file:
#     all_texts = pickle.load(file)

In [35]:
# cv = CountVectorizer(tokenizer=token_r, stop_words=stop_w)
# matrix = cv.fit_transform(all_texts)

In [127]:
from sklearn.feature_extraction.text import TfidfVectorizer
cv = TfidfVectorizer(tokenizer=token_r, stop_words=stop_w)
matrix = cv.fit_transform(all_texts)

In [237]:
# with open('data/10999_matrix.pickle', 'wb') as file:
#     pickle.dump(matrix, file)     
    
# with open('data/10999_matrix.pickle', 'rb') as file:
#     matrix = pickle.load(file)

In [238]:
MATRIX_FOR_FIT = matrix[:ROWS_FOR_FIT][:]
MATRIX_FOR_PREDICT = matrix[ROWS_FOR_FIT:][:]

In [239]:
target_age = df_train['age'].astype('int32')
target_gender = df_train['gender'].astype('int32')
df_train_uid = df_train['uid']
df_train = df_train.drop(['age', 'gender', 'uid'], axis=1)

KeyError: 'age'

In [240]:
def print_metrics(y_test, pred):
    fpr, tpr, thresholds = metrics.roc_curve(y_test, pred)
    print('AUC             =====> ', metrics.auc(fpr, tpr))
    print('accuracy_score  =====> ', metrics.accuracy_score(y_test, pred))
    print('precision_score =====> ', metrics.precision_score(y_test, pred))
    print('recall_score    =====> ', metrics.recall_score(y_test, pred))

In [241]:
def print_metrics_for_multiclass(y_test, pred):
    print(classification_report(y_test, pred))

In [242]:
from sklearn.cross_validation import train_test_split

### LogisticRegression

In [204]:
X_train, X_test, y_train, y_test = train_test_split(
    MATRIX_FOR_FIT, 
    target_gender, 
    test_size=0.33,
    random_state=40)

In [181]:
from sklearn.linear_model import LogisticRegression
lr = LogisticRegression(penalty='l1', max_iter=800)
lr.fit(X_train, y_train)
pred = lr.predict(X_test)
print_metrics(y_test, pred)

AUC             =====>  0.679487896982
accuracy_score  =====>  0.68031968032
precision_score =====>  0.687853996224
recall_score    =====>  0.701991008349


In [215]:
X_train, X_test, y_train, y_test = train_test_split(
    MATRIX_FOR_FIT, 
    target_age, 
    test_size=0.33,
    random_state=40)

In [221]:
lr = LogisticRegression(penalty='l1', class_weight='balanced')
lr.fit(X_train, y_train)
pred = lr.predict(X_test)
print_metrics_for_multiclass(y_test, pred)

             precision    recall  f1-score   support

          1       0.25      0.14      0.18       426
          2       0.47      0.67      0.55      1271
          3       0.34      0.15      0.21       752
          4       0.17      0.14      0.15       397
          5       0.16      0.31      0.21       157

avg / total       0.35      0.37      0.34      3003



In [205]:
from sklearn.grid_search import GridSearchCV

In [217]:
c = [10**x for x in range(-10, 16)]

In [218]:
gs = GridSearchCV(
        LogisticRegression(),
        param_grid = { 'penalty' : ['l1','l2'], 'C' : c},
        scoring='roc_auc',
        cv=5,
        verbose=3)

In [None]:
gs.fit(X_train, y_train)

In [209]:
gs.best_estimator_

LogisticRegression(C=1, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [210]:
gs.best_params_

{'C': 1, 'penalty': 'l2'}

In [211]:
gs.best_score_

0.7343916290444319

In [212]:
X_train, X_test, y_train, y_test = train_test_split(
    MATRIX_FOR_FIT, 
    target_gender, 
    test_size=0.33,
    random_state=40)

lr = LogisticRegression(C=1, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)
lr.fit(X_train, y_train)
pred = lr.predict(X_test)
print_metrics(y_test, pred)

AUC             =====>  0.687047119554
accuracy_score  =====>  0.688311688312
precision_score =====>  0.691076923077
recall_score    =====>  0.721258831085


### RandomForestClassifier

In [194]:
X_train, X_test, y_train, y_test = train_test_split(
    MATRIX_FOR_FIT, 
    target_gender, 
    test_size=0.33,
    random_state=40)

In [195]:
from sklearn.ensemble import RandomForestClassifier
rfc = RandomForestClassifier(n_estimators=100, class_weight='balanced', random_state=0, n_jobs=-1)
rfc.fit(X_train, y_train) 
pred = rfc.predict(X_test)
print_metrics(y_test, pred)

AUC             =====>  0.666150992573
accuracy_score  =====>  0.667665667666
precision_score =====>  0.670115642118
recall_score    =====>  0.707129094412


In [201]:
X_train, X_test, y_train, y_test = train_test_split(
    MATRIX_FOR_FIT, 
    target_age, 
    test_size=0.33,
    random_state=40)

In [198]:
rfc = RandomForestClassifier(n_estimators=100, class_weight='balanced', random_state=0, n_jobs=-1)
rfc.fit(X_train, y_train) 
pred = rfc.predict(X_test)
print_metrics_for_multiclass(y_test, pred)

             precision    recall  f1-score   support

          1       0.23      0.07      0.10       426
          2       0.45      0.82      0.58      1271
          3       0.33      0.16      0.22       752
          4       0.21      0.07      0.10       397
          5       0.20      0.06      0.09       157

avg / total       0.34      0.41      0.33      3003



### GradientBoostingClassifier

- Gender

In [199]:
X_train, X_test, y_train, y_test = train_test_split(
    MATRIX_FOR_FIT, 
    target_gender, 
    test_size=0.33,
    random_state=42)

In [200]:
from sklearn.ensemble import GradientBoostingClassifier
rfc = GradientBoostingClassifier(n_estimators=100, random_state=0)
rfc.fit(X_train, y_train) 
pred = rfc.predict(X_test)
print_metrics(y_test, pred)

AUC             =====>  0.669789915966
accuracy_score  =====>  0.668331668332
precision_score =====>  0.701461377871
recall_score    =====>  0.64


- Age

In [202]:
X_train, X_test, y_train, y_test = train_test_split(
    MATRIX_FOR_FIT, 
    target_age, 
    test_size=0.33,
    random_state=42)

In [203]:
from sklearn.ensemble import GradientBoostingClassifier
rfc = GradientBoostingClassifier(n_estimators=100, random_state=0)
rfc.fit(X_train, y_train) 
pred = rfc.predict(X_test)
print_metrics_for_multiclass(y_test, pred)

             precision    recall  f1-score   support

          1       0.20      0.03      0.05       428
          2       0.43      0.89      0.58      1228
          3       0.33      0.12      0.18       761
          4       0.16      0.03      0.05       427
          5       0.16      0.03      0.05       159

avg / total       0.32      0.40      0.30      3003



### Обучаем на всем наборе

- Gender

In [254]:
rfc_gender = GradientBoostingClassifier(n_estimators=100)
rfc_gender.fit(MATRIX_FOR_FIT, target_gender) 

RandomForestClassifier(bootstrap=True, class_weight='balanced',
            criterion='gini', max_depth=None, max_features='auto',
            max_leaf_nodes=None, min_impurity_decrease=0.0,
            min_impurity_split=None, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=1000, n_jobs=-1, oob_score=False,
            random_state=None, verbose=0, warm_start=False)

- Age

In [255]:
rfc_age = GradientBoostingClassifier(n_estimators=100)
rfc_age.fit(MATRIX_FOR_FIT, target_age) 

RandomForestClassifier(bootstrap=True, class_weight='balanced',
            criterion='gini', max_depth=None, max_features='auto',
            max_leaf_nodes=None, min_impurity_decrease=0.0,
            min_impurity_split=None, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=1000, n_jobs=-1, oob_score=False,
            random_state=None, verbose=0, warm_start=False)

### PREDICT

- Gender

In [256]:
pred_gender = rfc_gender.predict(MATRIX_FOR_PREDICT)

- Age

In [257]:
pred_age = rfc_age.predict(MATRIX_FOR_PREDICT)

### Формируем результат

In [258]:
pre_result = pd.DataFrame()
pre_result['uid'] = df_predict_uid
pre_result['gender'] = pred_gender
pre_result['age'] = pred_age

#### обратная перекодировка значений

In [259]:
genders = {1 : 'M', 0 : 'F'}
pre_result["gender"] = pre_result["gender"].apply(lambda x: genders.get(x))

In [260]:
ages = {1 :'18-24', 2 :'25-34', 3 : '35-44', 4 : '45-54', 5 :'>=55'}
pre_result["age"] = pre_result["age"].apply(lambda x: ages.get(x))

#### слияние с полным списком uid

In [261]:
result = pd.DataFrame()
result['uid'] = df_predict_ALL_uid[36138:]
result = pd.merge(result, pre_result, how='left', on='uid')
result = result.fillna('-')

In [262]:
sorted_res = result.sort_values(by='uid')

In [263]:
sorted_res.to_csv('project01_gender-age.csv', sep='\t', index=False)

In [264]:
sorted_res.head()

Unnamed: 0,uid,gender,age
1675,0000e7ca-32e6-4bef-bdca-e21c025071ff,F,25-34
1676,0000f3cf-6e9a-4eab-92f4-cefdad108c83,F,25-34
1677,000381a6-0400-40f8-98c0-93a6c0852d2d,M,35-44
1678,00062d89-a3ba-40dd-9a25-212c9f429133,F,>=55
1679,0009233e-a0c2-438e-ac36-bd3e1b7dad1e,-,-
