In [13]:
import pandas as pd
import json
import pickle
from datetime import datetime
from sklearn import metrics
from tqdm import tqdm_notebook
from sklearn.metrics import classification_report
from sklearn.ensemble import RandomForestClassifier

In [2]:
import warnings
warnings.simplefilter('ignore')

In [3]:
pd.set_option('display.max_columns', 200)
pd.set_option('display.max_rows', 4000)

In [10]:
data_dir = '/Users/ssv/newprolab/project01/data'
filename = 'gender_age_dataset.txt'
file_path = '/'.join([data_dir, filename])
df = pd.read_csv(file_path, sep='\t')

### Оцифровка значений пола и возраста

In [11]:
ages = {'18-24': 1, '25-34': 2, '35-44': 3, '45-54': 4, '>=55': 5}
df["age"] = df["age"].apply(lambda x: ages.get(x, '-'))

In [12]:
genders = {'M': 1, 'F': 0}
df["gender"] = df["gender"].apply(lambda x: genders.get(x, '-'))


### Формирование набора данных с ключевыми словами посещенных сайтов

#### Загружаем из файлов типа df_med_keywords_0000_0999.pickle по мере поступления. Скрапинг сайтов идет очень медленно

In [14]:
with open('data/keywords_999.pickle', 'rb') as file:
    df_med_keywords_0000_0999 = pickle.load(file)

with open('data/keywords_2000_10999.pickle', 'rb') as file:
    df_med_keywords_2000_10999 = pickle.load(file)

with open('data/df_med_keywords_non_age.pickle', 'rb') as file:
    df_for_pred = pickle.load(file)

#### Удаляем лишние столбцы в наборе данных после скрапинга

In [17]:
df_med_keywords_0000_0999 = df_med_keywords_0000_0999.drop('url_netloc', axis=1)
df_med_keywords_2000_10999 = df_med_keywords_2000_10999.drop('url_netloc', axis=1)
df_for_pred = df_for_pred.drop(['url_netloc', 'age'], axis=1)

#### Соединяем загруженные таблицы

In [18]:
df_k = pd.merge(df_med_keywords_0000_0999, 
                pd.merge(df_med_keywords_2000_10999, df_for_pred, how='left', on='uid'),
                how='left', on='uid')

#### Сводим ключевые слова в один столбец

In [19]:
df['keywords'] = ''
for index, row in tqdm_notebook(df_k.iterrows()):
    if index < 1000:
        df_k['raw_keywords'].loc[index] = df_k['raw_keywords'].iloc[index]
    if index >= 2000 and index < 11000:
        df_k['raw_keywords'].loc[index] = df_k['raw_keywords_x'].iloc[index]
    if index > 36137:    
        df_k['raw_keywords'].loc[index] = df_k['raw_keywords_y'].iloc[index]
    

A Jupyter Widget




#### Левое последовательное соединение таблиц по uid. Удаляем лишние столбцы после соединения 

In [20]:
df_k = df_k.drop(['raw_keywords_x', 'raw_keywords_y'], axis=1)
df = pd.merge(df, df_k, how='left', on='uid')
df = df.drop(['user_json', 'keywords'], axis=1)

####  В результате - имеем набор df['gender', 'age', 'uid', 'raw_keywords']. Ключевые слова посещенных сайтов заполнены для строк 0 - 999; 2000 - 10999 и таргет: 36138 - 41137. В заполненных строках есть пустые значения (массив длиной 0)

1. Вытащим все ключевые слова для формирования массива стоп-слов
2. Разобьем датасет на два фолда: для тренировки (0000 - 36137) и предсказания (36138 - 41137)
3. Из фолдов удалим строки в которых нет ключевых слов сайтов
4. Сохраняем таргеты и тренируем модель

In [21]:
raw_words = list(df['raw_keywords'].copy())

In [22]:
for item in tqdm_notebook(reversed(raw_words)):
    if len(item) == 0:
        raw_words.remove(item)

A Jupyter Widget




In [23]:
import pymorphy2
m = pymorphy2.MorphAnalyzer()

In [24]:
import re
GROUPING_SPACE_REGEX = re.compile(r'([^\w]|[+])', re.UNICODE)

def simple_word_tokenize(text, _split=GROUPING_SPACE_REGEX.split):
    return [t for t in _split(text.lower()) if t and not t.isspace()]

In [25]:
def token_r(text):
    words = simple_word_tokenize(text)
    return [m.parse(x)[0].normal_form for x in words if len(x) >= 4]

In [26]:
raw_words = [' '.join((x)) for x in raw_words]

In [27]:
clear_words = [' '.join(token_r(x)) for x in tqdm_notebook(raw_words)]

A Jupyter Widget




In [28]:
# ---------------------------------------------------------
# with open('data/10999_clear_words.pickle', 'wb') as file:
#     pickle.dump(clear_words, file) 
# ---------------------------------------------------------    
# with open('data/10999_clear_words.pickle', 'rb') as file:
#     clear_words = pickle.load(file)
# ---------------------------------------------------------

In [29]:
from sklearn.feature_extraction.text import CountVectorizer

In [30]:
cv = CountVectorizer()
matrix = cv.fit_transform(clear_words)

In [31]:
stop_ww = sorted(cv.vocabulary_.items(), reverse=True)[:950] + sorted(cv.vocabulary_.items())[:1827]
stop_w = [x[0] for x in stop_ww]

In [43]:
# ---------------------------------------------------------
# with open('data/stop_w.pickle', 'wb') as file:
#     pickle.dump(stop_w, file) 
# ---------------------------------------------------------    
# with open('data/stop_w.pickle', 'rb') as file:
#     stop_w = pickle.load(file)

In [32]:
# sorted(cv.vocabulary_.items())[1827:]
# sorted(cv.vocabulary_.items(), reverse=True)[900:]

## Построение модели

1. Подготовка наборов данных
    - Разобьем датасет на два фолда: для тренировки (0000 - 36137) и предсказания (36138 - 41137)
    - Из фолдов удалим строки в которых нет ключевых слов сайтов

In [33]:
# Выделение набора df_train для предсказания (c известными полом и возрастом)
df_train = df.loc[0:36137]
df_train['raw_keywords'] = df_train['raw_keywords'].apply(lambda x: ' '.join(x))
df_train = df_train[df_train['raw_keywords'] != '']

In [34]:
# Выделение набора df_predict для предсказания (без известных пола и возраста)
df_predict = df.loc[36138:]
df_predict_ALL_uid = df['uid']         # все UID до удаления тех, для которых НЕ будет выполняться прогноз !!!!
df_predict['raw_keywords'] = df_predict['raw_keywords'].apply(lambda x: ' '.join(x))
df_predict = df_predict[df_predict['raw_keywords'] != '']
df_predict_uid = df_predict['uid']      # все UID для которых будет выполняться прогноз !!!!

In [35]:
# TARGETS
target_age = df_train['age'].astype('int32')
target_gender = df_train['gender'].astype('int32')
df_train_uid = df_train['uid']
df_train = df_train.drop(['age', 'gender', 'uid'], axis=1)

In [40]:
TARGETS_FOR_SAVE = [target_age, target_gender, df_train_uid, df_predict_ALL_uid]

In [39]:
# with open('data/df_train.pickle', 'wb') as file:
#     pickle.dump(df_train, file) 
# with open('data/df_predict.pickle', 'wb') as file:
#     pickle.dump(df_predict, file) 
# with open('data/TARGETS_FOR_SAVE.pickle', 'wb') as file:
#     pickle.dump(TARGETS_FOR_SAVE, file) 

#### Строим матрицу слов по объединенному массиву текстов из наборов df_train и df_predict

In [45]:
ROWS_FOR_FIT = df_train.shape[0]
ROWS_FOR_PREDICT = df_predict.shape[0]

In [46]:
# Объединенный массив текстов
all_texts = pd.concat([df_train['raw_keywords'], df_predict['raw_keywords']], axis=0)

In [177]:
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(tokenizer=token_r, stop_words=stop_w)
matrix = cv.fit_transform(all_texts)

In [49]:
# from sklearn.feature_extraction.text import TfidfVectorizer
# cv = TfidfVectorizer(tokenizer=token_r, stop_words=stop_w)
# matrix = cv.fit_transform(all_texts)

In [178]:
# with open('data/10999_matrix.pickle', 'wb') as file:
#     pickle.dump(matrix, file)     
    
# with open('data/10999_matrix.pickle', 'rb') as file:
#     matrix = pickle.load(file)

In [44]:
# READ DATA 1
# ---------------------------------------------------------    
# with open('data/stop_w.pickle', 'rb') as file:
#     stop_w = pickle.load(file)
# ---------------------------------------------------------
# with open('data/df_train.pickle', 'rb') as file:
#     df_train = pickle.load(file)
# ---------------------------------------------------------
# with open('data/df_predict.pickle', 'rb') as file:
#     df_predict = pickle.load(file)
# ---------------------------------------------------------
# with open('data/TARGETS_FOR_SAVE.pickle', 'rb') as file:
#     TARGETS_FOR_SAVE = pickle.load(file)
# target_age         = TARGETS_FOR_SAVE[0]
# target_gender      = TARGETS_FOR_SAVE[1]
# df_train_uid       = TARGETS_FOR_SAVE[2]
# df_predict_ALL_uid = TARGETS_FOR_SAVE[3]
# ---------------------------------------------------------
# with open('data/10999_matrix.pickle', 'rb') as file:
#     matrix = pickle.load(file)

In [179]:
MATRIX_FOR_FIT = matrix[:ROWS_FOR_FIT][:]
MATRIX_FOR_PREDICT = matrix[ROWS_FOR_FIT:][:]

In [52]:
def print_metrics(y_test, pred):
    fpr, tpr, thresholds = metrics.roc_curve(y_test, pred)
    print('AUC             =====> ', metrics.auc(fpr, tpr))
    print('accuracy_score  =====> ', metrics.accuracy_score(y_test, pred))
    print('precision_score =====> ', metrics.precision_score(y_test, pred))
    print('recall_score    =====> ', metrics.recall_score(y_test, pred))

In [53]:
def print_metrics_for_multiclass(y_test, pred):
    print(classification_report(y_test, pred))

In [54]:
from sklearn.cross_validation import train_test_split

### LogisticRegression

In [180]:
X_train, X_test, y_train, y_test = train_test_split(
    MATRIX_FOR_FIT, 
    target_gender, 
    test_size=0.33,
    random_state=40)

In [187]:
from sklearn.linear_model import LogisticRegression
lr = LogisticRegression(penalty='l2', C=0.001)
lr.fit(X_train, y_train)
pred = lr.predict(X_test)
print_metrics(y_test, pred)

AUC             =====>  0.672177850265
accuracy_score  =====>  0.67365967366
precision_score =====>  0.675807434491
recall_score    =====>  0.712267180475


In [188]:
print('TRAIN dataset accuracy {:.5f}'.format(lr.score(X_train, y_train)))
print('TEST dataset accuracy  {:.5f}'.format(lr.score(X_test, y_test)))

TRAIN dataset accuracy 0.72518
TEST dataset accuracy  0.67366


In [190]:
X_train, X_test, y_train, y_test = train_test_split(
    MATRIX_FOR_FIT, 
    target_age, 
    test_size=0.33,
    random_state=40)

In [193]:
lr = LogisticRegression(penalty='l2', class_weight='balanced', C=0.001)
lr.fit(X_train, y_train)
pred = lr.predict(X_test)
print_metrics_for_multiclass(y_test, pred)

             precision    recall  f1-score   support

          1       0.23      0.10      0.14       426
          2       0.47      0.64      0.55      1271
          3       0.33      0.24      0.28       752
          4       0.20      0.17      0.18       397
          5       0.15      0.21      0.17       157

avg / total       0.35      0.38      0.35      3003



In [194]:
print('TRAIN dataset accuracy {:.5f}'.format(lr.score(X_train, y_train)))
print('TEST dataset accuracy  {:.5f}'.format(lr.score(X_test, y_test)))

TRAIN dataset accuracy 0.52272
TEST dataset accuracy  0.37829


## Линейная регрессия

In [195]:
X_train, X_test, y_train, y_test = train_test_split(
    MATRIX_FOR_FIT, 
    target_gender, 
    test_size=0.33,
    random_state=40)

In [196]:
from sklearn.linear_model import LinearRegression
lr = LinearRegression()
lr.fit(X_train, y_train)
pred = lr.predict(X_test)
fpr, tpr, thresholds = metrics.roc_curve(y_test, pred)
print('AUC             =====> ', metrics.auc(fpr, tpr))

AUC             =====>  0.553714496882


In [197]:
print('TRAIN dataset accuracy {:.5f}'.format(lr.score(X_train, y_train)))
print('TEST dataset accuracy  {:.5f}'.format(lr.score(X_test, y_test)))

TRAIN dataset accuracy 0.83015
TEST dataset accuracy  -42.18202


In [198]:
X_train, X_test, y_train, y_test = train_test_split(
    MATRIX_FOR_FIT, 
    target_age, 
    test_size=0.33,
    random_state=40)

In [199]:
lr = LinearRegression()
lr.fit(X_train, y_train)
pred = lr.predict(X_test)

In [200]:
print('TRAIN dataset accuracy {:.5f}'.format(lr.score(X_train, y_train)))
print('TEST dataset accuracy  {:.5f}'.format(lr.score(X_test, y_test)))

TRAIN dataset accuracy 0.82704
TEST dataset accuracy  -39.58727


## Байес

In [201]:
X_train, X_test, y_train, y_test = train_test_split(
    MATRIX_FOR_FIT, 
    target_gender, 
    test_size=0.33,
    random_state=40)

In [202]:
from sklearn.naive_bayes import MultinomialNB
lr = MultinomialNB(alpha=0.1)
lr.fit(X_train, y_train)
pred = lr.predict(X_test)
print_metrics(y_test, pred)

AUC             =====>  0.682276801062
accuracy_score  =====>  0.681984681985
precision_score =====>  0.700934579439
recall_score    =====>  0.674373795761


In [203]:
print('TRAIN dataset accuracy {:.5f}'.format(lr.score(X_train, y_train)))
print('TEST  dataset accuracy {:.5f}'.format(lr.score(X_test, y_test)))

TRAIN dataset accuracy 0.77293
TEST  dataset accuracy 0.68198


In [204]:
X_train, X_test, y_train, y_test = train_test_split(
    MATRIX_FOR_FIT, 
    target_age, 
    test_size=0.33,
    random_state=40)

In [205]:
lr = MultinomialNB(alpha=0.01)
lr.fit(X_train, y_train)
pred = lr.predict(X_test)
print_metrics_for_multiclass(y_test, pred)

             precision    recall  f1-score   support

          1       0.20      0.26      0.22       426
          2       0.46      0.33      0.38      1271
          3       0.30      0.26      0.27       752
          4       0.16      0.17      0.16       397
          5       0.11      0.33      0.17       157

avg / total       0.32      0.28      0.29      3003



In [206]:
print('TRAIN dataset accuracy {:.5f}'.format(lr.score(X_train, y_train)))
print('TEST  dataset accuracy {:.5f}'.format(lr.score(X_test, y_test)))

TRAIN dataset accuracy 0.60574
TEST  dataset accuracy 0.27905


## Кросс-валидация

In [205]:
from sklearn.grid_search import GridSearchCV

In [217]:
c = [10**x for x in range(-10, 16)]

In [218]:
gs = GridSearchCV(
        LogisticRegression(),
        param_grid = { 'penalty' : ['l1','l2'], 'C' : c},
        scoring='roc_auc',
        cv=5,
        verbose=3)

In [None]:
gs.fit(X_train, y_train)

In [209]:
gs.best_estimator_

LogisticRegression(C=1, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [210]:
gs.best_params_

{'C': 1, 'penalty': 'l2'}

In [211]:
gs.best_score_

0.7343916290444319

In [212]:
X_train, X_test, y_train, y_test = train_test_split(
    MATRIX_FOR_FIT, 
    target_gender, 
    test_size=0.33,
    random_state=40)

lr = LogisticRegression(C=1, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)
lr.fit(X_train, y_train)
pred = lr.predict(X_test)
print_metrics(y_test, pred)

AUC             =====>  0.687047119554
accuracy_score  =====>  0.688311688312
precision_score =====>  0.691076923077
recall_score    =====>  0.721258831085


### RandomForestClassifier

In [207]:
X_train, X_test, y_train, y_test = train_test_split(
    MATRIX_FOR_FIT, 
    target_gender, 
    test_size=0.33,
    random_state=40)

In [208]:
from sklearn.ensemble import RandomForestClassifier
rfc = RandomForestClassifier(n_estimators=100, max_depth=4, class_weight='balanced', random_state=0, n_jobs=-1)
rfc.fit(X_train, y_train) 
pred = rfc.predict(X_test)
print_metrics(y_test, pred)

AUC             =====>  0.65563963575
accuracy_score  =====>  0.65034965035
precision_score =====>  0.732782369146
recall_score    =====>  0.512524084778


In [209]:
print('TRAIN dataset accuracy {:.5f}'.format(lr.score(X_train, y_train)))
print('TEST  dataset accuracy {:.5f}'.format(lr.score(X_test, y_test)))

TRAIN dataset accuracy 0.09582
TEST  dataset accuracy 0.09324


In [210]:
X_train, X_test, y_train, y_test = train_test_split(
    MATRIX_FOR_FIT, 
    target_age, 
    test_size=0.33,
    random_state=40)

In [213]:
rfc = RandomForestClassifier(n_estimators=100, class_weight='balanced', random_state=0, n_jobs=-1)
rfc.fit(X_train, y_train) 
pred = rfc.predict(X_test)
print_metrics_for_multiclass(y_test, pred)

             precision    recall  f1-score   support

          1       0.19      0.08      0.12       426
          2       0.45      0.75      0.56      1271
          3       0.28      0.16      0.20       752
          4       0.20      0.10      0.14       397
          5       0.16      0.05      0.08       157

avg / total       0.32      0.39      0.33      3003



In [214]:
print('TRAIN dataset accuracy {:.5f}'.format(lr.score(X_train, y_train)))
print('TEST  dataset accuracy {:.5f}'.format(lr.score(X_test, y_test)))

TRAIN dataset accuracy 0.60574
TEST  dataset accuracy 0.27905


### GradientBoostingClassifier

- Gender

In [215]:
X_train, X_test, y_train, y_test = train_test_split(
    MATRIX_FOR_FIT, 
    target_gender, 
    test_size=0.33,
    random_state=42)

In [220]:
from sklearn.ensemble import GradientBoostingClassifier
rfc = GradientBoostingClassifier(n_estimators=100, learning_rate=0.01, random_state=0)
rfc.fit(X_train, y_train) 
pred = rfc.predict(X_test)
print_metrics(y_test, pred)

AUC             =====>  0.498510737628
accuracy_score  =====>  0.522810522811
precision_score =====>  0.523729946524
recall_score    =====>  0.994920634921


In [221]:
print('TRAIN dataset accuracy {:.5f}'.format(lr.score(X_train, y_train)))
print('TEST  dataset accuracy {:.5f}'.format(lr.score(X_test, y_test)))



TRAIN dataset accuracy 0.09582
TEST  dataset accuracy 0.09324


In [222]:
from sklearn.model_selection import cross_val_score
scores = cross_val_score(rfc, X_train, y_train)
print(scores.mean())   
scores = cross_val_score(rfc, X_test, y_test)
print(scores.mean())

0.538970033993
0.536463536464


In [None]:
0.6767856867512404

In [219]:
scores = cross_val_score(rfc, X_test, y_test)
print(scores.mean())

0.650016650017


In [None]:
0.650016650017

- Age

In [202]:
X_train, X_test, y_train, y_test = train_test_split(
    MATRIX_FOR_FIT, 
    target_age, 
    test_size=0.33,
    random_state=42)

In [203]:
from sklearn.ensemble import GradientBoostingClassifier
rfc = GradientBoostingClassifier(n_estimators=100, random_state=0)
rfc.fit(X_train, y_train) 
pred = rfc.predict(X_test)
print_metrics_for_multiclass(y_test, pred)

             precision    recall  f1-score   support

          1       0.20      0.03      0.05       428
          2       0.43      0.89      0.58      1228
          3       0.33      0.12      0.18       761
          4       0.16      0.03      0.05       427
          5       0.16      0.03      0.05       159

avg / total       0.32      0.40      0.30      3003



### Обучаем на всем наборе

- Gender

In [254]:
rfc_gender = GradientBoostingClassifier(n_estimators=100)
rfc_gender.fit(MATRIX_FOR_FIT, target_gender) 

RandomForestClassifier(bootstrap=True, class_weight='balanced',
            criterion='gini', max_depth=None, max_features='auto',
            max_leaf_nodes=None, min_impurity_decrease=0.0,
            min_impurity_split=None, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=1000, n_jobs=-1, oob_score=False,
            random_state=None, verbose=0, warm_start=False)

- Age

In [255]:
rfc_age = GradientBoostingClassifier(n_estimators=100)
rfc_age.fit(MATRIX_FOR_FIT, target_age) 

RandomForestClassifier(bootstrap=True, class_weight='balanced',
            criterion='gini', max_depth=None, max_features='auto',
            max_leaf_nodes=None, min_impurity_decrease=0.0,
            min_impurity_split=None, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=1000, n_jobs=-1, oob_score=False,
            random_state=None, verbose=0, warm_start=False)

### PREDICT

- Gender

In [256]:
pred_gender = rfc_gender.predict(MATRIX_FOR_PREDICT)

- Age

In [257]:
pred_age = rfc_age.predict(MATRIX_FOR_PREDICT)

### Формируем результат

In [258]:
pre_result = pd.DataFrame()
pre_result['uid'] = df_predict_uid
pre_result['gender'] = pred_gender
pre_result['age'] = pred_age

#### обратная перекодировка значений

In [259]:
genders = {1 : 'M', 0 : 'F'}
pre_result["gender"] = pre_result["gender"].apply(lambda x: genders.get(x))

In [260]:
ages = {1 :'18-24', 2 :'25-34', 3 : '35-44', 4 : '45-54', 5 :'>=55'}
pre_result["age"] = pre_result["age"].apply(lambda x: ages.get(x))

#### слияние с полным списком uid

In [261]:
result = pd.DataFrame()
result['uid'] = df_predict_ALL_uid[36138:]
result = pd.merge(result, pre_result, how='left', on='uid')
result = result.fillna('-')

In [262]:
sorted_res = result.sort_values(by='uid')

In [263]:
sorted_res.to_csv('project01_gender-age.csv', sep='\t', index=False)

In [264]:
sorted_res.head()

Unnamed: 0,uid,gender,age
1675,0000e7ca-32e6-4bef-bdca-e21c025071ff,F,25-34
1676,0000f3cf-6e9a-4eab-92f4-cefdad108c83,F,25-34
1677,000381a6-0400-40f8-98c0-93a6c0852d2d,M,35-44
1678,00062d89-a3ba-40dd-9a25-212c9f429133,F,>=55
1679,0009233e-a0c2-438e-ac36-bd3e1b7dad1e,-,-
