In [1]:
import warnings
warnings.simplefilter('ignore')

In [2]:
from urllib import request as req

data_dir = '/Users/ssv/newprolab/project01'
filename = 'gender_age_dataset.txt'
file_path = '/'.join([data_dir, filename])
url = 'http://data.newprolab.com/data-newprolab-com/project01/' + filename

# req.urlretrieve(url, file_path)

In [181]:
import pandas as pd
import json
from urllib.parse import urlparse
from datetime import datetime
from sklearn import metrics
from tqdm import tqdm_notebook
from sklearn.metrics import classification_report

In [4]:
pd.set_option('display.max_columns', 200)
pd.set_option('display.max_rows', 4000)

In [5]:
df = pd.read_csv(file_path, sep='\t')

### Оцифровка значений пола и возраста

In [6]:
ages = {'18-24': 1, '25-34': 2, '35-44': 3, '45-54': 4, '>=55': 5}
df["age"] = df["age"].apply(lambda x: ages.get(x, '-'))

In [7]:
genders = {'M': 1, 'F': 0}
df["gender"] = df["gender"].apply(lambda x: genders.get(x, '-'))


### Формирование набора данных с ключевыми словами посещенных сайтов

#### Загружаем из файлов типа df_med_keywords_0000_0999.pickle по мере поступления. Скрапинг сайтов идет очень медленно

In [8]:
import pickle

In [9]:
with open('data/keywords_999.pickle', 'rb') as file:
    df_med_keywords_0000_0999 = pickle.load(file)

In [10]:
with open('data/keywords_2000_6999.pickle', 'rb') as file:
    df_med_keywords_2000_6999 = pickle.load(file)

In [11]:
with open('data/df_med_keywords_non_age.pickle', 'rb') as file:
    df_for_pred = pickle.load(file)

#### Удаляем лишние столбцы в наборе данных после скрапинга

In [12]:
df_med_keywords_0000_0999 = df_med_keywords_0000_0999.drop('url_netloc', axis=1)

In [13]:
df_med_keywords_2000_6999 = df_med_keywords_2000_6999.drop('url_netloc', axis=1)

In [14]:
df_for_pred = df_for_pred.drop(['url_netloc', 'age'], axis=1)

#### Соединяем загруженные таблицы

In [15]:
df_k = pd.merge(df_med_keywords_0000_0999, pd.merge(df_med_keywords_2000_6999, df_for_pred, how='left', on='uid'),
              how='left', on='uid')

#### Сводим ключевые слова в один столбец

In [16]:
df['keywords'] = ''
for index, row in tqdm_notebook(df_k.iterrows()):
    if index < 1000:
        df_k['raw_keywords'].loc[index] = df_k['raw_keywords'].iloc[index]
    if index >= 2000 and index < 7000:
        df_k['raw_keywords'].loc[index] = df_k['raw_keywords_x'].iloc[index]
    if index > 36137:    
        df_k['raw_keywords'].loc[index] = df_k['raw_keywords_y'].iloc[index]
    

A Jupyter Widget




#### Левое последовательное соединение таблиц по uid. Удаляем лишние столбцы после соединения 

In [17]:
df_k = df_k.drop(['raw_keywords_x', 'raw_keywords_y'], axis=1)

In [18]:
df = pd.merge(df, df_k, how='left', on='uid')

In [19]:
df = df.drop(['user_json', 'keywords'], axis=1)

####  В результате - имеем набор df['gender', 'age', 'uid', 'raw_keywords']. Ключевые слова посещенных сайтов заполнены для строк 0 - 999; 2000 - 6999 и таргет: 36138 - 41137. В заполненных строках есть пустые значения (массив длиной 0)

1. Вытащим все ключевые слова для формирования массива стоп-слов
2. Разобьем датасет на два фолда: для тренировки (0000 - 36137) и предсказания (36138 - 41137)
3. Из фолдов удалим строки в которых нет ключевых слов сайтов
4. Сохраняем таргеты и тренируем модель

In [20]:
raw_words = list(df['raw_keywords'].copy())

In [21]:
for item in tqdm_notebook(reversed(raw_words)):
    if len(item) == 0:
        raw_words.remove(item)

A Jupyter Widget




In [22]:
import pymorphy2
m = pymorphy2.MorphAnalyzer()

In [23]:
import re
GROUPING_SPACE_REGEX = re.compile(r'([^\w]|[+])', re.UNICODE)

def simple_word_tokenize(text, _split=GROUPING_SPACE_REGEX.split):
    return [t for t in _split(text.lower()) if t and not t.isspace()]

In [24]:
def token_r(text):
    words = simple_word_tokenize(text)
    return [m.parse(x)[0].normal_form for x in words if len(x) >= 4]

In [25]:
raw_words = [' '.join((x)) for x in raw_words]
clear_words = [' '.join(token_r(x)) for x in raw_words]

In [27]:
# with open('data/clear_words.pickle', 'wb') as file:
#     pickle.dump(clear_words, file) 
    
# with open('data/clear_words.pickle', 'rb') as file:
#     clear_words = pickle.load(file)

In [28]:
from sklearn.feature_extraction.text import CountVectorizer

In [29]:
cv = CountVectorizer()
matrix = cv.fit_transform(clear_words)
stop_ww = sorted(cv.vocabulary_.items(), reverse=True)[:552] + sorted(cv.vocabulary_.items())[:1451]
stop_w = [x[0] for x in stop_ww]

In [None]:
# sorted(cv.vocabulary_.items())[1451:]

In [None]:
# sorted(cv.vocabulary_.items(), reverse=True)[552:]

## Построение модели

1. Подготовка наборов данных
    - Разобьем датасет на два фолда: для тренировки (0000 - 36137) и предсказания (36138 - 41137)
    - Из фолдов удалим строки в которых нет ключевых слов сайтов

In [30]:
# Выделение набора df_train для предсказания (c известными полом и возрастом)
df_train = df.loc[0:36137]
df_train['raw_keywords'] = df_train['raw_keywords'].apply(lambda x: ' '.join(x))
df_train = df_train[df_train['raw_keywords'] != '']

In [31]:
# Выделение набора df_predict для предсказания (без известных пола и возраста)
df_predict = df.loc[36138:]

# -- ??????????????????????????
df_predict_ALL_uid = df['uid'] # все UID до удаления тех, для которых НЕ будет выполняться прогноз !!!!

df_predict['raw_keywords'] = df_predict['raw_keywords'].apply(lambda x: ' '.join(x))
df_predict = df_predict[df_predict['raw_keywords'] != '']

df_predict_uid = df_predict['uid'] # все UID для которых будет выполняться прогноз !!!!

#### Строим матрицу слов по объединенному массиву текстов из наборов df_train и df_predict

In [50]:
ROWS_FOR_FIT = df_train.shape[0]
ROWS_FOR_PREDICT = df_predict.shape[0]

In [37]:
# Объединенный массив текстов
all_texts = pd.concat([df_train['raw_keywords'], df_predict['raw_keywords']], axis=0)

In [40]:
# with open('data/all_texts.pickle', 'wb') as file:
#     pickle.dump(all_texts, file) 
    
# with open('data/all_texts.pickle', 'rb') as file:
#     all_texts = pickle.load(file)

In [41]:
cv = CountVectorizer(tokenizer=token_r, stop_words=stop_w)
matrix = cv.fit_transform(all_texts)

## TF-IDF:

In [169]:
from sklearn.feature_extraction.text import TfidfVectorizer
cv = TfidfVectorizer(tokenizer=token_r, stop_words=stop_w)
matrix = cv.fit_transform(all_texts)

In [42]:
# with open('data/matrix.pickle', 'wb') as file:
#     pickle.dump(matrix, file)     
    
# with open('data/matrix.pickle', 'rb') as file:
#     matrix = pickle.load(file)

In [170]:
MATRIX_FOR_FIT = matrix[:ROWS_FOR_FIT][:]
MATRIX_FOR_PREDICT = matrix[ROWS_FOR_FIT:][:]

In [171]:
target_age = df_train['age'].astype('int32')
target_gender = df_train['gender'].astype('int32')
df_train_uid = df_train['uid']
df_train = df_train.drop(['age', 'gender', 'uid'], axis=1)

KeyError: 'age'

In [172]:
from sklearn.cross_validation import train_test_split

In [173]:
X_train, X_test, y_train, y_test = train_test_split(
    MATRIX_FOR_FIT, 
    target_gender, 
    test_size=0.33,
    random_state=42)

In [174]:
def print_metrics(y_test, pred):
    fpr, tpr, thresholds = metrics.roc_curve(y_test, pred)
    print('AUC             =====> ', metrics.auc(fpr, tpr))
    print('accuracy_score  =====> ', metrics.accuracy_score(y_test, pred))
    print('precision_score =====> ', metrics.precision_score(y_test, pred))
    print('recall_score    =====> ', metrics.recall_score(y_test, pred))

In [182]:
def print_metrics_for_multiclass(y_test, pred):
    print(classification_report(y_test, pred))

### LogisticRegression

In [175]:
from sklearn.linear_model import LogisticRegression
lr = LogisticRegression(penalty='l1')
lr.fit(X_train, y_train)
pred = lr.predict(X_test)
print_metrics(y_test, pred)

AUC             =====>  0.674180327869
accuracy_score  =====>  0.674651810585
precision_score =====>  0.674762407603
recall_score    =====>  0.698360655738


### RandomForestClassifier

In [176]:
from sklearn.ensemble import RandomForestClassifier
rfc = RandomForestClassifier(n_estimators=100, class_weight='balanced', random_state=0, n_jobs=-1)
rfc.fit(X_train, y_train) 
pred = rfc.predict(X_test)
print_metrics(y_test, pred)

AUC             =====>  0.636832463984
accuracy_score  =====>  0.637883008357
precision_score =====>  0.632632632633
recall_score    =====>  0.690710382514


### GradientBoostingClassifier

- Gender

In [188]:
X_train, X_test, y_train, y_test = train_test_split(
    MATRIX_FOR_FIT, 
    target_gender, 
    test_size=0.33,
    random_state=42)

In [178]:
from sklearn.ensemble import GradientBoostingClassifier
rfc = GradientBoostingClassifier(n_estimators=500, random_state=0)
rfc.fit(X_train, y_train) 
pred = rfc.predict(X_test)
print_metrics(y_test, pred)

AUC             =====>  0.650313586687
accuracy_score  =====>  0.65069637883
precision_score =====>  0.653518123667
recall_score    =====>  0.669945355191


- Age

In [183]:
X_train, X_test, y_train, y_test = train_test_split(
    MATRIX_FOR_FIT, 
    target_age, 
    test_size=0.33,
    random_state=42)

In [184]:
from sklearn.ensemble import GradientBoostingClassifier
rfc = GradientBoostingClassifier(n_estimators=500, random_state=0)
rfc.fit(X_train, y_train) 
pred = rfc.predict(X_test)
print_metrics_for_multiclass(y_test, pred)

             precision    recall  f1-score   support

          1       0.15      0.06      0.09       241
          2       0.42      0.70      0.52       738
          3       0.31      0.22      0.26       472
          4       0.13      0.05      0.07       261
          5       0.13      0.05      0.07        83

avg / total       0.30      0.36      0.31      1795



### Обучаем на всем наборе

- Gender

In [158]:
rfc_gender = GradientBoostingClassifier(n_estimators=100, random_state=0)
rfc_gender.fit(MATRIX_FOR_FIT, target_gender) 

GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.1, loss='deviance', max_depth=3,
              max_features=None, max_leaf_nodes=None,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_leaf=1, min_samples_split=2,
              min_weight_fraction_leaf=0.0, n_estimators=1500,
              presort='auto', random_state=0, subsample=1.0, verbose=0,
              warm_start=False)

- Age

In [None]:
rfc_age = GradientBoostingClassifier(n_estimators=100, random_state=0)
rfc_age.fit(MATRIX_FOR_FIT, target_age) 

### PREDICT

- Gender

In [160]:
pred_gender = rfc_gender.predict(MATRIX_FOR_PREDICT)

- Age

In [161]:
pred_age = rfc_age.predict(MATRIX_FOR_PREDICT)

### Формируем результат

In [162]:
pre_result = pd.DataFrame()
pre_result['uid'] = df_predict_uid
pre_result['gender'] = pred_gender
pre_result['age'] = pred_age

#### обратная перекодировка значений

In [163]:
genders = {1 : 'M', 0 : 'F'}
pre_result["gender"] = pre_result["gender"].apply(lambda x: genders.get(x))

In [164]:
ages = {1 :'18-24', 2 :'25-34', 3 : '35-44', 4 : '45-54', 5 :'>=55'}
pre_result["age"] = pre_result["age"].apply(lambda x: ages.get(x))

#### слияние с полным списком uid

In [165]:
result = pd.DataFrame()
result['uid'] = df_predict_ALL_uid[36138:]
result = pd.merge(result, pre_result, how='left', on='uid')
result = result.fillna('-')

In [166]:
sorted_res = result.sort_values(by='uid')

In [167]:
sorted_res.to_csv('project01_gender-age.csv', sep='\t', index=False)

In [None]:
%matplotlib inline
import seaborn as sns

In [None]:
sns_plot = sns.pairplot(df[['url_counts', 'age']], size=4)

In [None]:
from pylab import rcParams
rcParams['figure.figsize'] = 16, 7
sns.distplot(df['url_counts'])

In [None]:
df['url_counts'].value_counts().head(50).plot(kind='bar')

In [None]:
rcParams['figure.figsize'] = 16, 7
sns.jointplot(df['age'], df['url_counts'], size=10)

In [None]:
%lsmagic

In [None]:
import pickle
with open('data/df_med_keywords_non_age.pickle', 'rb') as file:
    df_med_keywords_non_age = pickle.load(file)

In [None]:
df_med_keywords_non_age['raw_keywords'].loc[36138:]

In [None]:
41138 - 5000

22 + 1