In [None]:
import warnings
warnings.simplefilter('ignore')

In [None]:
from urllib import request as req

data_dir = '/Users/ssv/newprolab/project01'
filename = 'gender_age_dataset.txt'
file_path = '/'.join([data_dir, filename])
url = 'http://data.newprolab.com/data-newprolab-com/project01/' + filename

# req.urlretrieve(url, file_path)

In [None]:
import pandas as pd
import json
from urllib.parse import urlparse
from datetime import datetime

from tqdm import tqdm_notebook

In [None]:
pd.set_option('display.max_columns', 200)
pd.set_option('display.max_rows', 4000)

In [None]:
df = pd.read_csv(file_path, sep='\t')

### Оцифровка значений пола и возраста

In [None]:
ages = {'18-24': 1, '25-34': 2, '35-44': 3, '45-54': 4, '>=55': 5}
df["age"] = df["age"].apply(lambda x: ages.get(x, '-'))

In [None]:
genders = {'M': 1, 'F': 0}
df["gender"] = df["gender"].apply(lambda x: genders.get(x, '-'))


### Формирование набора данных с ключевыми словами посещенных сайтов

#### Загружаем из файлов типа df_med_keywords_0000_0999.pickle по мере поступления. Скрапинг сайтов идет очень медленно

In [None]:
import pickle

In [None]:
with open('data/keywords_999.pickle', 'rb') as file:
    df_med_keywords_0000_0999 = pickle.load(file)

In [None]:
with open('data/keywords_2000_6999.pickle', 'rb') as file:
    df_med_keywords_2000_6999 = pickle.load(file)

In [None]:
with open('data/df_med_keywords_non_age.pickle', 'rb') as file:
    df_for_pred = pickle.load(file)

#### Удаляем лишние столбцы в наборе данных после скрапинга

In [None]:
df_med_keywords_0000_0999 = df_med_keywords_0000_0999.drop('url_netloc', axis=1)

In [None]:
df_med_keywords_2000_6999 = df_med_keywords_2000_6999.drop('url_netloc', axis=1)

In [None]:
df_for_pred = df_for_pred.drop(['url_netloc', 'age'], axis=1)

#### Соединяем загруженные таблицы

In [None]:
df_k = pd.merge(df_med_keywords_0000_0999, pd.merge(df_med_keywords_2000_6999, df_for_pred, how='left', on='uid'),
              how='left', on='uid')

#### Сводим ключевые слова в один столбец

In [None]:
df['keywords'] = ''
for index, row in tqdm_notebook(df_k.iterrows()):
    if index < 1000:
        df_k['raw_keywords'].loc[index] = df_k['raw_keywords'].iloc[index]
    if index >= 2000 and index < 7000:
        df_k['raw_keywords'].loc[index] = df_k['raw_keywords_x'].iloc[index]
    if index > 36137:    
        df_k['raw_keywords'].loc[index] = df_k['raw_keywords_y'].iloc[index]
    

#### Левое последовательное соединение таблиц по uid. Удаляем лишние столбцы после соединения 

In [None]:
df_k = df_k.drop(['raw_keywords_x', 'raw_keywords_y'], axis=1)

In [None]:
df = pd.merge(df, df_k, how='left', on='uid')

In [None]:
df = df.drop(['user_json', 'keywords'], axis=1)

####  В результате - имеем набор df['gender', 'age', 'uid', 'raw_keywords']. Ключевые слова посещенных сайтов заполнены для строк 0 - 999; 2000 - 6999 и таргет: 36138 - 41137. В заполненных строках есть пустые значения (массив длиной 0)

1. Вытащим все ключевые слова для формирования массива стоп-слов
2. Разобьем датасет на два фолда: для тренировки (0000 - 36137) и предсказания (36138 - 41137)
3. Из фолдов удалим строки в которых нет ключевых слов сайтов
4. Сохраняем таргеты и тренируем модель

In [None]:
raw_words = list(df['raw_keywords'].copy())

In [None]:
for item in tqdm_notebook(reversed(raw_words)):
    if len(item) == 0:
        raw_words.remove(item)

In [None]:
import pymorphy2
m = pymorphy2.MorphAnalyzer()

In [None]:
import re
GROUPING_SPACE_REGEX = re.compile(r'([^\w]|[+])', re.UNICODE)

def simple_word_tokenize(text, _split=GROUPING_SPACE_REGEX.split):
    return [t for t in _split(text.lower()) if t and not t.isspace()]

In [None]:
def token_r(text):
    words = simple_word_tokenize(text)
    return [m.parse(x)[0].normal_form for x in words if len(x) >= 4]

In [None]:
raw_words = [' '.join((x)) for x in raw_words]

In [None]:
clear_words = [' '.join(token_r(x)) for x in raw_words]

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

In [None]:
cv = CountVectorizer()
matrix = cv.fit_transform(clear_words)
stop_ww = sorted(cv.vocabulary_.items(), reverse=True)[:552] + sorted(cv.vocabulary_.items())[:1451]
stop_w = [x[0] for x in stop_ww]

In [None]:
# sorted(cv.vocabulary_.items())[1451:]

In [None]:
# sorted(cv.vocabulary_.items(), reverse=True)[552:]

## Построение модели

1. Подготовка наборов данных
    - Разобьем датасет на два фолда: для тренировки (0000 - 36137) и предсказания (36138 - 41137)
    - Из фолдов удалим строки в которых нет ключевых слов сайтов

In [None]:
df_train = df.loc[0:36137]

In [None]:
df_train['raw_keywords'] = df_train['raw_keywords'].apply(lambda x: ' '.join(x))

In [None]:
df_train = df_train[df_train['raw_keywords'] != '']

In [None]:
df_train.columns

In [None]:
df_train['raw_keywords']

#### Строим модель для обучения

In [None]:
target_age = df_train['age']
target_gender = df_train['gender']
df_train_uid = df_train['uid']
df_train = df_train.drop(['age', 'gender', 'uid'], axis=1)

In [None]:
cv = CountVectorizer(tokenizer=token_r, stop_words=stop_w)
matrix = cv.fit_transform(df_train['raw_keywords'])

In [None]:
matrix[0][0]

In [None]:
matrix.shape

In [None]:
with open('data/matrix.pickle', 'rb') as file:
    matrix = pickle.load(file)

In [None]:
with open('data/matrix.pickle', 'wb') as file:
    pickle.dump(matrix, file)     

In [None]:
print(matrix[1])

In [582]:
target_gender = target_gender.astype('int32')
target_age = target_age.astype('int32')

In [583]:
from sklearn.cross_validation import train_test_split

In [589]:
X_train, X_test, y_train, y_test = train_test_split(
    matrix, 
    target_gender, 
    test_size=0.33,
    random_state=42)

### LogisticRegression

In [590]:
from sklearn.linear_model import LogisticRegression
lr = LogisticRegression(penalty='l1')
lr.fit(X_train, y_train)
pred = lr.predict(X_test)

### RandomForestClassifier

In [602]:
from sklearn.ensemble import RandomForestClassifier
rfc = RandomForestClassifier(n_estimators=100, class_weight='balanced', random_state=0, n_jobs=-1)
rfc.fit(X_train, y_train) 
pred = rfc.predict(X_test)

In [None]:
from sklearn.ensemble import RandomForestClassifier
rfc = GradientBoostingClassifier(n_estimators=100, class_weight='balanced', random_state=0, n_jobs=-1)
rfc.fit(X_train, y_train) 
pred = rfc.predict(X_test)

In [603]:
from sklearn import metrics
fpr, tpr, thresholds = metrics.roc_curve(y_test, pred)

In [604]:
metrics.auc(fpr, tpr)

0.63079980129160451

In [606]:
metrics.accuracy_score(y_test, pred)

0.63175487465181057

In [607]:
metrics.precision_score(y_test, pred)

0.62828282828282833

In [608]:
metrics.recall_score(y_test, pred)

0.67978142076502734

In [None]:
df_predict = df.loc[36138:]

In [None]:
df_predict_ALL_uid = df['uid'] # все UID до удаления тех, для которых НЕ будет выполняться прогноз !!!!

In [None]:
df_predict['raw_keywords'] = df_predict['raw_keywords'].apply(lambda x: ' '.join(x))

In [None]:
df_predict = df_predict[df_predict['raw_keywords'] != '']

#### Сохраняем столбцы age, gender - как будущий target для модели и столбец url_counts - как возможную фичу

In [None]:
target_age = df_predict['age']
target_gender = df_predict['gender']
df_predict_uid = df_predict['uid']
df_predict = df_predict.drop(['age', 'gender', 'uid'], axis=1)

In [None]:
futures = 

In [None]:
df['raw_keywords']

In [None]:
target_age = df['age']
target_gender = df['gender']
url_counts = df['url_counts']

In [None]:
df = df.drop(['age', 'gender', 'url_counts'], axis=1)

In [None]:
df.columns

In [None]:
%matplotlib inline
import seaborn as sns

In [None]:
sns_plot = sns.pairplot(df[['url_counts', 'age']], size=4)

In [None]:
from pylab import rcParams
rcParams['figure.figsize'] = 16, 7
sns.distplot(df['url_counts'])

In [None]:
df['url_counts'].value_counts().head(50).plot(kind='bar')

In [None]:
rcParams['figure.figsize'] = 16, 7
sns.jointplot(df['age'], df['url_counts'], size=10)

In [None]:
%lsmagic

In [None]:
import pickle
with open('data/df_med_keywords_non_age.pickle', 'rb') as file:
    df_med_keywords_non_age = pickle.load(file)

In [None]:
df_med_keywords_non_age['raw_keywords'].loc[36138:]

In [None]:
41138 - 5000