In [1]:
import numpy as np
import pandas as pd
from collections import namedtuple
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.cross_validation import cross_val_score
from sklearn.feature_extraction.text import HashingVectorizer
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

## Подготовка данных

In [2]:
def _read_url_pairs(path):
    """Read a tab-separated, header-less (id, url, count) file and keep only id and url."""
    frame = pd.read_csv(path, header=None, delimiter='\t')
    frame.columns = ['id', 'url', 'count']
    return frame[['id', 'url']]


# Same loader for both splits: visited domains of the train and test users.
urls_train_df = _read_url_pairs('kaggle_data/url_domain_train')
urls_test_df = _read_url_pairs('kaggle_data/url_domain_test')

In [3]:
# Preview the first rows of the loaded url frames. Only the value of the last
# expression in a cell is rendered, so the train preview is evaluated but not shown.
urls_train_df.head()
urls_test_df.head()

Unnamed: 0,id,url
0,0000000151004FF4ADD746DA10685A01,afisha.ru
1,0000000151004FF4ADD746DA10685A01,aif.ru
2,0000000151004FF4ADD746DA10685A01,aimfar.solution.weborama.fr
3,0000000151004FF4ADD746DA10685A01,alkotest.ru
4,0000000151004FF4ADD746DA10685A01,aptekamos.ru


In [4]:
def _urls_per_user(url_rows):
    """Return one row per id holding the list of that id's urls, columns ordered (urls, id)."""
    per_user = url_rows.groupby('id')['url'].apply(lambda urls: urls.tolist())
    frame = per_user.to_frame('urls')
    # Copy the group key out of the index into a regular column, then renumber the rows.
    frame['id'] = frame.index
    return frame.reset_index(drop=True)


# Collapse the (id, url) rows of both splits into one list of urls per user.
urls_train_df = _urls_per_user(urls_train_df)
urls_test_df = _urls_per_user(urls_test_df)

In [5]:
# Preview the per-user url lists. Only the last expression of a cell is rendered,
# so the train preview is evaluated but not shown.
urls_train_df.head()
urls_test_df.head()

Unnamed: 0,urls,id
0,"[1000bankov.ru, 1tv.ru, 4put.ru, argumenti.ru,...",000000014A02348E701552980349FF01
1,"[autorambler.ru, bilettorg.ru, dsol-druzhba.ru...",000000014A10EA183BF8594A0B2AB201
2,"[photosight.ru, rambler.ru]",000000014A4FE5C33A929D4C26943601
3,"[base.consultant.ru, dogovor-obrazets.ru, fd.r...",000000014B7BB9957784A9BC0AC9F401
4,"[assessor.ru, audit-it.ru, base.garant.ru, com...",000000014C7749F896D82C2B01E8B801


In [6]:
# Age labels of the training users: tab-separated file without a header row,
# so the two columns are named explicitly after loading.
age_train_df = pd.read_csv('kaggle_data/age_profile_train', sep='\t', header=None)
age_train_df.columns = ['id', 'age']

In [7]:
# Preview the (id, age) label rows.
age_train_df.head()

Unnamed: 0,id,age
0,000000013CB5719C0000A2C90002C101,53
1,00000001442BE24000001B7D00F50801,48
2,00000001448580F800003F1B31FB0901,28
3,0000000145BDB2FF000157971645E901,44
4,000000014602771F0000DB9359714C01,48


In [8]:
# Attach the age label to every user that has url data. The left join keeps users
# without a label too (their age would be NaN).
train_df = pd.merge(urls_train_df, age_train_df, how='left', on='id')

In [9]:
# Preview the merged training frame (url lists, id, age).
train_df.head()

Unnamed: 0,urls,id,age
0,"[id.rambler.ru, mail.rambler.ru, r0.ru]",000000013CB5719C0000A2C90002C101,53
1,"[1prime.ru, autorambler.ru, chellak.ru, docs.c...",00000001442BE24000001B7D00F50801,48
2,[bosch-korolev.ru],00000001448580F800003F1B31FB0901,28
3,"[aptekanizkihcen.ua, colady.ru, gorod.dp.ua, i...",0000000145BDB2FF000157971645E901,44
4,"[astrorok.ru, diets.ru, edaplus.info, eshzdoro...",000000014602771F0000DB9359714C01,48


In [10]:
# Page titles of the train and test users: tab-separated files without a header row.
title_train_df, title_test_df = (
    pd.read_csv(title_path, sep='\t', header=None)
    for title_path in ('kaggle_data/title_unify_train', 'kaggle_data/title_unify_test')
)

In [11]:
# Name the three raw columns of both splits, then keep only id and title
# (the count column is not used afterwards).
for titles_frame in (title_train_df, title_test_df):
    titles_frame.columns = ['id', 'title', 'count']
title_train_df = title_train_df[['id', 'title']]
title_test_df = title_test_df[['id', 'title']]
# Only the last expression is rendered, i.e. the test preview.
title_train_df.head()
title_test_df.head()

Unnamed: 0,id,title
0,0000000151790DCC1E8322AF0B6FA701,20-километровый амур китай мост недвижимость п...
1,0000000151790DCC1E8322AF0B6FA701,24-х 34-х до договор неделя новость предложить...
2,0000000151790DCC1E8322AF0B6FA701,3xl armour compress heatgear long size sleev s...
3,0000000151790DCC1E8322AF0B6FA701,4-е ca зачёт медальный место новость ои-2012 п...
4,0000000151790DCC1E8322AF0B6FA701,4-колёсный moi-bebik oregon oscar ru коляска к...


In [12]:
# title train: one row per id (kept as the index) holding the list of that user's titles
title_train_df = (
    title_train_df.groupby('id')['title']
    .apply(lambda titles: titles.tolist())
    .to_frame('titles')
)
# title test: same aggregation
title_test_df = (
    title_test_df.groupby('id')['title']
    .apply(lambda titles: titles.tolist())
    .to_frame('titles')
)

In [13]:
# title train: copy the id index into a regular column and renumber the rows from 0
title_train_df = title_train_df.assign(id=title_train_df.index).reset_index(drop=True)
title_train_df.columns = ['titles', 'id']
# title test: same reshaping
title_test_df = title_test_df.assign(id=title_test_df.index).reset_index(drop=True)
title_test_df.columns = ['titles', 'id']
# Only the last expression is rendered, i.e. the test preview.
title_train_df.head()
title_test_df.head()

Unnamed: 0,titles,id
0,[11-й гектар дом коммунизм на набережная недос...,000000014A02348E701552980349FF01
1,[20-летний выглядеть девушка как королева ната...,000000014A10EA183BF8594A0B2AB201
2,[медийный портал рамблер],000000014A4FE5C33A929D4C26943601
3,"[бесплатно код онлайн срочно статистика, или и...",000000014B7BB9957784A9BC0AC9F401
4,[bank privat автокредитование банковский банко...,000000014C7749F896D82C2B01E8B801


In [14]:
# Join the per-user titles with the url lists (plus the age label for train).
# The default inner join keeps only ids that appear in both frames.
train_df = pd.merge(title_train_df, train_df, on='id')
test_df = pd.merge(title_test_df, urls_test_df, on='id')
# Only the last expression is rendered, i.e. the test preview.
train_df.head()
test_df.head()

Unnamed: 0,titles,id,urls
0,[11-й гектар дом коммунизм на набережная недос...,000000014A02348E701552980349FF01,"[1000bankov.ru, 1tv.ru, 4put.ru, argumenti.ru,..."
1,[20-летний выглядеть девушка как королева ната...,000000014A10EA183BF8594A0B2AB201,"[autorambler.ru, bilettorg.ru, dsol-druzhba.ru..."
2,[медийный портал рамблер],000000014A4FE5C33A929D4C26943601,"[photosight.ru, rambler.ru]"
3,"[бесплатно код онлайн срочно статистика, или и...",000000014B7BB9957784A9BC0AC9F401,"[base.consultant.ru, dogovor-obrazets.ru, fd.r..."
4,[bank privat автокредитование банковский банко...,000000014C7749F896D82C2B01E8B801,"[assessor.ru, audit-it.ru, base.garant.ru, com..."


## Снижение размерности

In [28]:
# Raw documents for doc2vec: each user's list of title phrases (the urls are not used).
# The column values are taken directly as a 1-D object array of lists. The previous
# np.array(list(map(lambda x: x[1], zip(urls, titles)))) only selected the titles anyway,
# and converting ragged lists with np.array raises ValueError on NumPy >= 1.24 (or
# silently becomes a 2-D string array when all users have the same number of titles).
X = train_df.titles.values
X_test = test_df.titles.values

In [31]:
# Regression target: the age of every training user, in train_df row order.
y = train_df['age'].values

In [32]:
# Split every title phrase into its space-separated words, giving one list of
# word lists per user.
Z = [[phrase.split(" ") for phrase in phrases] for phrases in X]
Z_test = [[phrase.split(" ") for phrase in phrases] for phrases in X_test]

In [33]:
def _flatten(word_lists):
    """Concatenate a list of word lists into one flat list of words, preserving order."""
    words = []
    for chunk in word_lists:
        words.extend(chunk)
    return words


# One flat bag of words per user (all of the user's title phrases joined together).
T = [_flatten(doc) for doc in Z]
T_test = [_flatten(doc) for doc in Z_test]

In [34]:
# Wrap every user's words in a TaggedDocument for doc2vec. Tags are consecutive
# integers: train documents get 0..len(Y)-1 and test documents continue from len(Y),
# so the two splits can later be separated by position.
Y = [TaggedDocument(words=words, tags=[idx]) for idx, words in enumerate(T)]
Y_test = [
    TaggedDocument(words=words, tags=[idx])
    for idx, words in enumerate(T_test, start=len(Y))
]

In [36]:
# Train a single doc2vec model on the train and test documents together, so that
# both splits receive document vectors from the same model.
# NOTE(review): `size` looks like the keyword of older gensim releases; newer ones
# appear to call it `vector_size` — confirm against the installed version.
corpus = Y + Y_test
doc2vec = Doc2Vec(corpus, size=1000, window=8, min_count=5, workers=4)

In [37]:
# Sanity check: shape of the learned document vectors stacked into one array.
np.array(doc2vec.docvecs).shape

(134045, 1000)

In [38]:
# Stack the document vectors once, then split by position: the first len(Y) rows
# belong to the train documents, the remaining rows to the test documents.
doc_vectors = np.array(doc2vec.docvecs)
n_train_docs = len(Y)
C, C_test = doc_vectors[:n_train_docs], doc_vectors[n_train_docs:]

In [27]:
# Inspect the vector of the first training document.
C[0]

array([ -4.02741646e-03,   5.17490553e-04,  -4.23386646e-03,
         3.95179540e-03,  -6.67527830e-03,  -1.09198103e-02,
        -1.82472472e-03,   3.94937210e-03,   7.88062625e-03,
         2.23867915e-04,   3.98492534e-03,   6.13822922e-05,
        -6.72318321e-03,   4.96734073e-03,  -3.84034030e-03,
         2.87004816e-03,  -1.71870121e-03,   5.79001894e-03,
         3.97522235e-03,   4.43495531e-03,  -5.50824311e-03,
        -2.68577086e-03,  -3.39229917e-03,   6.41911116e-04,
        -2.31814804e-03,   2.00447175e-04,   4.32373025e-03,
        -6.01968379e-04,   2.32529570e-03,   1.14866020e-03,
        -4.19715373e-03,   2.72591889e-04,   1.95418997e-03,
         7.05786375e-03,  -2.11957330e-03,   7.19742430e-03,
         1.27897691e-03,   3.30770435e-03,  -3.98037676e-03,
        -4.48795792e-04,  -1.76177057e-03,  -1.51950796e-03,
        -1.81869499e-03,   5.97525109e-03,  -1.91085506e-03,
         4.10973607e-03,   4.42632334e-03,   1.55657617e-04,
        -3.11657554e-03,

## Обучение модели

In [26]:
# Cross-validate a plain linear regression on the train document vectors.
# The scores are negated before display, so the values shown are positive.
reg = LinearRegression()
negated_scores = cross_val_score(reg, C, y, scoring='mean_squared_error')
-negated_scores

array([ 164.4977322 ,  163.2250262 ,  147.03983202])

## Отправка решения

In [24]:
# Fit the final model on all training document vectors; the trailing expression
# displays the fitted estimator.
reg = LinearRegression().fit(C, y)
reg

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [93]:
# Re-read the raw test url rows (tab-separated, no header) and discard the count column.
urls_test_df = pd.read_csv('kaggle_data/url_domain_test', sep='\t', header=None)
urls_test_df.columns = ['id', 'url', 'count']
urls_test_df = urls_test_df.drop('count', axis=1)

In [94]:
# Collapse the test rows into one list of urls per id, then expose the id as a
# regular column and renumber the rows from 0; resulting columns are (urls, id).
urls_by_user = urls_test_df.groupby('id')['url'].apply(lambda urls: urls.tolist())
urls_test_df = urls_by_user.to_frame('urls')
urls_test_df = urls_test_df.assign(id=urls_test_df.index).reset_index(drop=True)

In [95]:
# Preview the regrouped test frame (one row per user id).
urls_test_df.head()

Unnamed: 0,urls,id
0,"[1000bankov.ru, 1tv.ru, 4put.ru, argumenti.ru,...",000000014A02348E701552980349FF01
1,"[autorambler.ru, bilettorg.ru, dsol-druzhba.ru...",000000014A10EA183BF8594A0B2AB201
2,"[photosight.ru, rambler.ru]",000000014A4FE5C33A929D4C26943601
3,"[base.consultant.ru, dogovor-obrazets.ru, fd.r...",000000014B7BB9957784A9BC0AC9F401
4,"[assessor.ru, audit-it.ru, base.garant.ru, com...",000000014C7749F896D82C2B01E8B801


In [None]:
# This cell used to hold an unfinished assignment (`titles_test_df = ` with no value),
# which is a SyntaxError and stops a top-to-bottom run of the notebook. The per-user
# test titles were already built earlier as `title_test_df`, so reuse that frame.
# NOTE(review): the intended right-hand side is a guess — confirm or delete this cell.
titles_test_df = title_test_df

In [18]:
# Predict ages for the test users. `reg` was fitted on `C` (train document vectors),
# so it has to be applied to the matching test matrix `C_test`; `X` holds the raw
# training title lists and is not a feature matrix this model can score.
# Rows of `C_test` follow the row order of `test_df`.
# NOTE(review): the predictions are later attached to `urls_test_df` — verify that it
# lists the same ids in the same order as `test_df`.
y_pred = reg.predict(C_test)

In [19]:
# Display the predicted ages.
y_pred

array([ 42.68803957,  39.76833715,  38.71313275, ...,  35.32366851,
        37.30954375,  49.12761871])

In [20]:
# Attach the predictions as an `age` column; rows are matched purely by position.
urls_test_df = urls_test_df.assign(age=y_pred)

In [21]:
# Keep only the two submission columns and rename the key to `Id`.
urls_test_df = urls_test_df[['id', 'age']].rename(columns={'id': 'Id'})

In [22]:
# Preview the submission rows (Id, predicted age).
urls_test_df.head()

Unnamed: 0,Id,age
0,000000014A02348E701552980349FF01,42.68804
1,000000014A10EA183BF8594A0B2AB201,39.768337
2,000000014A4FE5C33A929D4C26943601,38.713133
3,000000014B7BB9957784A9BC0AC9F401,32.49378
4,000000014C7749F896D82C2B01E8B801,33.155544


In [29]:
# Ids listed in the sample submission but absent from our predictions still need a row.
random_sol = pd.read_csv('random_solution.csv')
miss_idx = set(random_sol.Id.values) - set(urls_test_df.Id.values)
# Build the frame from named columns so it is well-formed even when nothing is missing:
# a frame built from an empty list of tuples has no columns, and renaming its columns
# to two names then raises. The ids are sorted because set iteration order is arbitrary
# and would otherwise shuffle the appended rows between runs.
# NOTE(review): missing users still get the constant age 1.0, as before — consider a
# more realistic fallback such as the mean training age.
miss_df = pd.DataFrame(
    {'Id': sorted(miss_idx), 'age': np.ones(len(miss_idx))},
    columns=['Id', 'age'],
)

In [30]:
# Add the fallback rows below the predicted ones. DataFrame.append was deprecated in
# pandas 1.4 and removed in 2.0; pd.concat is the equivalent call that works on old and
# new pandas alike. ignore_index=True renumbers the combined rows from 0.
urls_test_df = pd.concat([urls_test_df, miss_df], ignore_index=True)

In [31]:
# Write the submission file without the row index.
submission_path = 'solution.csv'
urls_test_df.to_csv(submission_path, index=False)

In [32]:
# Shell check: count the lines of the written submission file.
!wc -l solution.csv

   19980 solution.csv
