In [1]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.cross_validation import cross_val_score
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn import manifold, decomposition, linear_model, ensemble, neighbors, cross_validation
%matplotlib inline
import seaborn as sns
import sklearn
import matplotlib.pyplot as plt

from heamy.dataset import Dataset
from heamy.estimator import Regressor, Classifier
from heamy.pipeline import ModelsPipeline



In [2]:
def rmse(x, y):
    """Root-mean-square error between two equally sized array-likes.

    Both inputs are converted to float NumPy arrays before subtracting, so
    plain Python lists are accepted, small/unsigned integer dtypes cannot
    wrap around in ``x - y``, and pandas objects are compared element by
    element in positional order (not re-aligned by index label).

    Parameters
    ----------
    x, y : array-like of numbers
        Observed and predicted values; the metric is symmetric in them.

    Returns
    -------
    float
        ``sqrt(mean((x - y) ** 2))``.
    """
    x = np.asarray(x, dtype=float)
    y = np.asarray(y, dtype=float)
    return np.mean((x - y) ** 2) ** 0.5

## Подготовка данных (urls)

In [3]:
# Directory prefix prepended to the data file names read and written by this notebook.
data_prefix = "./data/"

In [4]:
# Read the header-less, tab-separated (id, url, count) table of the training
# users and keep only the id/url pair.
train_urls_path = data_prefix + 'url_domain_train'
urls_train_df = pd.read_csv(train_urls_path, header=None, sep='\t')
urls_train_df.columns = ['id', 'url', 'count']
urls_train_df = urls_train_df.loc[:, ['id', 'url']]

In [5]:
# Preview the first rows of the raw (id, url) pairs.
urls_train_df.head()

Unnamed: 0,id,url
0,000000014B60815F65B38258011B6C01,login.rutracker.org
1,000000014B60815F65B38258011B6C01,rutracker.org
2,000000014C03DA2A47AC433A0C755201,admin.tour-spb.net
3,000000014C03DA2A47AC433A0C755201,czinfo.ru
4,000000014C03DA2A47AC433A0C755201,forumsostav.ru


In [6]:
# Collapse the (id, url) rows into one row per user that holds the list of
# that user's URLs, then move the user id from the index into a column and
# renumber the rows 0..n-1.
# Note: this rebinds urls_train_df, so the cell cannot be re-run on its own
# output (the 'url' column no longer exists afterwards).
urls_by_user = urls_train_df.groupby('id')['url'].apply(lambda visited: visited.tolist())
urls_train_df = pd.DataFrame(urls_by_user)
urls_train_df['id'] = urls_train_df.index
urls_train_df.index = range(len(urls_train_df))
urls_train_df.columns = ['urls', 'id']

In [7]:
# Preview the per-user URL lists.
urls_train_df.head()

Unnamed: 0,urls,id
0,"[id.rambler.ru, mail.rambler.ru, r0.ru]",000000013CB5719C0000A2C90002C101
1,"[1prime.ru, autorambler.ru, chellak.ru, docs.c...",00000001442BE24000001B7D00F50801
2,[bosch-korolev.ru],00000001448580F800003F1B31FB0901
3,"[aptekanizkihcen.ua, colady.ru, gorod.dp.ua, i...",0000000145BDB2FF000157971645E901
4,"[astrorok.ru, diets.ru, edaplus.info, eshzdoro...",000000014602771F0000DB9359714C01


In [8]:
# Read the header-less, tab-separated table mapping each training user id to an age.
age_path = data_prefix + 'age_profile_train'
age_train_df = pd.read_csv(age_path, header=None, sep='\t')
age_train_df.columns = ['id', 'age']

In [9]:
# Preview the (id, age) table.
age_train_df.head()

Unnamed: 0,id,age
0,000000013CB5719C0000A2C90002C101,53
1,00000001442BE24000001B7D00F50801,48
2,00000001448580F800003F1B31FB0901,28
3,0000000145BDB2FF000157971645E901,44
4,000000014602771F0000DB9359714C01,48


In [10]:
# Mean age over the training users; used later as the fallback prediction for
# ids that are missing from the URL test table.
# print(...) with a single argument behaves the same on Python 2 and also
# runs on Python 3, unlike the bare print statement.
average_age = age_train_df['age'].mean()
print(average_age)

36.0434702011


In [11]:
# Attach the age to each user's URL list. The left join keeps every user of
# urls_train_df; a user without a matching age row would get a NaN age.
train_df = pd.merge(urls_train_df, age_train_df, how='left', on='id')

In [12]:
# Preview the merged training table (urls, id, age).
train_df.head()

Unnamed: 0,urls,id,age
0,"[id.rambler.ru, mail.rambler.ru, r0.ru]",000000013CB5719C0000A2C90002C101,53
1,"[1prime.ru, autorambler.ru, chellak.ru, docs.c...",00000001442BE24000001B7D00F50801,48
2,[bosch-korolev.ru],00000001448580F800003F1B31FB0901,28
3,"[aptekanizkihcen.ua, colady.ru, gorod.dp.ua, i...",0000000145BDB2FF000157971645E901,44
4,"[astrorok.ru, diets.ru, edaplus.info, eshzdoro...",000000014602771F0000DB9359714C01,48


## Снижение размерности

In [13]:
# Number of training users and the size of a 2/3 "head" split.
# print(...) with a single argument is valid on both Python 2 and Python 3.
n_users = len(train_df.urls)
print(n_users)
split_koff = 2. / 3.
topk = int(n_users * split_koff)
print(topk)

118603
79068


In [14]:

# Features are the per-user URL lists, the target is the age column.
X = train_df.urls
y = train_df.age

# Randomly hold out one third of the row positions for evaluation; the fixed
# random_state makes the split repeatable. (This replaces an earlier,
# now-removed sequential split of the first `topk` rows.)
row_positions = range(len(X))
itrain, itest = cross_validation.train_test_split(row_positions, test_size=1./3, random_state=0)
len(itrain), len(itest)

(79068, 39535)

In [15]:
def hwTransform(X, vectorizer=None):
    """Turn per-user URL lists into a dense feature matrix.

    Each user's list of URLs is joined into one space-separated string and
    the resulting documents are passed to ``vectorizer.transform``; the
    sparse result is converted with ``.todense()``.

    Parameters
    ----------
    X : iterable of iterables of str
        One collection of URLs per user.
    vectorizer : object with a ``transform`` method, optional
        Vectorizer to apply. When omitted, the notebook-level ``hw`` is used,
        which must already be defined at call time (original behaviour).

    Returns
    -------
    Whatever ``vectorizer.transform(...).todense()`` returns, one row per user.
    """
    if vectorizer is None:
        # Fall back to the vectorizer created in a later notebook cell.
        vectorizer = hw
    documents = [' '.join(urls) for urls in X]
    return vectorizer.transform(documents).todense()

In [16]:
%%time

# Hash every user's URL "document" into 1000 non-negative features.
# The input is read from train_df.urls (what X was bound to before this cell)
# instead of from X itself: the original `X = hwTransform(X)` overwrote its own
# input, so running the cell a second time operated on the already-transformed
# matrix.
# NOTE(review): `non_negative` was deprecated and later removed in newer
# scikit-learn releases — check the installed version before upgrading.
hw = HashingVectorizer(n_features=1000, non_negative=True).fit(train_df.urls)
X = hwTransform(train_df.urls)

CPU times: user 12.5 s, sys: 352 ms, total: 12.9 s
Wall time: 12.8 s


In [17]:
%%time

# Reduce the hashed features to 100 non-negative components with NMF.
# The saved timing below shows this took about 24 minutes of wall time.
nmf_components = 100
transformer = decomposition.NMF(n_components=nmf_components)
X_nmf = transformer.fit_transform(X)

CPU times: user 1h 2min 23s, sys: 5min 27s, total: 1h 7min 51s
Wall time: 24min 21s


## Обучение модели

In [24]:
from sklearn.neighbors import KNeighborsRegressor

# Baseline: k-nearest-neighbours regressor with default settings, scored by
# cross-validation on the training part only. The score is negated for
# display, as in the original cell.
reg = KNeighborsRegressor()
cv_scores = cross_val_score(reg, X[itrain, :], y[itrain], scoring='mean_squared_error')
-cv_scores

  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)


array([ 187.01915313,  186.2865154 ,  208.7627379 ])

In [25]:
# Fit on the training rows and report RMSE on the held-out rows.
# print(...) with a single argument is valid on both Python 2 and Python 3.
reg.fit(X[itrain, :], y[itrain])
holdout_pred = reg.predict(X[itest, :])
print(rmse(y[itest], holdout_pred))

13.6396937458


## Отправка решения

In [34]:
# Disabled alternative: refit a plain LinearRegression on all of X / y before
# building the submission. While these lines stay commented out, the `reg`
# used below is whatever an earlier cell left in the kernel.
# NOTE(review): the saved predictions further down are not multiples of 0.2,
# which a default 5-neighbour average of integer ages would produce, so they
# were presumably generated with this cell enabled — confirm and re-run.
#reg = LinearRegression()
#reg.fit(X, y)

In [35]:
# Read the header-less, tab-separated (id, url, count) table of the test
# users and keep only the id/url pair.
test_urls_path = data_prefix + 'url_domain_test'
urls_test_df = pd.read_csv(test_urls_path, header=None, sep='\t')
urls_test_df.columns = ['id', 'url', 'count']
urls_test_df = urls_test_df.loc[:, ['id', 'url']]

In [36]:
# One row per test user holding the list of that user's URLs; the id is moved
# from the index into a column and the rows are renumbered 0..n-1.
# Note: this rebinds urls_test_df, so the cell cannot be re-run on its own
# output (the 'url' column no longer exists afterwards).
test_urls_by_user = urls_test_df.groupby('id')['url'].apply(lambda visited: visited.tolist())
urls_test_df = pd.DataFrame(test_urls_by_user)
urls_test_df['id'] = urls_test_df.index
urls_test_df.index = range(len(urls_test_df))
urls_test_df.columns = ['urls', 'id']

In [37]:
# Preview the per-user URL lists of the test set.
urls_test_df.head()

Unnamed: 0,urls,id
0,"[1000bankov.ru, 1tv.ru, 4put.ru, argumenti.ru,...",000000014A02348E701552980349FF01
1,"[autorambler.ru, bilettorg.ru, dsol-druzhba.ru...",000000014A10EA183BF8594A0B2AB201
2,"[photosight.ru, rambler.ru]",000000014A4FE5C33A929D4C26943601
3,"[base.consultant.ru, dogovor-obrazets.ru, fd.r...",000000014B7BB9957784A9BC0AC9F401
4,"[assessor.ru, audit-it.ru, base.garant.ru, com...",000000014C7749F896D82C2B01E8B801


In [38]:
# Vectorise the test users' URL lists with the same hwTransform used for the
# training data.
X_run = hwTransform(urls_test_df.urls.values)

In [39]:
# Predict an age for every test user that has URL data, using the current `reg`.
y_pred = reg.predict(X_run)

In [40]:
# Display the predicted ages.
y_pred

array([ 40.31803022,  41.19337166,  45.74825277, ...,  36.54111604,
        36.60077867,  47.69017506])

In [41]:
# Store the predictions next to the ids; y_pred rows follow the row order of urls_test_df.
urls_test_df['age'] = y_pred

In [42]:
# Keep only the two submission columns and rename 'id' to 'Id', the column
# name that is compared against the sample submission below.
id_and_age = urls_test_df[['id', 'age']]
urls_test_df = id_and_age.rename(columns={'id': 'Id'})

In [43]:
# Preview the (Id, age) predictions.
urls_test_df.head()

Unnamed: 0,Id,age
0,000000014A02348E701552980349FF01,40.31803
1,000000014A10EA183BF8594A0B2AB201,41.193372
2,000000014A4FE5C33A929D4C26943601,45.748253
3,000000014B7BB9957784A9BC0AC9F401,31.050921
4,000000014C7749F896D82C2B01E8B801,30.572697


In [44]:
# Ids that the sample submission lists but for which no prediction exists
# (they have no row in urls_test_df).
random_sol = pd.read_csv(data_prefix + 'sample_submission.csv')
miss_idx = set(random_sol.Id.values) - set(urls_test_df.Id.values)

# Placeholder frame with age 1.0 for every missing id. It is built from named
# columns instead of zip(): that also works when no id is missing (the
# zip-based frame then has no columns to rename) and does not rely on zip()
# returning a list.
miss_df = pd.DataFrame({'Id': list(miss_idx), 'age': np.ones(len(miss_idx))},
                       columns=['Id', 'age'])

In [45]:
# Every id without a prediction gets the mean training age.
# One column assignment replaces the per-row chained indexing
# (miss_df['age'][i] = ...), which raised SettingWithCopyWarning.
miss_df['age'] = average_age

# Show one filled value as a sanity check; skipped when no ids are missing.
if len(miss_df):
    print(miss_df['age'][0])

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  from ipykernel import kernelapp as app


36.0434702011


In [46]:
# Add the fallback rows below the model predictions and renumber the index.
# pd.concat is used because DataFrame.append is deprecated and no longer
# exists in newer pandas releases.
urls_test_df = pd.concat([urls_test_df, miss_df], ignore_index=True)

In [47]:
# Write the final (Id, age) table as the submission file, without the row index.
urls_test_df.to_csv(data_prefix + 'mysolution.csv', index=False)

In [48]:
!wc -l ./data/mysolution.csv

19980 ./data/mysolution.csv


## Blending с помощью heamy

In [18]:
from sklearn import pipeline, preprocessing, feature_extraction

In [19]:
%%time

# NOTE(review): the saved output of this cell is "IOError: could not get
# source code", the message raised by inspect.getsource. Presumably heamy
# tries to read the source of the function estimator `tfidf_lr_`, which is
# defined inside the notebook — confirm against the heamy version in use.

# Plain dataset: hashed URL features (train rows, train target, held-out rows)
dataset1 = Dataset(X[itrain, :], y[itrain], X[itest, :])
# Same split on the NMF-reduced features
dataset2 = Dataset(X_nmf[itrain, :], y[itrain], X_nmf[itest, :])

# Linear model on the plain data
lr = Regressor(dataset=dataset1, 
                     estimator=linear_model.LinearRegression,
                     parameters={'normalize': True},
                     name='lr')

# Linear model on TF-IDF of the plain data
def tfidf_lr_(X_train, y_train, X_test, y_test=None, random_state=8888):
    """Fit a TfidfTransformer + LinearRegression pipeline and predict X_test.

    y_test and random_state are accepted but not used in the body; presumably
    they are part of the signature heamy expects from function estimators —
    confirm against the heamy documentation.
    """
    model = pipeline.Pipeline([('tfidf', feature_extraction.text.TfidfTransformer()), 
                                ('linear_model', linear_model.LinearRegression())])
    model.fit(X_train, y_train)
    return model.predict(X_test)

tfidf_lr = Regressor(dataset=dataset1,
                     estimator=tfidf_lr_,
                     name='tfidf_lr')

# ExtraTrees on the NMF features (the variable is named `rf`, but the
# estimator is ExtraTreesRegressor)
rf = Regressor(dataset=dataset2, 
                     estimator=ensemble.ExtraTreesRegressor,
                     parameters={'n_estimators': 100, 'n_jobs': -1},
                     name='rf')

# Combine the three first-level models with blend(proportion=0.2, seed=111).
# NOTE(review): the original comment spoke of stacking two models and
# out-of-fold predictions; blend() presumably uses a 20% holdout instead — confirm.
meta_pipeline = ModelsPipeline(lr, tfidf_lr, rf)
stack_ds = meta_pipeline.blend(proportion=0.2,seed=111)

# Train LinearRegression on the blended dataset (second stage)
stacker = Regressor(dataset=stack_ds, estimator=LinearRegression)
results = stacker.predict()

# RMSE of the second-stage predictions against the held-out ages
print rmse(y[itest], results)

IOError: could not get source code