In [162]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import scipy.stats as sps
plt.style.use('ggplot')
from sympy import *
import copy
from matplotlib import cm
from scipy import sparse
import scipy.sparse as sprs
from sklearn.svm import SVC
from scipy.sparse.csr import csr_matrix
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import make_scorer
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import Lasso, LogisticRegression
from sklearn.metrics import roc_auc_score
from sklearn import cross_validation
from sklearn.ensemble import BaggingClassifier, AdaBoostClassifier, RandomForestClassifier
from joblib import Parallel, delayed
import multiprocessing
from sklearn.model_selection import GridSearchCV
%matplotlib inline

In [22]:
row_train = pd.read_csv('linear_train.txt',header=-1)
row_test = pd.read_csv('linear_test.txt',header=-1)

In [23]:
row_train.head()

Unnamed: 0,0,1
0,Аалтонен,1
1,Аар,0
2,Аарон,0
3,ААРОН,0
4,Аарона,0


In [24]:
row_test.head()

Unnamed: 0,0
0,Аалто
1,ААР
2,Аара
3,Ааре
4,Аарон


In [47]:
def add_column(frame, data, names):
    new_fr = pd.DataFrame(data, columns=names)
    return pd.concat((frame, new_fr),axis=1)

def gen_suffixes(frame, slen):
    suff = set()
    for w in frame[0]:
        for l in range(1,slen+1):
            suff.add(w[-l:])
    return suff

def add_strs(frame,suffixes=None):
    if suffixes is None:
        suffixes = gen_suffixes(frame)
#     for suff in suffixes:
    suffixes_set = suffixes
    suffixes = list(suffixes)
    data = np.zeros((len(frame[0]),len(suffixes)))
    numb = {suffixes[i]:i for i in range(len(suffixes))}
    strs = frame[0]
    for i in range(len(strs)):
        for k in range(len(strs[i])):
            for j in range(2):
                suff = strs[i][k:]
                if j > 0 and len(suff) > 1:
                    suff = suff[:-1]
                if suff in suffixes_set:
                    cur = numb[suff]
                    data[i][cur] = 1
    frame = add_column(frame, data, suffixes)
    frame = frame.drop([0],axis=1)
    if 1 in frame.columns:
        frame = frame.drop([1],axis=1)
    return sprs.coo_matrix(frame), suffixes_set

In [26]:
def add_custom(frame, f, name):
    data = np.array([f(x) for x in frame[0]]).reshape((len(frame[0]),1))
    return add_column(frame, data, [name])

def normalize(frame, name):
    frame[name] = (np.array(frame[name]) - np.mean(frame[name]))/np.std(frame[name])
    return frame

def clear(train, test):
    lenn = len(train[0])
    for c in test.columns:
        if c != 0:
            summ = np.sum(np.array(train[c]))
            if summ == lenn or summ == 0:
                train = train.drop([c], axis=1)
                test = test.drop([c],axis=1)
    return train, test

def normalize_all(train, test):
    for c in test.columns:
        if c != 0:
            arr = np.array(train[c])
            m = np.mean(arr)
            s = np.std(arr)
            train[c] = (train[c] - m)/s
            test[c] = (test[c] - m)/s
    return train, test

In [7]:
%%time
for i in range(len(row_train[0])):
    s = row_train[0][i]
    if row_train[1][i] == 1:
        if s[-1] == 'а' and len(s) > 1:
            row_train.loc[len(row_train[0])]=([s[:-1],1])

CPU times: user 32.3 s, sys: 9.1 s, total: 41.4 s
Wall time: 42 s


In [212]:
row_train.tail()

Unnamed: 0,0,1
101403,Ёлкин,1
101404,ёлкой,0
101405,ёлок,0
101406,ёлочкой,0
101407,ёмкость,0


In [156]:
def get_suff(train, maxlen=5):
    td = dict()
    idf = dict()
    for i in range(len(train[0])):
        s = train[0][i]
        for l in range(1,maxlen):
            if len(s) < l:
                continue
            for j in [0,1]:
                suffx = s[-l:]
                if j == 1 and len(suffx) > 1:
                    suffx = suffx[:-1]

                if suffx[0].isupper() and not suffx.isupper():
                    continue
                if suffx not in td:
                    td[suffx] = 0
                    idf[suffx] = 0
                idf[suffx] += 1
                if train[1][i] == 1:
                    td[suffx] += 1

    tdidf = []
    s1 =  np.sum(train[1])
    s2 = len(train[1])
    alpha = (s2-s1)/s1
    for key in td:
        tdidf.append(((td[key] - alpha*(idf[key] - td[key])) * np.log(len(train[0])/idf[key]),key))
#         tdidf.append((td[key] * np.log(len(train[0])/idf[key]),key))
    tdidf = sorted(tdidf)
    return [w[1] for w in tdidf[-2000:]]

In [220]:
def add_count_vectorizer_features(train, test):
    vectorizer = CountVectorizer(min_df=2, max_df=.9,
                             max_features=None,
                             ngram_range=(2, 10),
                             lowercase=False,
                             analyzer='char_wb', 
                             binary=True)
    TTrainM = vectorizer.fit_transform(train[0])
    TTestM = vectorizer.transform(test[0])
    return TTrainM, TTestM

def add_tdidf_vectorizer_features(train, test):
    vectorizer = TfidfVectorizer(min_df=2, max_df=.9,
                             max_features=None,
                             ngram_range=(2, 10),
                             lowercase=False,
                             analyzer='char_wb', 
                             binary=True)
    TTrainM = vectorizer.fit_transform(train[0])
    TTestM = vectorizer.transform(test[0])
    return TTrainM, TTestM

In [221]:
def add_custom_features(train, test):
    Train = add_custom(train, (lambda s: len(s)),'_len')
    Test = add_custom(test, (lambda s: len(s)),'_len')
#     Train = add_custom(Train, (lambda s: s[0].isupper()), '0_upper')
#     Test = add_custom(Test, (lambda s: s[0].isupper()), '0_upper')
#     Train = add_custom(Train, (lambda s: s[1:].islower()), '1_lower')
#     Test = add_custom(Test, (lambda s: s[1:].islower()), '1_lower')
#     Train = add_custom(Train, (lambda s: s[0].isupper() and s[1:].islower()), 'up_low')
#     Test = add_custom(Test, (lambda s: s[0].isupper() and s[1:].islower()), 'up_low')
#     Train = add_custom(Train, (lambda s: not s.isalpha()), 'trash')
#     Test = add_custom(Test, (lambda s: not s.isalpha()), 'trash')
    Train = Train.drop([0,1],axis=1)
    Test = Test.drop([0],axis=1)
    if 1 in Test.columns:
        Test = Test.drop([1],axis=1)
    return np.array(Train,dtype=float), np.array(Test,dtype=float)

In [222]:
from joblib import Parallel, delayed
import multiprocessing

In [223]:
def gen_features_frames(train, test):
    train_vect, test_vect = add_count_vectorizer_features(train, test)
    train_my_f, test_my_f = add_custom_features(train, test)
#     train_tdidf, test_tdidf = add_tdidf_vectorizer_features(train, test)
#     best = get_suff(train,8)
#     train_suff,dum = add_strs(train,suffixes=best)
#     test_suff,dum = add_strs(test, suffixes=best)
    Train = sprs.hstack([train_vect, sprs.coo_matrix(train_my_f)])
    Test = sprs.hstack([test_vect,  sprs.coo_matrix(test_my_f)])
#     Train = sprs.hstack([train_vect, train_tdidf, sprs.coo_matrix(train_my_f), train_suff])
#     Test = sprs.hstack([test_vect, test_tdidf, sprs.coo_matrix(test_my_f), test_suff])
    
#     Train, Test = train_vect, test_vect
    return Train, Test

def do(train_indices, test_indices, Train, Y,clf,rep=1):
        Train_train = Train.loc[train_indices]
        Train_test = Train.loc[test_indices]
        Train_train.index = np.arange(len(train_indices))
        Train_test.index = np.arange(len(test_indices))
        x_train, x_test = gen_features_frames(Train_train, Train_test)
        y_train = Y[train_indices]
        y_test = Y[test_indices]
        y_pred = np.zeros(len(y_test))
        for t in range(rep):
            clf.fit(sprs.coo_matrix(x_train),y_train)
            y_pred += np.log(np.array(clf.predict_proba(sprs.coo_matrix(x_test))[:,1]))
#         y_pred = [y if y > 1e-5 else 0 for y in y_pred]
#         y_pred = np.exp(y_pred/rep)
        return roc_auc_score(y_test, y_pred)

def cross_val(Train,  clf, folds=4, jobs=1):
    score = []
    Y = Train[1]
    score = Parallel(n_jobs=jobs)(delayed(do)(train_indices, test_indices, Train, Y,clf)
                               for train_indices, test_indices 
                               in cross_validation.KFold(len(Y), n_folds = folds))
    return [np.mean(score), np.std(score)]

In [225]:
clfs = [
        LogisticRegression(penalty='l1',
                    max_iter=1000,
                    C=10,
                    class_weight='balanced',
                    verbose=True),
#     RandomForestClassifier(max_depth=4,n_estimators=10,n_jobs=1,verbose=True)
#     SVC(class_weight='balanced',kernel='poly',degree=4,coef0=1,gamma=1)
]

In [224]:
%%time


ans = []
for clf in clfs:
    ans.append(cross_val(row_train, clf, 3,3))
    print(ans[-1])



[0.84712799162819896, 0.0025336517000350703]
CPU times: user 232 ms, sys: 87.5 ms, total: 319 ms
Wall time: 42.6 s


In [None]:
[0.86820289706981313, 0.0052956574159349831] - difficult tdidf
[0.86792215132234762, 0.0059266876611548044] - easy tdidf


In [227]:
def calc_score(Train,Test,clf,rep=1):
    Y = Train[1]
    x_train, x_test = gen_features_frames(Train, Test)
    y_pred = np.zeros(len(Test[0]))
    for t in range(rep):
        clf.fit(sprs.coo_matrix(x_train),Y)
        y_pred += np.log(np.array(clf.predict_proba(sprs.coo_matrix(x_test))[:,1]))
    y_pred = np.exp(y_pred/rep)
    return y_pred

In [228]:
%%time
clf = LogisticRegression(penalty='l1',
                    max_iter=1000,
                    C=10,
                    class_weight='balanced',
                    verbose=True)

y_pred = calc_score(row_train, row_test, clf)

[LibLinear]CPU times: user 1min 31s, sys: 1.86 s, total: 1min 33s
Wall time: 1min 34s


In [208]:
def save_ans(_y_test, filename):
    print(_y_test)
    names = np.arange(len(_y_test))
    ans = pd.DataFrame(data=np.matrix(_y_test).T, columns=['Answer'])
    ans = pd.concat((pd.DataFrame(names,columns=['Id']),ans),axis=1)
    ans.to_csv(filename,index=None)

def predict(clf,file,X,Y,_test,times=1):
    pred = np.zeros(len(row_test[0]))
    for t in range(times):
        clf.fit(X,Y)
        pred += clf.predict_proba(_test)[:,1]
    pred /= times
    save_ans(pred, file)

In [229]:
save_ans(y_pred, 'contest1_full_mix2')

[  2.71748855e-01   3.44896920e-01   5.99139692e-02 ...,   5.41676973e-04
   3.39194869e-07   8.35974154e-07]


In [15]:
files = [ 'countVectorizer1'
        ]

for f in files:
    data = np.array(pd.read_csv(f)['Answer'])
for i in range(len(data)):
    if data[i] < 0.1:
        data[i] = 0
    if data[i] > 0.99:
        data[i]  = 1
    
save_ans(data, 'countVectorizer1_cliped0_1_and_0_99')

[ 0.27163323  0.18229266  0.         ...,  0.          0.          0.        ]


In [69]:
print(data)

[array([ 0.22736869,  0.10822005,  0.18498921, ...,  0.12425834,
        0.0267048 ,  0.0267048 ]), array([ 0.31276735,  0.16688845,  0.31374668, ...,  0.11295054,
        0.00637569,  0.01141766])]


In [80]:
a = np.array([[1,2],[3,4]])
a[:,1]

array([2, 4])