In [310]:
import pandas as pd
import sklearn
import re
import numpy as np
import pymorphy2
import pymorphy2_dicts_ru
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import cross_val_score

In [311]:
train = pd.read_csv("linear_train.txt", header = None)
train.columns = ['word', 'ans']


In [312]:
test = pd.read_csv("linear_test.txt", header = None)
test.columns = ['word']


In [313]:
def is_upper_case(row):
    word = str(row['word'])
    if word[0] == word[0].upper():
        return 1.
    else:
        return 0.


ends = set(['ко', 'ук', 'юк', 'ун', 'ний', 'ный', 'чай', 'ий', 'ин', 'ишин', 'джи', 'оглу', 'джи', 'оглу',
        'уа', 'ипа', 'заде', 'ли', 'лы', 'оглу', 'кызы', 'ян', 'янц', 'уни', 'ич', 'ук', 'ик', 'ски', 
        'ев', 'ов', 'огло', 'пулос', 'кос', 'иди', 'швили', 'дзе', 'ури', 'иа', 'уа', 'ава', 'ли', 'си', 'ни', 
        'ини', 'ис', 'ску', 'ан', 'ын', 'шкин', 'кин', 'ин', 'ман', 'ер', 'ски', 'цки',
        'дзки', 'ев', 'ов', 'ских', 'ску', 'ан', 'овая', 'ичу', 'ича', 'ен', 'ес', 'еса', 'вским', 
            'вскому', 'цевой', 'цевая'])

def is_ends(row):
    for end in ends:
        if re.match('\w+' + end +'$', row['word'].lower()):
            return 1.     
    return 0.



In [314]:
morph = pymorphy2.MorphAnalyzer()

def is_nown(row):
    morths = morph.parse(row['word'])
    prob = 0
    l = len(morths)
    for m in morths:
        tag = m.tag
        if 'NOUN' in tag:
            if l == 1:
                return 1
            else:
                prob += m.score
    return prob

def is_surname(row):
    morths = morph.parse(row['word'])
    prob = 0
    l = len(morths)
    for m in morths:
        tag = m.tag
        if 'Surn' in tag:
            if l == 1:
                return 1
            else:
                prob += m.score
    return prob


In [315]:
train['is_upper'] = train.apply(is_upper_case, axis=1)
test['is_upper'] = test.apply(is_upper_case, axis=1)

In [316]:
lower = train.loc[train['is_upper'] == 0]
l = list(lower['word'])
lower_set = set(map(lambda x: x.lower(),l))


def check_in_lower(row):
    if row['word'] in lower_set:
        return 1
    else:
        return 0


train['in_lower'] = train.apply(check_in_lower, axis=1)
test['in_lower'] = test.apply(check_in_lower, axis=1)


In [317]:
train['is_ends'] = train.apply(is_ends, axis=1)
test['is_ends'] = test.apply(is_ends, axis=1)


In [318]:
train['is_sur'] = train.apply(is_surname, axis=1)
test['is_sur'] = test.apply(is_surname, axis=1)


In [319]:
train['is_nown'] = train.apply(is_nown, axis=1)
test['is_nown'] = test.apply(is_nown, axis=1)


In [320]:
roc_auc = sklearn.metrics.make_scorer(roc_auc_score)

In [287]:
from sklearn.model_selection import GridSearchCV

parameters = {"penalty": ('l1',), 'C': np.arange(0.1, 1., 0.1) }
algo =  LogisticRegression()
regress = GridSearchCV(algo, parameters, scoring=roc_auc, n_jobs=-1)
regress.fit(train_X, train_Y)
print(regress.best_estimator_, regress.best_score_)



LogisticRegression(C=0.70000000000000007, class_weight=None, dual=False,
          fit_intercept=True, intercept_scaling=1, max_iter=100,
          multi_class='ovr', n_jobs=1, penalty='l1', random_state=None,
          solver='liblinear', tol=0.0001, verbose=0, warm_start=False) 0.547593037371


In [289]:

parameters = {"penalty": ('l2',), 'solver' : ['newton-cg', 'lbfgs', 'liblinear', 'sag'], 'C': np.arange(0.1, 2., 0.5) }
algo =  LogisticRegression()
regress = GridSearchCV(algo, parameters, scoring=roc_auc, n_jobs=-1)
regress.fit(train_X, train_Y)
print(regress.best_estimator_, regress.best_score_)



LogisticRegression(C=0.10000000000000001, class_weight=None, dual=False,
          fit_intercept=True, intercept_scaling=1, max_iter=100,
          multi_class='ovr', n_jobs=1, penalty='l2', random_state=None,
          solver='newton-cg', tol=0.0001, verbose=0, warm_start=False) 0.5


С is_ends и in_lower было  0.635027557937

In [321]:
def length(row):
    return float(len(row['word']))

train['length'] = train.apply(length, axis=1)
test['length'] = test.apply(length, axis=1)


всё: 0.640223596808
без in_lower: 0.640312091945
без is_upper, in_lower: 0.629210228865
без is_sur,in_lower: 0.510390010954
без is_nown, in_lower: 0.629019440574
без is_ends, in_lower: 0.638600209341


In [322]:
endings = list([dict(), dict(), dict(), dict(), dict()])
correct_ending = list([dict(), dict(), dict(), dict(), dict()])

def count_ending(row):
    st = row['word']
    label = row['ans']
    for i in range(1, min(5, len(st))):
        if st[-i:] not in endings[i].keys():
            endings[i][st[-i:]] = 0
        endings[i][st[-i:]] += 1
        if label == 1:
            if st[-i:] not in correct_ending[i].keys():
                correct_ending[i][st[-i:]] = 0
            correct_ending[i][st[-i:]] += 1
            

for i in train.index:
    count_ending(train.loc[i])

In [323]:
def end_prob(row, i):
    st = row['word']
    if len(st) > i:
        if (st[-i:]) in correct_ending[i].keys():
            if (endings[i][st[-i:]] > 0):
                return float(correct_ending[i][st[-i:]])/ endings[i][st[-i:]]
    return 0
        
for i in range(1, 5):
    train['end_'+str(i)] = train.apply(lambda x: end_prob(x, i), axis=1)
    test['end_' + str(i)] = test.apply(lambda x: end_prob(x, i), axis=1)


In [324]:
amount = len(train.loc[train['ans']==1])
def prob(row , i):
    st = row['word']
    if len(st) > i:
        if (st[-i:]) in correct_ending[i].keys():
            if (endings[i][st[-i:]] > 0):
                return float(correct_ending[i][st[-i:]])/amount
    return 0
        
for i in range(1, 5):
    train['prob_'+str(i)] = train.apply(lambda x: prob(x, i), axis=1)
    test['prob_' + str(i)] = test.apply(lambda x: prob(x, i), axis=1)


In [325]:
train_X = train.drop('word', axis=1)
train_X = train_X.drop('ans', axis=1)
# train_X = train_X.drop('in_lower', axis=1)
# train_X = train_X.drop('is_ends', axis=1)

train_Y = train['ans']

parameters = {"penalty": ('l1',), 'C': np.arange(0.1, 1., 0.1), 'class_weight':('balanced',) }
algo =  LogisticRegression()
regress = GridSearchCV(algo, parameters, scoring=roc_auc, n_jobs=-1)
regress.fit(train_X, train_Y)
print(regress.best_estimator_, regress.best_score_)


LogisticRegression(C=0.10000000000000001, class_weight='balanced', dual=False,
          fit_intercept=True, intercept_scaling=1, max_iter=100,
          multi_class='ovr', n_jobs=1, penalty='l1', random_state=None,
          solver='liblinear', tol=0.0001, verbose=0, warm_start=False) 0.873676946358


In [329]:
train_X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 101408 entries, 0 to 101407
Data columns (total 14 columns):
is_upper    101408 non-null float64
in_lower    101408 non-null int64
is_ends     101408 non-null float64
is_sur      101408 non-null float64
is_nown     101408 non-null float64
length      101408 non-null float64
end_1       101408 non-null float64
end_2       101408 non-null float64
end_3       101408 non-null float64
end_4       101408 non-null float64
prob_1      101408 non-null float64
prob_2      101408 non-null float64
prob_3      101408 non-null float64
prob_4      101408 non-null float64
dtypes: float64(13), int64(1)
memory usage: 10.8 MB


In [335]:
test_X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 188920 entries, 0 to 188919
Data columns (total 14 columns):
is_upper    188920 non-null float64
in_lower    188920 non-null int64
is_ends     188920 non-null float64
is_sur      188920 non-null float64
is_nown     188920 non-null float64
length      188920 non-null float64
end_1       188920 non-null float64
end_2       188920 non-null float64
end_3       188920 non-null float64
end_4       188920 non-null float64
prob_1      188920 non-null float64
prob_2      188920 non-null float64
prob_3      188920 non-null float64
prob_4      188920 non-null float64
dtypes: float64(13), int64(1)
memory usage: 20.2 MB


In [338]:
print(regress.best_score_)
test_X = test.drop('word', axis=1)
# test_X = test_X.drop('ans', axis=1)

model = regress.best_estimator_
preds = model.predict(test_X.values)
test['answer'] = preds

test['answer'].to_csv("my_submission_4.tsv", sep=',', index=True)

0.873676946358


In [286]:
train_X = train.drop('word', axis=1)
train_X = train_X.drop('ans', axis=1)
# train_X = train_X.drop('in_lower', axis=1)
train_Y = train['ans']

parameters = {"penalty": ('l1',), 'C': (0.9,) }
algo =  LogisticRegression()
regress = GridSearchCV(algo, parameters, scoring=roc_auc, n_jobs=-1)
regress.fit(train_X, train_Y)
print(regress.best_estimator_, regress.best_score_)


LogisticRegression(C=0.9, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l1', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False) 0.54704441952


всё:0.77

без всего, только prob: 0.54

c end_1, end_2, end_3: 0.71682928529

без end_1: 0.716729782985

без end_3: 0.664866548441


In [294]:
from sklearn.svm import SVC, LinearSVC


In [296]:
train_X = train.drop('word', axis=1)
train_X = train_X.drop('ans', axis=1)
# train_X = train_X.drop('in_lower', axis=1)

train_Y = train['ans']

parameters = {"penalty": ('l2',), 'loss' : ['hinge', 'squared_hinge'], 'C': np.arange(0.1, 2., 0.5) }
algo =  LinearSVC()
regress = GridSearchCV(algo, parameters, scoring=roc_auc, n_jobs=-1)
regress.fit(train_X, train_Y)
print(regress.best_estimator_, regress.best_score_)


LinearSVC(C=1.1000000000000001, class_weight=None, dual=True,
     fit_intercept=True, intercept_scaling=1, loss='hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0) 0.751338867848


In [298]:
train_X = train.drop('word', axis=1)
train_X = train_X.drop('ans', axis=1)
# train_X = train_X.drop('in_lower', axis=1)

train_Y = train['ans']

parameters = {"penalty": ('l2',), 'loss' : ['hinge', 'squared_hinge'], 'C': np.arange(0.1, 2., 0.1) }
algo =  LinearSVC()
regress = GridSearchCV(algo, parameters, scoring=roc_auc, n_jobs=-1)
regress.fit(train_X, train_Y)
print(regress.best_estimator_, regress.best_score_)


LinearSVC(C=1.2000000000000002, class_weight=None, dual=True,
     fit_intercept=True, intercept_scaling=1, loss='hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0) 0.752788212323


In [299]:
train_X = train.drop('word', axis=1)
train_X = train_X.drop('ans', axis=1)
# train_X = train_X.drop('in_lower', axis=1)

train_Y = train['ans']

parameters = { 'C': np.arange(0.1, 2., 0.5)}
algo =  SVC()
regress = GridSearchCV(algo, parameters, scoring=roc_auc, n_jobs=-1)
regress.fit(train_X, train_Y)
print(regress.best_estimator_, regress.best_score_)


SVC(C=1.6000000000000001, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape=None, degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False) 0.765199108514


In [305]:
test_X = test.drop('word', axis=1)
test_X = test_X.drop('ans', axis=1)

test_X = test_X.drop('is_ends', axis=1)
test_X = test_X.drop('in_lower', axis=1)
test_X = test_X.drop('is_sur', axis=1)
test_X = test_X.drop('is_nown', axis=1)

model = regress.best_estimator_
preds = model.predict(test_X.values)
test_X['ans'] = preds

test['ans'].to_csv("my_submission_3.tsv", sep=',', index=True)