In [1]:
import numpy as np
import pandas as pd
from sklearn.metrics import auc
from sklearn.metrics import roc_auc_score
from sklearn.metrics import scorer
from collections import Counter
import xgboost
import matplotlib.pyplot as plt
from sklearn import cross_validation, grid_search 
from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
import seaborn as sns
from sklearn.cross_validation import StratifiedKFold
from sklearn.cross_validation import StratifiedShuffleSplit
from sklearn.cross_validation import cross_val_score
from tqdm import tqdm
from sklearn.utils import shuffle
from gensim.models import word2vec
import warnings
warnings.filterwarnings('ignore')
%matplotlib inline



In [2]:
data = pd.read_csv("linear_train.txt", header=None)
data_test = pd.read_csv("linear_test.txt", header=None)
data = data.rename(index=str, columns={0: "word", 1: "is_surname"})
data_test = data_test.rename(index=str, columns={0: "word"})
print(data.shape)
print(data_test.shape)

(101408, 2)
(188920, 1)


In [3]:
letters1 = ['а', 'о', 'и', 'е', 'ё', 'э', 'ы', 'у', 'ю', 'я']
letters2 = ['б', 'в', 'г', 'д', 'ж', 'з', 'й', 'к', 'л', 'м', 'н', 'п', 'р', 'с', 'т', 'ф', 'х', 'ц', 'ч', 'ш', 'щ']

In [4]:
data["is upper"] = [int(data["word"][i].upper() == data["word"][i]) for i in range(len(data["word"]))]
data["starts with upper"] = [int(data["word"][i][0] >= 'А' and data["word"][i][0] <= 'Я') 
                             for i in range(len(data["word"]))]
data["ends with l1"] = [int(data["word"][i][-1].lower() in letters1) for i in range(len(data["word"]))]
data.head(10)

Unnamed: 0,word,is_surname,is upper,starts with upper,ends with l1
0,Аалтонен,1,0,1,0
1,Аар,0,0,1,0
2,Аарон,0,0,1,0
3,ААРОН,0,1,1,0
4,Аарона,0,0,1,1
5,Аарона,1,0,1,1
6,Аароне,0,0,1,1
7,Ааронов,0,0,1,0
8,Аахена,0,0,1,1
9,Абабков,1,0,1,0


In [5]:
data_test["is upper"] = [int(data_test["word"][i].upper() == data_test["word"][i]) for i in range(len(data_test["word"]))]
data_test["starts with upper"] = [int(data_test["word"][i][0] >= 'А' and data_test["word"][i][0] <= 'Я') 
                             for i in range(len(data_test["word"]))]
data_test["ends with l1"] = [int(data_test["word"][i][-1].lower() in letters1) for i in range(len(data_test["word"]))]
data_test.head(10)

Unnamed: 0,word,is upper,starts with upper,ends with l1
0,Аалто,0,1,1
1,ААР,1,1,0
2,Аара,0,1,1
3,Ааре,0,1,1
4,Аарон,0,1,0
5,Аароне,0,1,1
6,Ааронов,0,1,0
7,Аароном,0,1,0
8,Аароном,0,1,0
9,Аарону,0,1,1


In [102]:
number_of_slogov = [0 for i in range(data.shape[0])]
for i in tqdm(range(len(data["word"]))):
    for l in letters1:
        if (l in data["word"][i].lower()):
            number_of_slogov[i] += 1
data["slogi"] = number_of_slogov
data["slogi_2"] = np.array(number_of_slogov) ** 2

number_of_slogov = [0 for i in range(data_test.shape[0])]
for i in tqdm(range(len(data_test["word"]))):
    for l in letters1:
        if (l in data_test["word"][i].lower()):
            number_of_slogov[i] += 1
data_test["slogi"] = number_of_slogov
data_test["slogi_2"] = np.array(number_of_slogov) ** 2

100%|██████████| 101408/101408 [00:17<00:00, 5864.64it/s]
100%|██████████| 188920/188920 [00:33<00:00, 5706.38it/s]


In [8]:
for l1 in tqdm(letters1):
    for l2 in letters1:
        pairs = [int(l1 + l2 in data["word"][i].lower()) for i in range(data.shape[0])]
        pairs_test = [int(l1 + l2 in data_test["word"][i].lower()) for i in range(data_test.shape[0])]
        if (sum(pairs) > 120):
            data[l1+l2] = pairs
            data_test[l1+l2] = pairs_test

100%|██████████| 10/10 [08:10<00:00, 48.76s/it]


In [9]:
for l1 in tqdm(letters1):
    for l2 in letters2:
        pairs = [int(l1 + l2 in data["word"][i].lower()) for i in range(data.shape[0])]
        pairs_test = [int(l1 + l2 in data_test["word"][i].lower()) for i in range(data_test.shape[0])]
        if (sum(pairs) > 120):
            data[l1+l2] = pairs
            data_test[l1+l2] = pairs_test

100%|██████████| 10/10 [17:09<00:00, 102.60s/it]


In [10]:
for l1 in tqdm(letters2):
    for l2 in letters1:
        pairs = [int(l1 + l2 in data["word"][i].lower()) for i in range(data.shape[0])]
        pairs_test = [int(l1 + l2 in data_test["word"][i].lower()) for i in range(data_test.shape[0])]
        if (sum(pairs) > 120):
            data[l1+l2] = pairs
            data_test[l1+l2] = pairs_test

100%|██████████| 21/21 [16:59<00:00, 48.61s/it]


In [11]:
for l1 in tqdm(letters2):
    for l2 in letters2:
        pairs = [int(l1 + l2 in data["word"][i].lower()) for i in range(data.shape[0])]
        pairs_test = [int(l1 + l2 in data_test["word"][i].lower()) for i in range(data_test.shape[0])]
        if (sum(pairs) > 120):
            data[l1+l2] = pairs
            data_test[l1+l2] = pairs_test

100%|██████████| 21/21 [35:39<00:00, 101.59s/it]


In [12]:
print(data.shape)
data.head()

(101408, 461)


Unnamed: 0,word,is_surname,is upper,starts with upper,ends with l1,slogi,аи,ае,ау,ая,...,хн,хр,хт,чк,чн,чт,шк,шл,шн,шт
0,Аалтонен,1,0,1,0,3,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,Аар,0,0,1,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,Аарон,0,0,1,0,2,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,ААРОН,0,1,1,0,2,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,Аарона,0,0,1,1,2,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [13]:
print(data_test.shape)
data.head()

(188920, 460)


Unnamed: 0,word,is_surname,is upper,starts with upper,ends with l1,slogi,аи,ае,ау,ая,...,хн,хр,хт,чк,чн,чт,шк,шл,шн,шт
0,Аалтонен,1,0,1,0,3,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,Аар,0,0,1,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,Аарон,0,0,1,0,2,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,ААРОН,0,1,1,0,2,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,Аарона,0,0,1,1,2,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [14]:
res = []
for i in range(data.shape[0]):
    if (len(data["word"][i]) > 1):
        res.append(data["word"][i][-2].lower() + data["word"][i][-1].lower())

In [15]:
frequences = []
tmp = Counter(res)
for key in tmp.keys():
    frequences.append((tmp[key], key))
frequences = sorted(frequences,reverse=True)
frequences = frequences[:60]
frequences

[(5359, 'ом'),
 (5194, 'ми'),
 (4073, 'ов'),
 (3193, 'ия'),
 (3168, 'ам'),
 (2899, 'ка'),
 (2868, 'ах'),
 (2695, 'ки'),
 (2614, 'ие'),
 (2307, 'ой'),
 (2235, 'ии'),
 (2230, 'ем'),
 (1891, 'ию'),
 (1836, 'ей'),
 (1774, 'ку'),
 (1623, 'на'),
 (1580, 'ть'),
 (1450, 'ке'),
 (1323, 'ра'),
 (1287, 'та'),
 (1242, 'ий'),
 (1195, 'ти'),
 (1069, 'ты'),
 (1022, 'ью'),
 (978, 'ям'),
 (977, 'не'),
 (956, 'ва'),
 (955, 'ну'),
 (950, 'ях'),
 (928, 'ры'),
 (928, 'ик'),
 (856, 'ок'),
 (806, 'ны'),
 (773, 'ру'),
 (763, 'ту'),
 (690, 'ер'),
 (684, 'ре'),
 (670, 'те'),
 (650, 'ль'),
 (632, 'цы'),
 (626, 'да'),
 (620, 'са'),
 (593, 'ца'),
 (588, 'ма'),
 (560, 'ли'),
 (545, 'он'),
 (520, 'му'),
 (512, 'ле'),
 (510, 'ла'),
 (505, 'ин'),
 (503, 'ев'),
 (496, 'ву'),
 (447, 'во'),
 (422, 'ды'),
 (420, 'ля'),
 (400, 'ве'),
 (388, 'ор'),
 (384, 'су'),
 (382, 'ду'),
 (376, 'де')]

In [28]:
data_test.shape[0]

188920

In [29]:
for element in tqdm(frequences):
    cur_arr = [0 for i in range(data.shape[0])]
    cur_arr_test = [0 for i in range(data_test.shape[0])]
    for i in range(data.shape[0]):
        if (element[1] == data["word"][i].lower()[-2:]):
            cur_arr[i] = 1
    for i in range(data_test.shape[0]):
        if (element[1] == data_test["word"][i].lower()[-2:]):
            cur_arr_test[i] = 1
    data[element[1]+"_end"] = cur_arr
    data_test[element[1]+"_end"] = cur_arr_test

100%|██████████| 60/60 [05:09<00:00,  4.97s/it]


In [30]:
print (data.shape)
print (data_test.shape)

(101408, 521)
(188920, 520)


In [105]:
X = data.drop(["word","is_surname"], axis=1)
y = data["is_surname"]

In [106]:
X_test = data_test.drop(["word"], axis=1)

In [107]:
print(X.shape)
print(X_test.shape)

(101408, 520)
(188920, 520)


In [203]:
model = SGDClassifier()

In [44]:
my_cv = StratifiedShuffleSplit(y, n_iter = 5, random_state = 0)

In [205]:
parameters_grid = {
    'loss': ['log'],
    'penalty': ['l1', 'l2'],
    'alpha': [0.0001, 0.00005, 0.0002],
    'l1_ratio': [0.035, 0.04, 0.045, 0.05],
}

In [107]:
grid_cv = grid_search.GridSearchCV(model, parameters_grid, scoring = scorer.roc_auc_scorer, cv = my_cv)

In [101]:
%%time
grid_cv.fit(X, y)

CPU times: user 7min 2s, sys: 1min 47s, total: 8min 49s
Wall time: 8min 57s


GridSearchCV(cv=StratifiedShuffleSplit(labels=[1 0 ..., 0 0], n_iter=10, test_size=0.3, random_state=0),
       error_score='raise',
       estimator=SGDClassifier(alpha=0.0001, average=False, class_weight=None, epsilon=0.1,
       eta0=0.0, fit_intercept=True, l1_ratio=0.15,
       learning_rate='optimal', loss='hinge', n_iter=5, n_jobs=1,
       penalty='l2', power_t=0.5, random_state=None, shuffle=True,
       verbose=0, warm_start=False),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'alpha': [0.0001, 5e-05, 0.0002], 'l1_ratio': [0.035, 0.04, 0.045, 0.05], 'loss': ['log'], 'penalty': ['l1', 'l2']},
       pre_dispatch='2*n_jobs', refit=True,
       scoring=make_scorer(roc_auc_score, needs_threshold=True), verbose=0)

In [102]:
print (grid_cv.best_estimator_)
print (grid_cv.best_score_)
print (grid_cv.best_params_)

SGDClassifier(alpha=0.0001, average=False, class_weight=None, epsilon=0.1,
       eta0=0.0, fit_intercept=True, l1_ratio=0.04,
       learning_rate='optimal', loss='log', n_iter=5, n_jobs=1,
       penalty='l2', power_t=0.5, random_state=None, shuffle=True,
       verbose=0, warm_start=False)
0.8789510652527434
{'alpha': 0.0001, 'l1_ratio': 0.04, 'loss': 'log', 'penalty': 'l2'}


In [41]:
model = SGDClassifier(alpha=0.0001, l1_ratio=0.04, loss='log', penalty='l2')

In [45]:
model = RandomForestClassifier(n_estimators=200, min_samples_leaf=3, min_samples_split=4)

In [86]:
%%time        
model = RandomForestClassifier(n_estimators=200, min_samples_leaf=4, min_samples_split=5)
res = cross_val_score(model, X, y, scoring=scorer.roc_auc_scorer, cv=my_cv)
print (res)
print (np.mean(res))

KeyboardInterrupt: 

In [108]:
model = RandomForestClassifier(n_estimators=120, min_samples_leaf=3, min_samples_split=4)

In [109]:
%%time
model.fit(X, y)

CPU times: user 1min 3s, sys: 507 ms, total: 1min 4s
Wall time: 1min 4s


RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=3,
            min_samples_split=4, min_weight_fraction_leaf=0.0,
            n_estimators=120, n_jobs=1, oob_score=False, random_state=None,
            verbose=0, warm_start=False)

In [110]:
predictions = model.predict_proba(X_test)

In [111]:
predictions = predictions[:,1]

In [112]:
example = pd.read_csv("linear_ans_example.txt")
example.head()

Unnamed: 0,Id,Answer
0,0,0.0
1,1,0.0
2,2,0.0
3,3,0.0
4,4,0.0


In [113]:
example["Answer"] = predictions

In [114]:
example.head(25)

Unnamed: 0,Id,Answer
0,0,0.343442
1,1,0.140283
2,2,0.144631
3,3,0.105381
4,4,0.266849
5,5,0.210454
6,6,0.401724
7,7,0.345496
8,8,0.345496
9,9,0.346018


In [115]:
example.to_csv("submission3.txt", index=False)