In [1]:
# Python 2 only: make UTF-8 the interpreter-wide default codec used for implicit
# str <-> unicode conversions (presumably needed for the Cyrillic words handled
# below -- TODO confirm which cells actually rely on it).
import sys
# The module is reloaded before the call; presumably setdefaultencoding is not
# reachable on the initially imported `sys` -- confirm against the Python 2 docs.
reload(sys)
sys.setdefaultencoding("utf-8")

In [2]:
%matplotlib inline
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import itertools

## 1. Get the Data 

In [3]:
# Column names are supplied explicitly through `names`: the training file is
# read as (Word, Label) pairs, the test file as a single Word column.
train = pd.read_csv('data/train.csv', names=['Word', 'Label'])
test = pd.read_csv('data/test.csv', names=['Word'])

## 2. Feature Engineering

In [4]:
# Suffixes that are typical for surnames.
#
# Every entry must be followed by a comma: Python silently concatenates
# adjacent string literals, which previously fused "ава"+"ли", "ер"+"ти",
# "ук"+"юк" and "ний"+"ный" into four endings that were never intended.
# Repeated entries were dropped as well, since each of them only produced a
# feature column identical to an earlier one.
surname_endings = [
    "ба", "уа", "ипа", "заде", "ли", "лы", "оглу", "кызы", "ян",
    "янц", "уни", "ич", "ов", "ук", "ик", "ски", "ка", "ев",
    "огло", "пулос", "кос", "иди", "швили", "дзе", "ури", "иа",
    "ава", "си", "ни", "ини", "те", "ис", "не",
    "ску", "ан", "ул", "ын", "ин", "шкин", "кин", "ман", "ер",
    "ти", "цки", "дзки", "ских", "джи",
    "ишин", "ко", "юк", "ун", "ний", "ный", "чай", "ий",
    "ский", "цкий", "ская", "цкая",
]

# Turn every suffix into a pattern for re.match: anything, then the suffix,
# then at most two further symbols before the end of the word.
surname_endings = [".*" + ending + ".?.?$" for ending in surname_endings]

In [5]:
# Names of the per-ending regex feature columns. build_features appends to this
# list, and it is later used to select those columns when computing `Ends`.
labels_for_remove = []

In [6]:
import re

def countDoubles(word):
    """Count positions where a letter is immediately followed by itself.

    The comparison is case-insensitive (the word is lower-cased first) and
    overlapping pairs are counted separately, so "ааа" yields 2.

    Parameters
    ----------
    word : str or unicode
        The word to inspect; an empty or one-letter word yields 0.

    Returns
    -------
    int
        Number of adjacent equal-letter pairs.
    """
    letters = word.lower()
    # The built-in zip is available on Python 2 and 3 alike, whereas
    # itertools.izip exists on Python 2 only.
    return sum(1 for x, y in zip(letters, letters[1:]) if x == y)

def build_features(data):
    """Add hand-crafted word features to ``data`` in place.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with a ``Word`` column whose values support ``.decode('utf-8')``
        (i.e. Python 2 byte strings). The column is replaced by its decoded
        form, so the frame should be passed in only once.

    Returns
    -------
    None
        All features are written as new columns of ``data``.

    Side effects
    ------------
    The names of the ``has_re_end_*`` columns are recorded in the module-level
    ``labels_for_remove`` list. Each name is stored only once, so calling this
    function for both the train and the test frame no longer duplicates them.
    """
    # NOTE(review): these patterns run on the word *before* it is decoded
    # below, so under Python 2 `.` presumably matches one byte rather than one
    # Cyrillic letter -- confirm that this is the intended behaviour.
    for i, substr in enumerate(surname_endings):
        column = 'has_re_end_{}'.format(i)
        data[column] = data['Word'].apply(lambda x: re.match(substr, x, re.UNICODE) is not None)
        if column not in labels_for_remove:
            labels_for_remove.append(column)

    for i, substr in enumerate(['^Дж.*']):
        data['has_start_Dj{}'.format(i)] = data['Word'].apply(lambda x: re.match(substr, x, re.UNICODE) is not None)

    # From here on the words are unicode, so lengths and indexing are per letter.
    data['Word'] = data['Word'].apply(lambda word: word.decode('utf-8'))
    data['Length'] = data['Word'].apply(len)

    # A set gives constant-time membership tests while counting vowels.
    vowels = frozenset(
        let.decode('utf-8')
        for let in ['а', 'я', 'ё', 'у', 'е', 'о', 'э', 'ю', 'и', 'ы',
                    'Ё', 'У', 'Е', 'Ы', 'А', 'О', 'Э', 'Ю', 'И', 'Я']
    )
    data['Vowels'] = data['Word'].apply(lambda word: sum(1 for letter in word if letter in vowels))
    # Everything that is not a vowel is counted here, not only consonant letters.
    data['Consonants'] = data['Length'] - data['Vowels']

    # 1 when the first symbol sorts at or after the lowercase letter 'а'.
    first_lower = 'а'.decode('utf-8')
    data['is_lower'] = data['Word'].apply(lambda word: int(word[0] >= first_lower))

    data['Double'] = data['Word'].apply(countDoubles)
    data['Caps'] = data['Word'].apply(lambda x: x.upper() == x)
    data['Frac_vowels'] = data['Vowels'].apply(float) / data['Length']
    for i, substr in enumerate(['сон', 'ы', 'э', 'щ', 'ъ', 'й', 'ф']):
        s = substr.decode('utf-8')
        data['has_{}'.format(i)] = data['Word'].apply(lambda x: x.find(s) >= 0)

    # fillna returns a new frame unless inplace=True; the previous bare call
    # discarded that result and therefore left the missing values untouched.
    data.fillna(0, inplace=True)

In [7]:
# build_features writes its columns into the frame it receives, so both frames
# gain the feature columns in place.
build_features(train)
build_features(test)

In [8]:
# `Ends` is True when at least one per-ending regex column is set for the word.
for frame in (train, test):
    frame['Ends'] = frame[labels_for_remove].sum(axis=1) > 0

### N-grams

In [5]:
import marisa_trie
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.externals import six

class MarisaCountVectorizer(CountVectorizer):
    """CountVectorizer that keeps its fitted vocabulary in a ``marisa_trie.Trie``.

    After the regular fit, the vocabulary mapping is replaced by a trie built
    from the same terms, and the columns of the count matrix are permuted so
    that they follow the indices the trie assigns to the terms.
    """

    # ``CountVectorizer.fit`` method calls ``fit_transform`` so
    # ``fit`` is not provided
    def fit_transform(self, raw_documents, y=None):
        """Fit via the parent class, then freeze the vocabulary into a trie.

        ``y`` is accepted but not passed on to the parent call. Returns the
        count matrix as returned by ``_freeze_vocabulary``.
        """
        X = super(MarisaCountVectorizer, self).fit_transform(raw_documents)
        X = self._freeze_vocabulary(X)
        return X

    def _freeze_vocabulary(self, X=None):
        """Swap the fitted vocabulary for a trie unless it is already fixed.

        When ``X`` is given its columns are reordered to match the trie before
        the vocabulary is replaced. The vectorizer is then marked as having a
        fixed vocabulary and its ``stop_words_`` attribute is deleted.
        Returns ``X`` (reordered, or unchanged when nothing was frozen).
        """
        if not self.fixed_vocabulary_:
            frozen = marisa_trie.Trie(six.iterkeys(self.vocabulary_))
            if X is not None:
                # Must happen while self.vocabulary_ still holds the old mapping.
                X = self._reorder_features(X, self.vocabulary_, frozen)
            self.vocabulary_ = frozen
            self.fixed_vocabulary_ = True
            del self.stop_words_
        return X

    def _reorder_features(self, X, old_vocabulary, new_vocabulary):
        """Return ``X`` with columns permuted from the old to the new indexing.

        For every term, the column at ``new_vocabulary``'s index for that term
        is taken from the column at ``old_vocabulary[term]``.
        """
        # map_index[new column] = old column; every slot is filled by the loop
        # as long as both vocabularies contain the same terms.
        map_index = np.empty(len(old_vocabulary), dtype=np.int32)
        for term, new_val in six.iteritems(new_vocabulary):
            map_index[new_val] = old_vocabulary[term]
        return X[:, map_index]

In [301]:
# Character n-grams of 2 to 6 symbols using the 'char_wb' analyzer, with the
# original letter case kept (lowercase=False) and the vocabulary capped at
# 250000 features.
ngram_vectorizer = MarisaCountVectorizer(
    analyzer='char_wb', 
    ngram_range=(2, 6), 
    min_df=1, 
    max_features=250000,
    lowercase=False
) 
# The vocabulary is learned from the training words only; the test words are
# then mapped onto that same vocabulary.
ngram_train = ngram_vectorizer.fit_transform(train.Word)
ngram_test = ngram_vectorizer.transform(test.Word)

In [270]:
# Sanity check of the matrix size: (training words, n-gram features).
ngram_train.shape

(101408, 250000)

In [271]:
# Preview the first five (word, label) pairs.
train[['Word', 'Label']].head()

Unnamed: 0,Word,Label
0,Аалтонен,1
1,Аар,0
2,Аарон,0
3,ААРОН,0
4,Аарона,0


## 3. Cross validation 

In [272]:
# sklearn.cross_validation was deprecated in scikit-learn 0.18 and removed in
# 0.20; prefer model_selection and fall back only on older installations.
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split

# Hold out 10% of the training words for validation; the seed fixes the split.
xtrain, xcv, ytrain, ycv = train_test_split(
    ngram_train, train['Label'], test_size=0.1, random_state=10
)

#### a) Try Logistic Regression 

In [332]:
from sklearn.linear_model import LogisticRegression

# L2-regularised logistic regression without an intercept term.
lr = LogisticRegression(penalty='l2', C=0.45, fit_intercept=False,
                        solver='lbfgs', multi_class='ovr')

# fit() returns the estimator itself, so training and scoring the hold-out
# split can be chained.
predictionLR = lr.fit(xtrain, ytrain).predict_proba(xcv)

In [333]:
from sklearn.metrics import roc_auc_score

# ROC AUC on the hold-out split, scored with the second predict_proba column
# (presumably the probability of label 1 -- confirm against lr.classes_).
roc_auc_score(ycv, predictionLR[:,1])

0.92774537770233123

Best hold-out ROC AUC so far (logistic regression): 0.92774537770233123

#### b) Try Random Forest 

In [313]:
from sklearn.ensemble import RandomForestClassifier

# A random forest is stochastic; without a seed the score below changes from
# run to run. The seed matches the one used for the final logistic regression.
rf = RandomForestClassifier(n_estimators=10, random_state=42)
rf.fit(xtrain, ytrain)
predictionRF = rf.predict_proba(xcv)

# ROC AUC on the hold-out split, scored with the second predict_proba column.
roc_auc_score(ycv, predictionRF[:,1])

0.85567480931027162

## 4. Make prediction 

In [302]:
# Same hyper-parameters as the validated model, now trained on every labelled
# word before scoring the test set.
lr = LogisticRegression(penalty='l2', C=0.45, fit_intercept=False,
                        random_state=42, solver='lbfgs', multi_class='ovr')

# fit() returns the estimator, so the test probabilities follow directly.
submitLR = lr.fit(ngram_train, train['Label']).predict_proba(ngram_test)

In [303]:
# Submission frame: a running row id plus the second predict_proba column.
submit = pd.DataFrame()
# range() exists on Python 2 and 3 alike, whereas xrange is Python 2 only.
submit['Id'] = range(len(test))
submit['Answer'] = submitLR[:, 1]
submit.head()

Unnamed: 0,Id,Answer
0,0,0.431873
1,1,0.318784
2,2,0.170162
3,3,0.079178
4,4,0.168515


In [306]:
# index=False keeps the frame's row index out of the file, so it contains only
# the Id and Answer columns.
submit.to_csv('submit.csv', index = False)