In [85]:
import itertools
import numpy as np
import pandas as pd
from sklearn.cross_validation import cross_val_score
from sklearn import linear_model

In [24]:
data = pd.read_csv('data/export2.csv', sep='\t')
words = data[['word', 'valid']]

### Try Feature Engineering 

In [89]:
def countDoubles(word):
    l = [let for let in word.lower()]
    return len([(x,y) for x,y in itertools.izip(l, l[1:]) if x == y])

def countTriples(word):
    l = [let for let in word.lower()]
    return len([(x,y, z) for x, y, z in itertools.izip(l, l[1:], l[2:]) if x == y and y == z])

In [93]:
words['word'] = words.word.apply(lambda word: word.lower())
words['length'] = words.word.apply(lambda word: len(word))

vowels = ['a', 'e', 'i', 'o', 'u', 'y']
words['vowels'] = words.word.apply(lambda word: len([letter for letter in word if letter in vowels]))
words['consonants'] = words['length'] - words['vowels']

words['has_apos'] = words.word.apply(lambda x: ("'" in x))
words['count_apos'] = words.word.apply(lambda x: x.count("'"))
words['count_apos_>_1'] = words.word.apply(lambda x: x.count("'") > 1)

words['doubles'] = words.word.apply(lambda word: countDoubles(word))
words['triples'] = words.word.apply(lambda word: countTriples(word))

In [120]:
words[:5]

Unnamed: 0,word,valid,has_apos,count_apos,count_apos_>_1,length,vowels,consonants,doubles,triples
0,',False,True,1,False,1,0,1,0,0
1,'',False,True,2,True,2,0,2,1,0
2,'''xwmg'niwblrfczcrymryrk'n,False,True,5,True,27,3,24,2,1
3,''abwzluwxbksrwoijfyuhkn''l,False,True,4,True,27,6,21,2,0
4,''aosyncbbuhygp'fet,False,True,3,True,19,6,13,2,0


In [119]:
data[:5]

Unnamed: 0,word,valid,col0,col1,col2,col3,col4
0,',False,class4,class3,class2,class1,class0
1,'',False,class5,class3,class2,class1,class0
2,'''xwmg'niwblrfczcrymryrk'n,False,class4,class5,class5,class3,class1
3,''abwzluwxbksrwoijfyuhkn''l,False,class4,class5,class5,class3,class1
4,''aosyncbbuhygp'fet,False,class4,class5,class5,class3,class0


In [200]:
dum = pd.get_dummies(train, columns=train.columns)

In [201]:
dum.shape

(102986, 24)

### Try N-Grams 

In [185]:
from sklearn.feature_extraction.text import CountVectorizer

ngram_vectorizer = CountVectorizer(
    analyzer='char_wb',
    ngram_range=(1, 5),
    min_df=1
) 

ngram_data = ngram_vectorizer.fit_transform(data.word)

In [186]:
ngram_data.shape

(102986, 552998)

### Union data

In [197]:
target = data.valid.astype(int).values
# train = words[words.columns[2:]]
train = data[data.columns[2:]]
# train = pd.concat([train, data[data.columns[2:]]], axis=1)

In [198]:
target

array([0, 0, 0, ..., 0, 0, 0])

In [199]:
train[:5]

Unnamed: 0,col0,col1,col2,col3,col4
0,class4,class3,class2,class1,class0
1,class5,class3,class2,class1,class0
2,class4,class5,class5,class3,class1
3,class4,class5,class5,class3,class1
4,class4,class5,class5,class3,class0


In [202]:
from sklearn.linear_model import LogisticRegression

lr = LogisticRegression(
    C=0.05,
    penalty='l2', # 'l1'
    solver='lbfgs',  # 'newton-cg', 'lbfgs', 'liblinear', 'sag'
    fit_intercept=False,
    multi_class='ovr'
)

In [203]:
cross_val_score(lr, dum, target, cv=3).mean()

0.89577228188192537

### Scores:

Syllables + FE: 0.89665590099649817 <br>
Syllables clear: 0.89577228188192537 <br>
FE clear: 0.62845441706336602 <br>
1-grams: 0.60693684485933941 <br>
2-grams: 0.51982793759311707 <br>
3-grams: 0.54606455087492101 <br>
4-grams: 0.57074752730215528 <br>
5-grams: 0.57987500433819938 <br>
1-5-grams: 0.5726409908125395

### Afterwards: Try RF 

In [12]:
from sklearn.preprocessing import LabelEncoder
lbl_enc = LabelEncoder()

for c in train.columns:
    train[c] = lbl_enc.fit_transform(train[c])

In [107]:
from sklearn.cross_validation import train_test_split

xtr, xcv, ytr, ycv = train_test_split(train, target, test_size = 0.05,  random_state = 42)

In [108]:
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier(n_estimators=100)
rf.fit(xtr, ytr)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [109]:
from sklearn.metrics import accuracy_score

predictions = rf.predict(xcv)
accuracy_score(ycv, predictions)

0.62213592233009707

0.62679611650485434 best