# Classifier for CEFR Levels

In [1]:
from theano.sandbox import cuda

In [2]:
from keras.models import Sequential
from keras.layers import Embedding, Dense, Convolution1D, MaxPooling1D, Dropout, Flatten, BatchNormalization

Using Theano backend.


In [49]:
import numpy as np
import string
import cPickle as pickle
import re
import pandas as pd
import itertools
import bcolz
import os
from keras.utils.np_utils import to_categorical
from keras.utils.data_utils import get_file
from keras.preprocessing.sequence import pad_sequences

This path can be adjusted as needed; I just dumped the 'sentences' folder from the unzipped file from Giuseppe and put it in the /data directory.

In [13]:
data_path = 'data/sentences/'

In [5]:
%matplotlib inline

In [6]:
model_path = 'data/models'

In [17]:
cefr_levels = ['a1','a2','b1','b2']

## Sort data into train, valid, and test sets

In [48]:
%pwd

u'/home/ubuntu/projects/cefr/data'

In [53]:
%cd data

/home/ubuntu/projects/cefr


In [50]:
%mkdir train
%mkdir valid
%mkdir test

In [82]:
train_path = 'data/train/'
valid_path = 'data/valid/'
test_path =  'data/test/'

#### Adjust what share of the data you want allocated to each subset.

In [83]:
train_share = 0.6
valid_share = 0.2
test_share = 0.2
assert train_share + valid_share + test_share == 1

#### Read the data and randomly sort into subsets.
After sorting, arrays are saved to file (w/Pickle) so this step only has to be done once.

In [100]:
base_file = '_sentences.txt'
train = np.empty((1,2),dtype='object')
valid = np.empty((1,2),dtype='object')
test  = np.empty((1,2),dtype='object')
for l in range(len(cefr_levels)):
    level = []
    with open(data_path + cefr_levels[l] + base_file,'r') as f:
        for line in f:
            raw = line.strip().decode('unicode_escape').encode('ascii','ignore')
            no_punctuation = raw.translate(None, string.punctuation)
            words = no_punctuation.split(' ')
            level.append((words,l))
    count = len(level)
    print(cefr_levels[l],count)
    train_idx = int(train_share * count)
    valid_idx = int(valid_share * count) + train_idx
    shuf = np.random.permutation(level)
    train = np.concatenate((train, shuf[:train_idx]))
    valid = np.concatenate((valid, shuf[train_idx:valid_idx]))
    test  = np.concatenate((test,  shuf[valid_idx:]))

pickle.dump(train, open(train_path + "sentences.p", 'wb'))  
pickle.dump(valid, open(valid_path + "sentences.p", 'wb'))
pickle.dump(test,  open(test_path + "sentences.p", 'wb'))


('a1', 2487)
(1492, 1989)
('a2', 5110)
(3066, 4088)
('b1', 6371)
(3822, 5096)
('b2', 4806)
(2883, 3844)


### Load and process data

In [101]:
train_file = pickle.load(open(train_path + 'sentences.p','rb'))
valid_file = pickle.load(open(valid_path + 'sentences.p','rb'))
test_file  = pickle.load(open(test_path + 'sentences.p','rb'))

train, train_labels = zip(*train_file)
valid, valid_labels = zip(*valid_file)
test, test_labels   = zip(*test_file)

In [102]:
print(len(train),len(valid),len(test))

(11264, 3755, 3758)


In [103]:
print(len(train_labels),len(valid_labels),len(test_labels))

(11264, 3755, 3758)


In [106]:
print(train[2500])

['What', 'shall', 'I', 'do', 'Benny', 'asked', 'his', 'grandfather']


In [107]:
print(train_labels[2500])

1


Use one-hot encoding on the labels:

In [108]:
train_labels = to_categorical(train_labels)
valid_labels = to_categorical(valid_labels)
test_labels = to_categorical(test_labels)

In [109]:
train_labels[2500]

array([ 0.,  1.,  0.,  0.])