In [95]:
import os
from pathlib import Path
import pandas as pd
import numpy as np

import tensorflow as tf
from tensorflow import keras
import matplotlib.pyplot as plt

In [96]:
# The raw data is expected in ../data/raw relative to the notebook's working
# directory, with one sub-directory per language.
cwd = Path.cwd()
mainpath = cwd.parent / "data/raw"

# Path.iterdir() yields entries in arbitrary, filesystem-dependent order.
# Sorting keeps the language order (and therefore the row order of anything
# built from it) the same on every machine and every run.
languagepaths = sorted(x for x in mainpath.iterdir() if x.is_dir())
languages = [x.name for x in languagepaths]

In [97]:
# Column layout of the header-less, tab-separated source files.
fieldnames = ['hit_id', 'sentence', 'start_offset', 'end_offset', 'target_word', 'native_annots', 
              'nonnative_annots', 'native_complex', 'nonnative_complex', 'gold_label', 'gold_prob']

# Only these columns are carried forward into the combined table.
relevant_cols = ['sentence', 'target_word', 'gold_label']

# One frame per file is collected here and concatenated once at the end.
# Growing a DataFrame with DataFrame.append inside the loop re-copies all
# accumulated rows on every iteration, and that method was removed in pandas 2.0.
source_frames = []

for lang in languagepaths:
    # sorted(): Path.iterdir() order is arbitrary; a fixed order keeps the row
    # index of full_df reproducible.
    for datasource in sorted(lang.iterdir()):
        # File stems look like "<source>_<data_type>", e.g. "News_Dev".
        source_data_type = datasource.stem.split('_')
        source = source_data_type[0]
        data_type = source_data_type[1]
        print(lang.name, source, data_type)
        source_df = pd.read_csv(datasource, sep='\t', header=None, names=fieldnames)

        # assign() returns a new frame, so the provenance columns are never
        # written onto a slice of source_df (no SettingWithCopyWarning and no
        # need for the removed `is_copy` attribute).
        relevant_df = source_df[relevant_cols].assign(
            lang=lang.name,
            source=source,
            data_type=data_type,
        )
        source_frames.append(relevant_df)

if source_frames:
    full_df = pd.concat(source_frames, ignore_index=True)
else:
    # pd.concat raises on an empty list; keep an empty frame with the expected
    # columns so that later cells fail (or not) on the data, not on a missing column.
    full_df = pd.DataFrame(columns=relevant_cols + ['lang', 'source', 'data_type'])
        

english News Dev
english News Test
english News Train
english WikiNews Dev
english WikiNews Test
english WikiNews Train
english Wikipedia Dev
english Wikipedia Test
english Wikipedia Train
french French Test
german German Dev
german German Test
german German Train
spanish Spanish Dev
spanish Spanish Test
spanish Spanish Train


In [98]:
# Preview the first rows that were loaded from the Spanish directory.
is_spanish = full_df['lang'] == 'spanish'
full_df.loc[is_spanish].head()

Unnamed: 0,sentence,target_word,gold_label,lang,source,data_type
45035,Los Bronces de Riace conocidos también como Lo...,Los Bronces de Riace,1,spanish,Spanish,Dev
45036,Los Bronces de Riace conocidos también como Lo...,Bronces,1,spanish,Spanish,Dev
45037,Los Bronces de Riace conocidos también como Lo...,Riace,1,spanish,Spanish,Dev
45038,Los Bronces de Riace conocidos también como Lo...,griegas,1,spanish,Spanish,Dev
45039,Los Bronces de Riace conocidos también como Lo...,conocidos,0,spanish,Spanish,Dev


In [100]:
# NOTE: this binds a second name to the same DataFrame object (no copy is made),
# so columns later assigned on `target` are also visible through `full_df`.
target = full_df

In [101]:
def get_sent_len_words(sentence):
    """Return the number of whitespace-separated tokens in ``sentence``."""
    tokens = sentence.split()
    return len(tokens)

def get_sent_len_chars(sentence):
    """Return the length of ``sentence`` in characters, whitespace included."""
    char_count = len(sentence)
    return char_count

def get_sent_len_chars_avg(sentence):
    """Return the number of characters per whitespace-separated token.

    The numerator is the full length of ``sentence`` (whitespace and
    punctuation included); the denominator is the token count from
    ``str.split()``.

    Returns 0.0 for an empty or whitespace-only sentence instead of raising
    ZeroDivisionError.
    """
    num_words = len(sentence.split())
    if num_words == 0:
        return 0.0
    return len(sentence) / num_words

def get_num_target_words(words):
    """Return the number of whitespace-separated tokens in the target ``words``.

    Uses ``str.split()`` (any run of whitespace is one separator), the same
    tokenisation as ``get_sent_len_words``. Splitting on a single ' ' counted an
    empty token for every repeated, leading or trailing space. An empty or
    whitespace-only string yields 0.
    """
    return len(words.split())

def get_avg_word_len(words):
    """Return the mean character length of the whitespace-separated tokens in ``words``.

    Whitespace is excluded from the character count. Tokens come from
    ``str.split()``, so repeated, leading or trailing spaces neither add
    phantom zero-length words nor leak into the character total (splitting on
    a single ' ' did both).

    Returns 0.0 when ``words`` contains no tokens.
    """
    tokens = words.split()
    if not tokens:
        return 0.0

    word_chars = sum(len(token) for token in tokens)
    return word_chars / len(tokens)
        

In [102]:
# Every feature is a function of a single column, so map the helper over that
# column directly. A row-wise DataFrame.apply(axis=1) builds a Series object
# for each row just to read one field from it, which is far slower for the
# same result.
#
# The former `target.is_copy = False` line is dropped: `is_copy` was deprecated
# and later removed from pandas, and `target` is not a slice of another frame.
target['sent_len_w'] = target['sentence'].map(get_sent_len_words)
target['sent_len_c'] = target['sentence'].map(get_sent_len_chars)
target['sent_len_c_avg'] = target['sentence'].map(get_sent_len_chars_avg)
target['avg_target_len'] = target['target_word'].map(get_avg_word_len)
target['num_target_w'] = target['target_word'].map(get_num_target_words)

target.head()

Unnamed: 0,sentence,target_word,gold_label,lang,source,data_type,sent_len_w,sent_len_c,sent_len_c_avg,avg_target_len,num_target_w
0,Syrian troops shelled a rebel-held town on Mon...,troops,0,english,News,Dev,22,155,7.045455,6.0,1
1,Syrian troops shelled a rebel-held town on Mon...,Syrian,0,english,News,Dev,22,155,7.045455,6.0,1
2,Syrian troops shelled a rebel-held town on Mon...,shelled,1,english,News,Dev,22,155,7.045455,7.0,1
3,Syrian troops shelled a rebel-held town on Mon...,rebel-held,1,english,News,Dev,22,155,7.045455,10.0,1
4,Syrian troops shelled a rebel-held town on Mon...,sparking,1,english,News,Dev,22,155,7.045455,8.0,1


In [103]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
target.describe()

Unnamed: 0,gold_label,sent_len_w,sent_len_c,sent_len_c_avg,avg_target_len,num_target_w
count,62640.0,62640.0,62640.0,62640.0,62640.0,62640.0
mean,0.405045,29.454406,176.769987,6.053152,7.111901,1.196903
std,0.490905,15.647834,92.049889,0.693742,2.645286,0.626468
min,0.0,1.0,3.0,2.5,1.0,1.0
25%,0.0,19.0,115.0,5.6,5.0,1.0
50%,0.0,27.0,160.0,6.022989,7.0,1.0
75%,1.0,36.0,217.0,6.470588,9.0,1.0
max,1.0,124.0,843.0,10.0,29.0,11.0


In [109]:
# Partition the feature table by the split name stored in each row's
# 'data_type' column.
train, test, dev = (
    target.loc[target['data_type'] == split_name]
    for split_name in ('Train', 'Test', 'Dev')
)

In [110]:
# select_dtypes is the public way to pick the numeric columns;
# DataFrame._get_numeric_data() is a private pandas method.
numeric_cols = target.select_dtypes(include='number')

# Correlation (Series.corr default method) of every numeric feature with the
# gold label, one "name<TAB>value" line per feature.
gold_labels = target['gold_label']
for colname in numeric_cols:
    if colname == 'gold_label':
        # The label's correlation with itself carries no information.
        continue
    print("{}\t{:9.5f}".format(colname, gold_labels.corr(target[colname])))

sent_len_w	 -0.03179
sent_len_c	 -0.02128
sent_len_c_avg	  0.05917
avg_target_len	  0.31841
num_target_w	  0.28370


In [119]:
# gold_label == 1 marks a complex target and 0 a non-complex one: in the rows
# displayed earlier in this notebook 'shelled' and 'rebel-held' carry label 1
# while 'troops' and 'Syrian' carry label 0. The names are ordered so that
# class_labels[label] is the name of that label (this also matches the column
# order produced by one-hot encoding the labels).
class_labels = ['Non-Complex', 'Complex']


def _features_and_labels(split_df):
    """Separate one data split into its numeric features and its gold labels.

    Args:
        split_df: DataFrame holding a 'gold_label' column plus feature columns.

    Returns:
        A 5-tuple ``(numeric_df, features_df, feature_names, Xs, ys)``:
        the numeric columns of ``split_df``; those columns without
        'gold_label'; the feature column index; the features as an array;
        and the 'gold_label' values as an array.
    """
    # Public replacement for the private DataFrame._get_numeric_data().
    numeric_df = split_df.select_dtypes(include='number')
    features_df = numeric_df.drop('gold_label', axis=1)
    return (
        numeric_df,
        features_df,
        features_df.columns,
        features_df.values,
        numeric_df['gold_label'].values,
    )


# The same extraction is applied to each split so the three cannot drift apart.
train_data, train_features, train_feature_names, train_Xs, train_ys = _features_and_labels(train)
dev_data, dev_features, dev_feature_names, dev_Xs, dev_ys = _features_and_labels(dev)
test_data, test_features, test_feature_names, test_Xs, test_ys = _features_and_labels(test)

In [148]:
# Report the shape of every feature/label array, one "name:<TAB>shape" line each.
shape_report = [
    ("train_Xs:\t", train_Xs),
    ("train_ys:\t", train_ys),
    ("dev_Xs:\t\t", dev_Xs),
    ("dev_ys:\t\t", dev_ys),
    ("test_Xs:\t", test_Xs),
    ("test_ys:\t", test_ys),
]
print("".join("{}{}\n".format(label, str(array.shape)) for label, array in shape_report))

train_Xs:	(47200, 5)
train_ys:	(47200,)
dev_Xs:		(5745, 5)
dev_ys:		(5745,)
test_Xs:	(9695, 5)
test_ys:	(9695,)



In [203]:
# Normalizing the columns:
#
# Min-max scaling with statistics taken from the training split only; the same
# minimum/maximum are then applied to dev and test so that all three splits
# share one scale. (keras.utils.normalize was set aside because it offered no
# apparent way to carry the training factors over to the other splits.)
max_vals = np.max(train_Xs, axis=0)
min_vals = np.min(train_Xs, axis=0)

# A feature that is constant in the training split has max == min; dividing by
# that zero range would fill the column with inf/nan. Use a range of 1 there.
value_range = max_vals - min_vals
norm_factor = 1 / np.where(value_range == 0, 1, value_range)

# Algebraically (x - min) / (max - min): training values land in [0, 1];
# dev/test values outside the training range fall outside that interval.
train_Xs_norm = norm_factor * (train_Xs - max_vals) + 1
dev_Xs_norm = norm_factor * (dev_Xs - max_vals) + 1
test_Xs_norm = norm_factor * (test_Xs - max_vals) + 1

# Fix the one-hot width explicitly. Without num_classes, to_categorical infers
# it from the largest label present, so a split missing the positive class
# would be encoded with a different number of columns.
NUM_CLASSES = 2
train_ys_cat = keras.utils.to_categorical(train_ys, num_classes=NUM_CLASSES)
dev_ys_cat = keras.utils.to_categorical(dev_ys, num_classes=NUM_CLASSES)
test_ys_cat = keras.utils.to_categorical(test_ys, num_classes=NUM_CLASSES)

In [207]:
# Feed-forward classifier: two 128-unit ReLU hidden layers and a 2-way softmax
# output, one unit per one-hot label column.
# The input width is read from the training matrix instead of being hard-coded,
# so adding or removing a feature column does not require editing this cell.
num_features = train_Xs_norm.shape[1]

model = keras.Sequential([
    keras.layers.Flatten(input_shape=(num_features,)),
    keras.layers.Dense(128, activation=tf.nn.relu),
    keras.layers.Dense(128, activation=tf.nn.relu),
    keras.layers.Dense(2, activation=tf.nn.softmax)
])

# 'adam' selects Keras's own Adam optimizer with its default settings.
# tf.train.AdamOptimizer is a TensorFlow 1.x API that is not available under
# `tf.train` in TensorFlow 2, whereas the string identifier works in both.
model.compile(optimizer='adam',
              loss='categorical_crossentropy',
              metrics=['accuracy'])

In [208]:
# Train for 5 epochs. The dev split is passed as validation data so that loss
# and accuracy on held-out rows are reported after every epoch; validation data
# is only evaluated, never trained on.
model.fit(train_Xs_norm, train_ys_cat, epochs=5,
          validation_data=(dev_Xs_norm, dev_ys_cat))

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<tensorflow.python.keras._impl.keras.callbacks.History at 0x207c669fa58>

In [209]:
# Score the trained model on the held-out test split; evaluate() returns the
# loss followed by the metrics given to compile() (accuracy here).
test_metrics = model.evaluate(test_Xs_norm, test_ys_cat)
test_loss, test_acc = test_metrics

print('Test accuracy:', test_acc)

Test accuracy: 0.7322331098750303
