In [None]:
import datetime as dt
from functools import wraps

import graphviz 
import pandas as pd
from sklearn import tree
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.utils import resample


In [None]:
# Preview the first rows of `df`.
# NOTE(review): `df` is not created in any cell visible above this one — confirm that a
# data-loading cell runs first, otherwise this fails on a fresh kernel.
df.head()

In [None]:
# Make sure that there are no repeated words: every row must hold a distinct 'woord'.
assert df["woord"].is_unique, "column 'woord' contains repeated words"

Build the feature set from hand-crafted features

### Baseline features
* How many characters does the word have?
* Is there a dash in the word?
* Does it end with 'en' or 'jes'? If so, it is treated as plural.

### Additional features
* Is the word singular or plural?
* Is it a place, an organization or a person?
* Is the word a compound made of more than one word?

In [None]:

def log_step(func):
    """Decorate a pipeline step so that every call prints a one-line log.

    The log line contains the step's name, the ``shape`` of its result and the
    wall-clock time the call took.

    Parameters
    ----------
    func : callable
        The pipeline step to wrap.

    Returns
    -------
    callable
        A wrapper that forwards all arguments to ``func`` and returns its
        result unchanged. ``functools.wraps`` keeps ``func``'s name and
        docstring on the wrapper.
    """
    @wraps(func)
    def wrapper(*args, **kwargs):
        tic = dt.datetime.now()
        result = func(*args, **kwargs)
        time_taken = str(dt.datetime.now() - tic)
        # A step may return something without a ``shape`` attribute; logging must
        # not raise AttributeError after the step itself has already succeeded.
        shape = getattr(result, "shape", "n/a")
        print(f"just ran step {func.__name__} shape={shape} took {time_taken}s")
        return result
    return wrapper



@log_step
def start_pipeline(dataf):
    """Return a copy of ``dataf`` (made with ``DataFrame.copy``) as the pipeline's working frame."""
    working_frame = dataf.copy()
    return working_frame

@log_step
def select_rows_by_column_values(dataf,column,values):
    """Keep only the rows of ``dataf`` whose entry in ``column`` is one of ``values``."""
    # Boolean mask: True where the cell's value is contained in ``values``.
    row_mask = dataf[column].isin(values)
    return dataf[row_mask]

@log_step
def select_columns(dataf, columns):
    """Return ``dataf`` indexed by ``columns`` (the column selection ``dataf[columns]``)."""
    selection = dataf[columns]
    return selection

@log_step
def count_characters(dataf,column_name='woord'):
    """Add a ``character_count`` column with the length of each value in ``column_name``.

    Parameters
    ----------
    dataf : pandas.DataFrame
        Frame containing the text column.
    column_name : str, default 'woord'
        Name of the text column to measure.

    Returns
    -------
    pandas.DataFrame
        A new frame with the extra column; ``dataf`` itself is not modified.
    """
    # ``assign`` builds a new frame instead of writing into the caller's object,
    # and the vectorised ``.str.len()`` replaces the per-row Python lambda.
    return dataf.assign(character_count=dataf[column_name].str.len())

@log_step
def has_dash(dataf,column_name='woord'):
    """Add a boolean ``has_dash`` column: True where the value in ``column_name`` contains '-'.

    Parameters
    ----------
    dataf : pandas.DataFrame
        Frame containing the text column.
    column_name : str, default 'woord'
        Name of the text column to inspect.

    Returns
    -------
    pandas.DataFrame
        A new frame with the extra column; ``dataf`` itself is not modified.
    """
    # regex=False: '-' is matched as a literal substring, like the ``in`` operator.
    return dataf.assign(has_dash=dataf[column_name].str.contains('-', regex=False))
    

@log_step
def ends_with_en(dataf,column_name='woord'):
    """Add a boolean ``ends_with_en`` column: True where the value in ``column_name`` ends with 'en'.

    Parameters
    ----------
    dataf : pandas.DataFrame
        Frame containing the text column.
    column_name : str, default 'woord'
        Name of the text column to inspect.

    Returns
    -------
    pandas.DataFrame
        A new frame with the extra column; ``dataf`` itself is not modified.
    """
    # ``'en' in x[-2:]`` can only be true when the last two characters are exactly
    # 'en', so a plain suffix test expresses the same check directly.
    return dataf.assign(ends_with_en=dataf[column_name].str.endswith('en'))

@log_step
def ends_with_jes(dataf,column_name='woord'):
    """Add a boolean ``ends_with_jes`` column: True where the value in ``column_name`` ends with 'jes'.

    Parameters
    ----------
    dataf : pandas.DataFrame
        Frame containing the text column.
    column_name : str, default 'woord'
        Name of the text column to inspect.

    Returns
    -------
    pandas.DataFrame
        A new frame with the extra column; ``dataf`` itself is not modified.
    """
    # ``'jes' in x[-3:]`` can only be true when the last three characters are exactly
    # 'jes', so a plain suffix test expresses the same check directly.
    return dataf.assign(ends_with_jes=dataf[column_name].str.endswith('jes'))



def generate_baseline_features(df):
    """Run the baseline feature pipeline on ``df`` and return the resulting frame.

    Steps, in order: keep the 'woord' and 'det' columns, copy the frame, keep
    the rows whose 'det' is 'de' or 'het', then add the ``character_count``,
    ``has_dash``, ``ends_with_en`` and ``ends_with_jes`` columns.
    """
    # Kept as one unbroken chain: no intermediate frame is bound to a name.
    baseline = (
        df
        .pipe(select_columns, columns=['woord', 'det'])
        .pipe(start_pipeline)
        .pipe(select_rows_by_column_values, column='det', values=['de', 'het'])
        .pipe(count_characters)
        .pipe(has_dash)
        .pipe(ends_with_en)
        .pipe(ends_with_jes)
    )
    return baseline

In [None]:
# Build the baseline feature table from `df`; each decorated pipeline step prints one log line.
df_baseline=generate_baseline_features(df)

In [None]:
# Class balance of the target: number of rows per value of 'det'.
df_baseline.det.value_counts()

In [None]:
# Size of the class imbalance (largest class count minus smallest), computed from the
# data instead of from counts typed in by hand so it stays correct if the data changes.
det_counts = df_baseline.det.value_counts()
det_counts.max() - det_counts.min()

The classes are highly imbalanced, so we upsample the minority class (`het`) until it matches the size of the majority class (`de`).

In [None]:
# Separate majority and minority classes
df_majority = df_baseline[df_baseline.det=='de']
df_minority = df_baseline[df_baseline.det=='het']

# Upsample minority class (`resample` is imported in the notebook's import cell)
df_minority_upsampled = resample(df_minority,
                                 replace=True,                # sample with replacement
                                 n_samples=len(df_majority),  # match the majority class size, whatever it is
                                 random_state=123)            # reproducible results

# Combine majority class with upsampled minority class
df_upsampled = pd.concat([df_majority, df_minority_upsampled])

# Display new class counts
df_upsampled.det.value_counts()

In [None]:
# Peek at the combined frame; the first rows come from df_majority because it was concatenated first.
df_upsampled.head()

## Train a simple decision tree

In [None]:
# Extract the name of features. This can be done by removing the column 'woord' and 'det'
feature_names = list(set(df_upsampled.columns.to_list()) - set(['woord','det']))
X = df_upsampled.pipe(select_columns,columns=feature_names)
y = df_upsampled['det']

In [None]:
# max_features='sqrt' is what the former 'auto' alias meant for tree classifiers; 'auto'
# is no longer accepted by current scikit-learn releases.
# NOTE(review): max_leaf_nodes=2 allows only a single split, so max_depth=2 is never
# reached — confirm that a one-split tree is what is intended here.
clf = DecisionTreeClassifier(random_state=0,max_depth=2,max_features='sqrt',max_leaf_nodes=2)

In [None]:
# Cross-validated scores of the classifier over 20 folds.
# NOTE(review): X and y come from df_upsampled, where minority rows were duplicated by
# resampling before any splitting. Copies of the same row can therefore land in both the
# training and the validation part of a fold, so these scores may be optimistic —
# consider resampling inside each training fold instead.
cross_val_score(clf, X, y, cv=20)

In [None]:
# Fit the classifier on all of X and y (the upsampled data).
clf = clf.fit(X, y)

In [None]:
 dot_data = tree.export_graphviz(clf, out_file=None, 
    feature_names=feature_names,  
    class_names=y.unique().tolist(),
    filled=True, rounded=True,  
    special_characters=True)  

In [None]:
# Wrap the DOT source in a graphviz.Source object.
graph = graphviz.Source(dot_data)  

In [None]:
# Last expression of the cell: the notebook displays the graph object.
graph