In [None]:
import numpy as np

In [None]:
import matplotlib.pyplot as plt

In [None]:
import pandas as pd

In [None]:
import math

In [None]:
from sklearn import tree, metrics, svm

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
from sklearn.preprocessing import LabelBinarizer, Imputer

In [None]:
import graphviz

In [None]:
# Path of the CSV file loaded into `df` below.
input_file = "data.csv"

In [None]:
# Load the raw table. The first row holds the column names, and ',' is
# accepted as a thousands separator inside numeric fields.
df = pd.read_csv(
    input_file,
    header=0,
    sep=',',
    thousands=',',
)

In [None]:
# Mean of the `age` column for every `iid`; the result is indexed by `iid`.
ages = df.loc[:, ['iid', 'age']].groupby('iid').mean()

In [None]:
# Build one row per record with the subject's age (sAge), the partner's
# mean age (pAge) and the `match` flag, then plot the mean of `match`
# over the (sAge, pAge) plane.
matches = df[['age', 'pid', 'match']].rename({'age': 'sAge'}, axis='columns')

# Vectorised lookup of the partner's age by `pid`. A missing `pid`, or one
# that has no entry in `ages`, yields NaN and is removed by dropna() below
# instead of raising KeyError.
matches['pAge'] = matches.pid.map(ages.age)
matches = matches.drop(['pid'], axis=1).dropna()

# reduce_C_function=np.mean makes each hexagon show the mean of `match`
# over the rows that fall into it.
matches.plot.hexbin(
    x='sAge',
    y='pAge',
    C='match',
    cmap=plt.cm.cool,
    reduce_C_function=np.mean,
    gridsize=22,
    sharex=False,
    sharey=False,
)

# Save before plt.show().
plt.savefig('plot.pdf', format='pdf')
plt.savefig('plot.png', dpi=400)
plt.show()

In [None]:
# Preprocess data
X = df.copy()

# Rating columns that are multiplied by 10 for waves 6 through 9.
# NOTE(review): presumably those waves recorded these ratings on a 1-10
# scale while the others used a 100-point scale -- confirm against the data
# key, and confirm that every column listed here really needs rescaling.
rating_cols = [
    'attr1_1', 'sinc1_1', 'intel1_1', 'fun1_1', 'amb1_1', 'shar1_1',
    'attr2_1', 'sinc2_1', 'intel2_1', 'fun2_1', 'amb2_1', 'shar2_1',
    'attr3_1', 'sinc3_1', 'intel3_1', 'fun3_1', 'amb3_1',
    'attr4_1', 'sinc4_1', 'intel4_1', 'fun4_1', 'amb4_1', 'shar4_1',
    'attr5_1', 'sinc5_1', 'intel5_1', 'fun5_1', 'amb5_1',
]

# Select waves 6..9 inclusive (both bounds must hold), and write the
# rescaled values back into X so the rescaling actually takes effect.
in_rescaled_waves = X['wave'].between(6, 9)
X.loc[in_rescaled_waves, rating_cols] = X.loc[in_rescaled_waves, rating_cols] * 10

# Remove the columns that must not be used as features, including the
# prediction target `dec`. Rebinding X (rather than inplace=True) keeps the
# statement free of in-place mutation.
X = X.drop(['iid', 'id', 'idg', 'condtn', 'wave', 'round', 'position',
            'positin1', 'order', 'partner', 'pid', 'match',
            'zipcode', # zipcode -> income
            #'undergra', -> {mn_sat, tuition}
            'attr', 'sinc', 'intel', 'fun', 'amb', 'shar', 'like', 'prob',
            'dec',
            'you_call', 'them_cal', 'date_3', 'numdat_3', 'num_in_3',
            ], axis=1)

# Prediction target, taken from the untouched source frame.
Y = df['dec']

# Columns whose gaps are filled with the most frequent value instead of the
# mean (they are treated as class codes rather than quantities).
# TODO: add more classifiable fields
categorical_cols = ('field_cd', 'gender')

# NOTE(review): missing values are filled with pandas here, so this cell no
# longer needs `sklearn.preprocessing.Imputer` (removed from scikit-learn in
# version 0.22).

# Iterate over a snapshot of the labels because X is rebound inside the loop.
for col in list(X.columns):
    column = X[col]
    if column.dtypes == 'object':
        # Object (text) columns are dropped. One-hot encoding them with
        # str.get_dummies was tried and found to be too heavy.
        X = X.drop(col, axis=1)
    elif column.dtypes == 'float64' and column.isnull().any():
        if column.isnull().all():
            # Nothing to impute from: the column carries no information.
            X = X.drop(col, axis=1)
        elif col in categorical_cols:
            # mode() is sorted ascending, so ties resolve to the smallest value.
            X[col] = column.fillna(column.mode().iloc[0])
        else:
            # mean() skips missing values by default.
            X[col] = column.fillna(column.mean())
        

In [None]:
def model(X, y, test_size=0.2, random_state=0, min_samples_split=300):
    """Fit a decision tree on a train/test split and print a short report.

    Parameters
    ----------
    X : feature table accepted by ``train_test_split`` and the classifier.
    y : target values aligned with ``X``; the report assumes two classes.
    test_size : passed to ``train_test_split`` as the held-out share.
    random_state : seed used for both the split and the tree, so repeated
        runs with the same arguments give the same model and report.
    min_samples_split : passed to ``DecisionTreeClassifier``.

    Returns
    -------
    The fitted ``DecisionTreeClassifier``.

    Raises
    ------
    ValueError
        From the tuple unpacking if the confusion matrix is not 2x2, e.g.
        when only one class occurs in the test labels and predictions.
    """
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state)

    clf = tree.DecisionTreeClassifier(min_samples_split=min_samples_split,
                                      random_state=random_state)
    clf = clf.fit(X_train, y_train)

    y_predict = clf.predict(X_test)

    accuracy = metrics.accuracy_score(y_test, y_predict)
    # For a binary problem ravel() yields tn, fp, fn, tp in that order;
    # dividing by the test-set size turns the counts into shares.
    tn, fp, fn, tp = metrics.confusion_matrix(y_test, y_predict).ravel()/len(y_test)

    # The '%' format type multiplies by 100, so the shares print as real
    # percentages. Arguments are ordered to match the labels in the text.
    print("""
    Accuracy: {0:.2%}
    True negatives: {1:.2%}\tFalse negatives: {2:.2%}
    False positives: {3:.2%}\tTrue positives: {4:.2%}
    """.format(accuracy, tn, fn, fp, tp))

    return clf

In [None]:
def vizualize(model, out_file=None, feature_names=None):
    """Build a Graphviz rendering of a fitted decision tree.

    Parameters
    ----------
    model : fitted tree estimator passed to ``tree.export_graphviz``.
    out_file : optional path handed to ``graph.render``; when ``None``
        nothing is written to disk.
    feature_names : optional sequence of feature labels. When ``None`` the
        columns of the notebook-level frame ``X`` are used.

    Returns
    -------
    The ``graphviz.Source`` object.
    """
    if feature_names is None:
        # Fall back to the notebook-level feature frame.
        feature_names = X.columns

    # A plain list is passed because some scikit-learn releases reject a
    # pandas Index for `feature_names`.
    dot_source = tree.export_graphviz(model, out_file=None,
                                      feature_names=list(feature_names),
                                      filled=True, rounded=True,
                                      special_characters=True)
    graph = graphviz.Source(dot_source)

    if out_file is not None:
        graph.render(out_file)

    return graph

In [None]:
# Tree for the rows with gender == 1, holding out 20% of them for testing.
# NOTE(review): the name says "fem", but the Speed Dating data key appears to
# code Female=0 / Male=1 -- confirm that gender == 1 is the intended subset.
fem_model_80 = model(X[X.gender == 1], Y[X.gender == 1], test_size=0.2, min_samples_split=300)

# A single call renders the tree to `out_file` and returns the graph, which
# the notebook displays as the cell's last expression.
vizualize(fem_model_80, out_file="fem_80")

In [None]:
# Tree for the rows with gender == 1, holding out 80% of them for testing.
# NOTE(review): the name says "fem", but the Speed Dating data key appears to
# code Female=0 / Male=1 -- confirm that gender == 1 is the intended subset.
fem_model_20 = model(X[X.gender == 1], Y[X.gender == 1], test_size=0.8, min_samples_split=150)

# A single call renders the tree to `out_file` and returns the graph, which
# the notebook displays as the cell's last expression.
vizualize(fem_model_20, out_file="fem_20")

In [None]:
# Tree for the rows with gender == 0, holding out 20% of them for testing.
# NOTE(review): the name says "man", but the Speed Dating data key appears to
# code Female=0 / Male=1 -- confirm that gender == 0 is the intended subset.
man_model_80 = model(X[X.gender == 0], Y[X.gender == 0], test_size=0.2, min_samples_split=300)

# A single call renders the tree to `out_file` and returns the graph, which
# the notebook displays as the cell's last expression.
vizualize(man_model_80, out_file="man_80")

In [None]:
# Tree fitted on all rows, holding out 20% of them for testing.
uni_model_80 = model(X, Y, test_size=0.2, min_samples_split=420)

# A single call renders the tree to `out_file` and returns the graph, which
# the notebook displays as the cell's last expression.
vizualize(uni_model_80, out_file="uni_80")