In [96]:
import os

import pandas as pd
from sklearn.model_selection import train_test_split

Notebook to get all the datasets ready for comparison. Splits each dataset into 5 different train/test splits, cleans up any unwanted columns, and saves the features and the target variable of each split as separate CSV files.

## Compas

In [97]:
# Read the two pre-split COMPAS files; they are recombined further down.
compas_train, compas_test = (
    pd.read_csv(path)
    for path in ('./data/compas_train.csv', './data/compas_test.csv')
)

In [98]:
# (rows, columns) of the frame read from compas_train.csv.
compas_train.shape

(4222, 12)

In [99]:
# (rows, columns) of the frame read from compas_test.csv.
compas_test.shape

(528, 12)

In [100]:
# Recombine the pre-split files into a single frame so that fresh splits can be
# drawn from it. pd.concat replaces DataFrame.append, which was deprecated in
# pandas 1.4 and removed in 2.0; like append, it keeps the original row labels.
# 'Unnamed: 0' is then removed as an unwanted column
# (presumably a row index saved into the CSV -- TODO confirm).
compas = pd.concat([compas_train, compas_test]).drop(columns='Unnamed: 0')

In [101]:
# Draw a reproducible subsample of 2000 rows without replacement (fixed seed 42).
compas_sampled = compas.sample(n=2000, replace=False, random_state=42)

In [102]:
# Separate the target column from the features without mutating compas_sampled.
# The previous DataFrame.pop call removed the column in place, so re-running the
# cell raised KeyError and compas_X was merely an alias of compas_sampled.
compas_y = compas_sampled['two_year_recid']
compas_X = compas_sampled.drop(columns='two_year_recid')

In [105]:
# Build five 80/20 train/test partitions of the sampled COMPAS data. The loop
# counter is used as the random seed, so each partition is reproducible.
datasets = []
for seed in range(5):
    split = train_test_split(compas_X, compas_y, test_size=0.2, random_state=seed)
    compas_X_train, compas_X_test, compas_y_train, compas_y_test = split
    datasets.append(list(split))

In [106]:
# Write the five COMPAS splits to disk as compas_{train,test}{1..5}_{X,y}.csv.
# NOTE: `datasets` is rebound by every dataset section of this notebook, so the
# COMPAS split cell must be the most recently executed split cell before this runs.
# to_csv does not create missing parent directories, so make sure the target exists.
os.makedirs('./data/processed/compas', exist_ok=True)
for num, (X_train, X_test, y_train, y_test) in enumerate(datasets, start=1):
    # index=False keeps the row labels out of the files; only data columns are written.
    X_train.to_csv('./data/processed/compas/compas_train{}_X.csv'.format(num), index=False)
    y_train.to_csv('./data/processed/compas/compas_train{}_y.csv'.format(num), index=False)
    X_test.to_csv('./data/processed/compas/compas_test{}_X.csv'.format(num), index=False)
    y_test.to_csv('./data/processed/compas/compas_test{}_y.csv'.format(num), index=False)

In [107]:
# Double-check: reload split 1 from disk and print the shape of each file.
X_train, y_train, X_test, y_test = (
    pd.read_csv(path)
    for path in (
        './data/processed/compas/compas_train1_X.csv',
        './data/processed/compas/compas_train1_y.csv',
        './data/processed/compas/compas_test1_X.csv',
        './data/processed/compas/compas_test1_y.csv',
    )
)
for frame in (X_train, y_train, X_test, y_test):
    print(frame.shape)

(1600, 10)
(1600, 1)
(400, 10)
(400, 1)


## Adult

In [2]:
# Read the Adult feature and target files (stored separately on disk).
adult_X, adult_y = (
    pd.read_csv(path)
    for path in ('./data/adult_X.csv', './data/adult_Y.csv')
)

In [3]:
# Build five 80/20 train/test partitions of the Adult data. The loop counter is
# used as the random seed, so each partition is reproducible.
datasets = []
for seed in range(5):
    split = train_test_split(adult_X, adult_y, test_size=0.2, random_state=seed)
    adult_X_train, adult_X_test, adult_y_train, adult_y_test = split
    datasets.append(list(split))

In [87]:
# Write the five Adult splits to disk as adult_{train,test}{1..5}_{X,y}.csv.
# NOTE: `datasets` is rebound by every dataset section of this notebook, so the
# Adult split cell must be the most recently executed split cell before this runs.
# to_csv does not create missing parent directories, so make sure the target exists.
os.makedirs('./data/processed/adult', exist_ok=True)
for num, (X_train, X_test, y_train, y_test) in enumerate(datasets, start=1):
    # index=False keeps the row labels out of the files; only data columns are written.
    X_train.to_csv('./data/processed/adult/adult_train{}_X.csv'.format(num), index=False)
    y_train.to_csv('./data/processed/adult/adult_train{}_y.csv'.format(num), index=False)
    X_test.to_csv('./data/processed/adult/adult_test{}_X.csv'.format(num), index=False)
    y_test.to_csv('./data/processed/adult/adult_test{}_y.csv'.format(num), index=False)

In [95]:
# Double-check: reload split 1 from disk and print the shape of each file.
X_train, y_train, X_test, y_test = (
    pd.read_csv(path)
    for path in (
        './data/processed/adult/adult_train1_X.csv',
        './data/processed/adult/adult_train1_y.csv',
        './data/processed/adult/adult_test1_X.csv',
        './data/processed/adult/adult_test1_y.csv',
    )
)
for frame in (X_train, y_train, X_test, y_test):
    print(frame.shape)

(1616, 98)
(1616, 1)
(404, 98)
(404, 1)


## Communities

In [4]:
# Read the Communities feature and target files (stored separately on disk).
communities_X, communities_y = (
    pd.read_csv(path)
    for path in ('./data/communities_X.csv', './data/communities_Y.csv')
)

In [5]:
# Build five 80/20 train/test partitions of the Communities data. The loop
# counter is used as the random seed, so each partition is reproducible.
datasets = []
for seed in range(5):
    split = train_test_split(communities_X, communities_y, test_size=0.2, random_state=seed)
    communities_X_train, communities_X_test, communities_y_train, communities_y_test = split
    datasets.append(list(split))

In [6]:
# Write the five Communities splits to disk as
# communities_{train,test}{1..5}_{X,y}.csv.
# NOTE: `datasets` is rebound by every dataset section of this notebook, so the
# Communities split cell must be the most recently executed split cell before this runs.
# to_csv does not create missing parent directories, so make sure the target exists.
os.makedirs('./data/processed/communities', exist_ok=True)
for num, (X_train, X_test, y_train, y_test) in enumerate(datasets, start=1):
    # index=False keeps the row labels out of the files; only data columns are written.
    X_train.to_csv('./data/processed/communities/communities_train{}_X.csv'.format(num), index=False)
    y_train.to_csv('./data/processed/communities/communities_train{}_y.csv'.format(num), index=False)
    X_test.to_csv('./data/processed/communities/communities_test{}_X.csv'.format(num), index=False)
    y_test.to_csv('./data/processed/communities/communities_test{}_y.csv'.format(num), index=False)

In [7]:
# Double-check: reload split 1 from disk and print the shape of each file.
X_train, y_train, X_test, y_test = (
    pd.read_csv(path)
    for path in (
        './data/processed/communities/communities_train1_X.csv',
        './data/processed/communities/communities_train1_y.csv',
        './data/processed/communities/communities_test1_X.csv',
        './data/processed/communities/communities_test1_y.csv',
    )
)
for frame in (X_train, y_train, X_test, y_test):
    print(frame.shape)

(1595, 123)
(1595, 1)
(399, 123)
(399, 1)


## Law School

In [8]:
# Read the Law School feature and target files (stored separately on disk).
lawschool_X, lawschool_y = (
    pd.read_csv(path)
    for path in ('./data/lawschool_X.csv', './data/lawschool_y.csv')
)

In [9]:
# Build five 80/20 train/test partitions of the Law School data. The loop
# counter is used as the random seed, so each partition is reproducible.
datasets = []
for seed in range(5):
    split = train_test_split(lawschool_X, lawschool_y, test_size=0.2, random_state=seed)
    lawschool_X_train, lawschool_X_test, lawschool_y_train, lawschool_y_test = split
    datasets.append(list(split))

In [10]:
# Write the five Law School splits to disk as
# lawschool_{train,test}{1..5}_{X,y}.csv.
# NOTE: `datasets` is rebound by every dataset section of this notebook, so the
# Law School split cell must be the most recently executed split cell before this runs.
# to_csv does not create missing parent directories, so make sure the target exists.
os.makedirs('./data/processed/lawschool', exist_ok=True)
for num, (X_train, X_test, y_train, y_test) in enumerate(datasets, start=1):
    # index=False keeps the row labels out of the files; only data columns are written.
    X_train.to_csv('./data/processed/lawschool/lawschool_train{}_X.csv'.format(num), index=False)
    y_train.to_csv('./data/processed/lawschool/lawschool_train{}_y.csv'.format(num), index=False)
    X_test.to_csv('./data/processed/lawschool/lawschool_test{}_X.csv'.format(num), index=False)
    y_test.to_csv('./data/processed/lawschool/lawschool_test{}_y.csv'.format(num), index=False)

In [11]:
# Double-check: reload split 1 from disk and print the shape of each file.
X_train, y_train, X_test, y_test = (
    pd.read_csv(path)
    for path in (
        './data/processed/lawschool/lawschool_train1_X.csv',
        './data/processed/lawschool/lawschool_train1_y.csv',
        './data/processed/lawschool/lawschool_test1_X.csv',
        './data/processed/lawschool/lawschool_test1_y.csv',
    )
)
for frame in (X_train, y_train, X_test, y_test):
    print(frame.shape)

(1458, 17)
(1458, 1)
(365, 17)
(365, 1)


In [12]:
# Preview the first rows of the Law School features. The original cell used
# `laws`, a name that is not defined anywhere in this notebook and therefore
# raises NameError on a fresh kernel. lawschool_X is the frame loaded from
# ./data/lawschool_X.csv (NOTE(review): assumed to be what `laws` held -- confirm).
lawschool_X.head(5)

Unnamed: 0,cluster,lsat,ugpa,zfygpa,zgpa,fulltime,fam_inc,age,gender,race1,race2,race3,race4,race5,race6,race7,race8
0,0.167522,0.192135,-0.067494,-0.500121,-0.685302,-0.321731,0.702957,-0.229262,-0.908199,-0.084749,-0.225212,-0.357902,-0.155426,-0.110523,-0.189197,0.58094,-0.115502
1,-1.322194,-0.09444,-0.746029,0.071791,-1.151823,-0.321731,0.702957,-0.053729,-0.908199,-0.084749,-0.225212,-0.357902,-0.155426,-0.110523,-0.189197,0.58094,-0.115502
2,0.167522,-0.145012,1.289576,0.217368,0.131111,-0.321731,0.702957,-0.229262,1.101081,-0.084749,-0.225212,-0.357902,-0.155426,-0.110523,-0.189197,0.58094,-0.115502
3,0.91238,2.215014,1.289576,0.498125,0.306056,-0.321731,0.702957,-0.58033,1.101081,-0.084749,4.44026,-0.357902,-0.155426,-0.110523,-0.189197,-1.721349,-0.115502
4,0.167522,0.192135,-0.293672,-0.864065,-0.238219,-0.321731,0.702957,1.175006,-0.908199,-0.084749,-0.225212,-0.357902,-0.155426,-0.110523,-0.189197,0.58094,-0.115502
