In [75]:
import pandas as pd
from sklearn.model_selection import train_test_split

This notebook gets all the datasets ready for comparison. For each dataset it cleans up any unwanted columns, creates 5 different train/test splits, and saves the features and the target variable of each split to separate files.

## Compas

In [76]:
# Read the two pre-split COMPAS files; they are recombined and re-split below.
compas_train, compas_test = (
    pd.read_csv(path)
    for path in ('./data/compas_train.csv', './data/compas_test.csv')
)

In [77]:
# Show (rows, columns) of the COMPAS training file as loaded.
compas_train.shape

(4222, 12)

In [78]:
# Show (rows, columns) of the COMPAS test file as loaded.
compas_test.shape

(528, 12)

In [79]:
# Stack the train and test files back into a single frame so that this
# notebook can draw its own train/test splits from the full data.
# pd.concat is used instead of DataFrame.append, which was deprecated in
# pandas 1.4 and removed in pandas 2.0.
compas = pd.concat([compas_train, compas_test])
# 'Unnamed: 0' is not a feature (presumably a row index written out by an
# earlier to_csv call -- confirm against the raw files), so remove it.
compas = compas.drop(columns='Unnamed: 0')
compas.head(5)

Unnamed: 0,sex,race,age_cat=25 to 45,age_cat=Greater than 45,age_cat=Less than 25,priors_count=0,priors_count=1 to 3,priors_count=More than 3,c_charge_degree=F,c_charge_degree=M,two_year_recid
0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0
1,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0
2,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
3,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0
4,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0


In [80]:
# Separate the target column from the features without mutating `compas`.
# Selecting and dropping (rather than popping in place) keeps this cell
# idempotent: it can be re-run without raising KeyError on the missing column.
compas_y = compas['two_year_recid']
compas_X = compas.drop(columns='two_year_recid')

In [81]:
# Build five train/test splits of the COMPAS data. Each split holds out 20%
# of the rows and uses the loop counter as random_state, so the five splits
# differ from one another but are reproducible across runs.
datasets = []
for seed in range(5):
    split = train_test_split(compas_X, compas_y, test_size=0.2, random_state=seed)
    compas_X_train, compas_X_test, compas_y_train, compas_y_test = split
    # Stored order: [X_train, X_test, y_train, y_test]
    datasets.append(list(split))

In [82]:
# Write every COMPAS split to disk. Files are numbered 1-5 (split position
# in `datasets` plus one) and the row index is not written.
for num, (X_train, X_test, y_train, y_test) in enumerate(datasets, start=1):
    outputs = (
        (X_train, './data/processed/compas/compas_train{}_X.csv'),
        (y_train, './data/processed/compas/compas_train{}_y.csv'),
        (X_test, './data/processed/compas/compas_test{}_X.csv'),
        (y_test, './data/processed/compas/compas_test{}_y.csv'),
    )
    for frame, template in outputs:
        frame.to_csv(template.format(num), index=False)

In [83]:
# Double-check: reload the first saved COMPAS split and print the shape of
# each file to confirm the features and targets line up row-for-row.
X_train, y_train, X_test, y_test = (
    pd.read_csv(path)
    for path in (
        './data/processed/compas/compas_train1_X.csv',
        './data/processed/compas/compas_train1_y.csv',
        './data/processed/compas/compas_test1_X.csv',
        './data/processed/compas/compas_test1_y.csv',
    )
)
for frame in (X_train, y_train, X_test, y_test):
    print(frame.shape)

(3800, 10)
(3800, 1)
(950, 10)
(950, 1)


## Adult

In [85]:
# Read the Adult features and target, which are stored in separate files.
adult_X, adult_y = (
    pd.read_csv(path)
    for path in ('./data/adult_X.csv', './data/adult_Y.csv')
)

In [86]:
# Build five train/test splits of the Adult data. Each split holds out 20%
# of the rows and uses the loop counter as random_state, so the five splits
# differ from one another but are reproducible across runs.
datasets = []
for seed in range(5):
    split = train_test_split(adult_X, adult_y, test_size=0.2, random_state=seed)
    adult_X_train, adult_X_test, adult_y_train, adult_y_test = split
    # Stored order: [X_train, X_test, y_train, y_test]
    datasets.append(list(split))

In [87]:
# Write every Adult split to disk. Files are numbered 1-5 (split position
# in `datasets` plus one) and the row index is not written.
for num, (X_train, X_test, y_train, y_test) in enumerate(datasets, start=1):
    outputs = (
        (X_train, './data/processed/adult/adult_train{}_X.csv'),
        (y_train, './data/processed/adult/adult_train{}_y.csv'),
        (X_test, './data/processed/adult/adult_test{}_X.csv'),
        (y_test, './data/processed/adult/adult_test{}_y.csv'),
    )
    for frame, template in outputs:
        frame.to_csv(template.format(num), index=False)

In [95]:
# Double-check: reload the first saved Adult split and print the shape of
# each file to confirm the features and targets line up row-for-row.
X_train, y_train, X_test, y_test = (
    pd.read_csv(path)
    for path in (
        './data/processed/adult/adult_train1_X.csv',
        './data/processed/adult/adult_train1_y.csv',
        './data/processed/adult/adult_test1_X.csv',
        './data/processed/adult/adult_test1_y.csv',
    )
)
for frame in (X_train, y_train, X_test, y_test):
    print(frame.shape)

(1616, 98)
(1616, 1)
(404, 98)
(404, 1)


## Communities

In [98]:
# Read the Communities features and target, which are stored in separate files.
communities_X, communities_y = (
    pd.read_csv(path)
    for path in ('./data/communities_X.csv', './data/communities_Y.csv')
)

In [99]:
# Build five train/test splits of the Communities data. Each split holds out
# 20% of the rows and uses the loop counter as random_state, so the five
# splits differ from one another but are reproducible across runs.
datasets = []
for seed in range(5):
    split = train_test_split(communities_X, communities_y, test_size=0.2, random_state=seed)
    communities_X_train, communities_X_test, communities_y_train, communities_y_test = split
    # Stored order: [X_train, X_test, y_train, y_test]
    datasets.append(list(split))

In [100]:
# Write every Communities split to disk. Files are numbered 1-5 (split
# position in `datasets` plus one) and the row index is not written.
for num, (X_train, X_test, y_train, y_test) in enumerate(datasets, start=1):
    outputs = (
        (X_train, './data/processed/communities/communities_train{}_X.csv'),
        (y_train, './data/processed/communities/communities_train{}_y.csv'),
        (X_test, './data/processed/communities/communities_test{}_X.csv'),
        (y_test, './data/processed/communities/communities_test{}_y.csv'),
    )
    for frame, template in outputs:
        frame.to_csv(template.format(num), index=False)

In [102]:
# Double-check: reload the first saved Communities split and print the shape
# of each file to confirm the features and targets line up row-for-row.
X_train, y_train, X_test, y_test = (
    pd.read_csv(path)
    for path in (
        './data/processed/communities/communities_train1_X.csv',
        './data/processed/communities/communities_train1_y.csv',
        './data/processed/communities/communities_test1_X.csv',
        './data/processed/communities/communities_test1_y.csv',
    )
)
for frame in (X_train, y_train, X_test, y_test):
    print(frame.shape)

(1595, 123)
(1595, 1)
(399, 123)
(399, 1)


## Law School

In [103]:
# Read the Law School features and target, which are stored in separate files.
lawschool_X, lawschool_y = (
    pd.read_csv(path)
    for path in ('./data/lawschool_X.csv', './data/lawschool_y.csv')
)

In [104]:
# Build five train/test splits of the Law School data. Each split holds out
# 20% of the rows and uses the loop counter as random_state, so the five
# splits differ from one another but are reproducible across runs.
datasets = []
for seed in range(5):
    split = train_test_split(lawschool_X, lawschool_y, test_size=0.2, random_state=seed)
    lawschool_X_train, lawschool_X_test, lawschool_y_train, lawschool_y_test = split
    # Stored order: [X_train, X_test, y_train, y_test]
    datasets.append(list(split))

In [105]:
# Write every Law School split to disk. Files are numbered 1-5 (split
# position in `datasets` plus one) and the row index is not written.
for num, (X_train, X_test, y_train, y_test) in enumerate(datasets, start=1):
    outputs = (
        (X_train, './data/processed/lawschool/lawschool_train{}_X.csv'),
        (y_train, './data/processed/lawschool/lawschool_train{}_y.csv'),
        (X_test, './data/processed/lawschool/lawschool_test{}_X.csv'),
        (y_test, './data/processed/lawschool/lawschool_test{}_y.csv'),
    )
    for frame, template in outputs:
        frame.to_csv(template.format(num), index=False)

In [106]:
# Double-check: reload the first saved Law School split and print the shape
# of each file to confirm the features and targets line up row-for-row.
X_train, y_train, X_test, y_test = (
    pd.read_csv(path)
    for path in (
        './data/processed/lawschool/lawschool_train1_X.csv',
        './data/processed/lawschool/lawschool_train1_y.csv',
        './data/processed/lawschool/lawschool_test1_X.csv',
        './data/processed/lawschool/lawschool_test1_y.csv',
    )
)
for frame in (X_train, y_train, X_test, y_test):
    print(frame.shape)

(1458, 17)
(1458, 1)
(365, 17)
(365, 1)
