In [2]:
import pandas as pd
import numpy as np
import random
import statistics
import math
from dateutil import parser
from datetime import datetime
from functools import reduce
import random
from random import choices
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score

In [100]:

def normalize(cov):
    cols = []
    
    if isinstance(cov, pd.DataFrame):
        cols = cov.columns
    else:
        cols = list(cov.axes[0])
        
    for col in cols:
        if col != 'cov:c':
            cov[col] = cov[col]/cov['cov:c']
    cov['cov:c'] = 1
    return cov

class agg_dataset:
    def set_meta(self, data, X, y, dimensions, name):
        self.data = data
        self.dimensions = dimensions
        self.y = y
        self.X =  [self.y] + X
        self.name = name
        self.datasets = set()
        self.datasets.add(self.name)
        
    def load_buyer(self, data, X, y, dimensions, name):
        self.data = data
        self.dimensions = dimensions
        self.X = X
        self.y = y
        self.name = name
        
        # don't impute y
        self.to_numeric(self.y, False)
        for x in self.X:
            self.to_numeric(x, False)
        
        self.X =  [self.y] + self.X
        
        dedup_dimensions = set()
        for d in dimensions:
            if isinstance(d, list):
                dedup_dimensions.update(d)
            else:
                dedup_dimensions.add(d)
        dedup_dimensions = list(dedup_dimensions)
        
        # project out attributes except x, y, dim
        self.data = self.data[self.X  + dedup_dimensions]
        
        self.datasets = set()
        self.datasets.add(self.name)
        
    
    def compute_agg(self, norm = False):
        self.lift(self.name, self.X)
        
        self.agg_dimensions = dict()
        
        for d in self.dimensions:
            if isinstance(d, list):
                self.agg_dimensions[tuple(d)] = self.data[list(filter(lambda col: col.startswith("cov:"), self.data.columns)) + d].groupby(d).sum()
            else:
                self.agg_dimensions[d] = self.data[list(filter(lambda col: col.startswith("cov:"), self.data.columns)) + [d]].groupby(d).sum()
            
        if norm:
            for d in self.agg_dimensions.keys():
                self.agg_dimensions[d] = normalize(self.agg_dimensions[d])
            
        self.covariance = normalize(self.data[list(filter(lambda col: col.startswith("cov:"), self.data.columns))].sum())
        
        self.X = [self.name + ':' + x for x in self.X]
    
    def load_seller(self, data, dimensions, name, feature_transform = True):
        self.data = data
        self.dimensions = dimensions
        self.name = name
        
                
        dedup_dimensions = set()
        for d in dimensions:
            if isinstance(d, list):
                dedup_dimensions.update(d)
            else:
                dedup_dimensions.add(d)
        dedup_dimensions = list(dedup_dimensions)
        
        
        # find numeric attributes as features
        atts = []
        for att in self.data.columns:
            if att in dedup_dimensions:
                continue
            cond, col = self.is_numeric(att, 0.3, 2)
            if cond:
                self.data[att] = col
                atts.append(att)
                if feature_transform:
                    self.data["log" + att] = np.log(self.data[att])
                    self.data["sq" + att] = np.square(self.data[att])
                    self.data["cbr" + att] = np.cbrt(self.data[att])

                    atts.append("log" + att)
                    atts.append("sq" + att)
                    atts.append("cbr" + att)

        self.X = atts

        
        # project out attributes except x, y, dim
        self.data = self.data[self.X + dedup_dimensions]

    def is_numeric(self, att, impute_rate, cardinality):
        col = pd.to_numeric(self.data[att],errors="coerce")
        nan_count = sum(np.isnan(col))
        unique_count = len(col.unique())
        if nan_count/len(self.data) < impute_rate and unique_count/len(self.data) < cardinality:
            mean_value = col.mean()
            col.fillna(value=mean_value, inplace=True)
            return True, col
        else:
            return False, col
    
    def to_numeric(self, att, impute=True, impute_rate = 1):
        # parse attribute to numeric
        self.data[att] = pd.to_numeric(self.data[att],errors="coerce")
        # count the number of nan
        nan_count = sum(np.isnan(self.data[att]))
        
        if impute:
            # impute error only if missing rate is not above threshold
            if nan_count/len(self.data) < impute_rate:
                mean_value=self.data[att].mean()
                self.data[att].fillna(value=mean_value, inplace=True)
                return True
            else:
                return False
        else:
            # else, remove records with missing value
            self.data = self.data[~np.isnan(self.data[att])]
    
    def lift(self, tablename, attributes):
        self.data['cov:c'] = 1

        for i in range(len(attributes)):
            for j in range(i, len(attributes)):
                self.data['cov:Q:' + tablename + ":" + attributes[i] + ","+ tablename + ":" + attributes[j]] = self.data[attributes[i]] * self.data[attributes[j]]

        for attribute in attributes:
            self.data= self.data.rename(columns = {attribute:'cov:s:' + tablename + ":" + attribute})
    
    def absorb(self, agg_data, dimension, attrs):
        
        if agg_data.name in self.datasets:
            print("already absorbed this data")
            return
        
        self.data = connect(self, agg_data, dimension, True, attrs)
        
        for d in self.dimensions:
            if isinstance(d, list):
                self.agg_dimensions[tuple(d)] = self.data[list(filter(lambda col: col.startswith("cov:"), self.data.columns)) + d].groupby(d).sum()
            else:
                self.agg_dimensions[d] = self.data[list(filter(lambda col: col.startswith("cov:"), self.data.columns)) + [d]].groupby(d).sum()
            
        self.covariance = normalize(self.data[list(filter(lambda col: col.startswith("cov:"), self.data.columns))].sum())
        
        self.X = self.X + attrs
        self.datasets.add(agg_data.name)
    
# return the coefficients of features and a constant 
def linear_regression(cov_matrix, features, result):
    a = np.empty([len(features) + 1, len(features) + 1])
    b = np.empty(len(features) + 1)
    
    for i in range(len(features)):
        for j in range(len(features)):
            if 'cov:Q:' + features[i] + ","+ features[j] in cov_matrix:
                a[i][j] = cov_matrix['cov:Q:' + features[i] + ","+ features[j]]
            else:
                a[i][j] = cov_matrix['cov:Q:' + features[j] + ","+ features[i]]
    
    for i in range(len(features)):
        a[i][len(features)] = cov_matrix['cov:s:' + features[i]]
        a[len(features)][i] = cov_matrix['cov:s:' + features[i]]
        if 'cov:Q:' + result + "," + features[i] in cov_matrix:
            b[i] = cov_matrix['cov:Q:' + result + "," + features[i]]
        else:
            b[i] = cov_matrix['cov:Q:' + features[i] + "," + result]
    
    b[len(features)] = cov_matrix['cov:s:' + result]
    
    a[len(features)][len(features)] = cov_matrix['cov:c']
#     print(a,b)
    return np.linalg.solve(a, b)

def square_error(cov_matrix, features, result, parameter):
    se = cov_matrix['cov:Q:'  + result + "," + result]
    
#     print(se)
    for i in range(len(features)):
        for j in range(len(features)):
            if 'cov:Q:'  + features[i] + "," + features[j] in cov_matrix:
                se += parameter[i] * parameter[j] * cov_matrix['cov:Q:'  + features[i] + "," + features[j]]
            else:    
                se += parameter[j] * parameter[i] * cov_matrix['cov:Q:'  + features[j] + "," + features[i]]
#             print(se, 'cov:Q:'  + features[i] + "," + features[j])
   
    
    for i in range(len(features)):
        se += 2 * parameter[i] * parameter[-1] * cov_matrix['cov:s:'  + features[i]]
        if 'cov:Q:' + result + "," + features[i] in cov_matrix:
            se -= 2 * parameter[i] *  cov_matrix['cov:Q:' + result + "," + features[i]]
        else:
            se -= 2 * parameter[i] *  cov_matrix['cov:Q:' + features[i] + "," + result]
    
#     print(se)
    se -= 2 * parameter[-1] * cov_matrix['cov:s:'  + result]
    se += cov_matrix['cov:c'] * parameter[-1] * parameter[-1]

    return se

def total_sum_of_square(cov_matrix, result):
    return cov_matrix['cov:Q:'  + result + "," + result] - cov_matrix['cov:s:'  + result] * cov_matrix['cov:s:'  + result] / cov_matrix['cov:c']

def mean_squared_error(cov_matrix, features, result, parameter):
    return square_error(cov_matrix, features, result, parameter)/cov_matrix['cov:c']


def r2(cov_matrix, features, result, parameter):
    result =  1 - square_error(cov_matrix, features, result, parameter)/total_sum_of_square(cov_matrix, result)
    if result > 1:
        # overflow
        return -1
    return result

def adjusted_r2(cov_matrix, features, result, parameter):
    return 1 - (cov_matrix['cov:c']-1)*(1 - r2(cov_matrix, features, result, parameter))/(cov_matrix['cov:c'] - len(parameter) - 1)

def connect(aggdata1, aggdata2, dimension, left_inp = False, right_attrs = []):
    
    if isinstance(dimension, list):
        dimension = tuple(dimension)
    
    if left_inp:
        agg1 = aggdata1.data
    else:
        agg1 = aggdata1.agg_dimensions[dimension]
        
    agg2 = aggdata2.agg_dimensions[dimension]
    
    left_attributes = aggdata1.X
    left_tablename = aggdata1.name
    right_attributes = aggdata2.X
    right_tablename = aggdata2.name
    
    if len(right_attrs) > 0:
        kept_cols = []
        for col in agg2.columns:
            names = col[6:].split(",")
            match = True
            for name in names:
                if name not in right_attrs:
                    match = False
            if match:
                kept_cols.append(col)
        agg2 = agg2[kept_cols + ['cov:c']]
        right_attributes = right_attrs
    
    
    if left_inp:
        join = pd.merge(agg1, agg2, how='left', left_on=dimension, right_index=True)
    else:
        join = pd.merge(agg1, agg2, how='left', left_index=True, right_index=True)
#         join = pd.merge(agg1, agg2, how='inner', left_index=True, right_index=True)
    join = join.drop('cov:c_y', 1)
    join = join.rename(columns = {'cov:c_x':'cov:c'})
    
    right_cov = aggdata2.covariance

    
    # fill in nan
    for att2 in right_attributes:
        join['cov:s:' + att2].fillna(value=right_cov['cov:s:' + att2], inplace=True)
        join['cov:s:' + att2] *= join['cov:c']
    
#     for i in range(len(right_attributes)):
#         for j in range(i, len(right_attributes)):
#             if 'cov:Q:' + right_attributes[i] + "," + right_attributes[j] in join:
#                 join['cov:Q:' + right_attributes[i] + "," + right_attributes[j]].fillna(value=right_cov['cov:Q:' + right_attributes[i] + "," + right_attributes[j]], inplace=True)
#                 join['cov:Q:' + right_attributes[i] + "," + right_attributes[j]] *= join['cov:c']
#             else:
#                 join['cov:Q:' + right_attributes[j] + "," + right_attributes[i]].fillna(value=right_cov['cov:Q:' + right_attributes[j] + "," + right_attributes[i]], inplace=True)
#                 join['cov:Q:' + right_attributes[j] + "," + right_attributes[i]] *= join['cov:c']
            
    
    
#     for att1 in left_attributes:
#         for att2 in right_attributes:
#             if 'cov:Q:' + att1 + "," + att2 in join:
#                 join['cov:Q:' + att1 + "," + att2] = join['cov:s:' + att1] * join['cov:s:' + att2]/join['cov:c']
#             else:
#                 join['cov:Q:' + att2 + "," + att1] = join['cov:s:' + att2] * join['cov:s:' + att1]/join['cov:c']
    
    
    return join

def select_features(train, test, seller, dimension, k):
    join_test = connect(test, seller, dimension)
    join_train = connect(train, seller, dimension)

    cur_atts = []
    join_train_cov = join_train.sum()
    join_test_cov = join_test.sum()
    final_r2 = 0
    
    for i in range(k):
        best_r2 = 0
        best_att = -1
        for att in train.X + seller.X:
            if att in cur_atts or att == train.name + ":" + train.y:
                continue
            # maybe singular
            try:
                parameter = linear_regression(join_train_cov, cur_atts + [att], train.name + ":" + train.y)
            except:
                continue
            cur_r2 = r2(join_test_cov, cur_atts + [att], train.name + ":" + train.y, parameter)
    #         print(cur_r2, att)
            if cur_r2 > best_r2:
                best_r2 = cur_r2
                best_att = att
        if best_r2 == 0 or best_r2 < final_r2:
            break
        cur_atts = cur_atts + [best_att]
        final_r2 = best_r2
        print(i, best_r2, cur_atts)
    return cur_atts, final_r2

In [44]:
gender = pd.read_csv("gender.csv")
gender_train = agg_dataset()
gender_test = agg_dataset()
gender_train.load_buyer(gender, ["Number Tested"], "Mean Scale Score", ["DBN", ["DBN","Grade"], "School Name", "Year", "Category"], "gender")
msk = np.random.rand(len(gender_train.data)) < 0.8
gender_test.set_meta(gender_train.data[~msk], ["Number Tested"], "Mean Scale Score", ["DBN", ["DBN","Grade"], "School Name", "Year", "Category"], "gender")
gender_train.data = gender_train.data[msk]
gender_test.compute_agg()
gender_train.compute_agg()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [42]:
crime = pd.read_csv("crime.csv")
crimedata = agg_dataset()
crimedata.load_seller(crime, ["DBN"], "crime")
crimedata.compute_agg(True)

esl = pd.read_csv("esl.csv")
esldata = agg_dataset()
esldata.load_seller(esl, [["DBN","Grade"]], "esl")
esldata.compute_agg(True)

ap = pd.read_csv("ap.csv")
apdata = agg_dataset()
apdata.load_seller(ap, ["DBN"], "ap")
apdata.compute_agg(True)

survey = pd.read_csv("2013_NYC_School_Survey.csv")
surveydata = agg_dataset()
surveydata.load_seller(survey, ["DBN"], "survey")
surveydata.compute_agg(True)

base = pd.read_csv("base.csv")
basedata = agg_dataset()
basedata.load_seller(base, ["DBN"], "base")
basedata.compute_agg(True)

disc = pd.read_csv("disc.csv")
discdata = agg_dataset()
discdata.load_seller(disc, ["DBN"], "disc")
discdata.compute_agg(True)

math = pd.read_csv("math.csv")
mathdata = agg_dataset()
mathdata.load_seller(math, [["DBN","Grade"]], "math")
mathdata.compute_agg(True)

oss = pd.read_csv("oss.csv")
ossdata = agg_dataset()
ossdata.load_seller(oss, ["DBN"], "oss")
ossdata.compute_agg(True)

pe = pd.read_csv("pe.csv")
pedata = agg_dataset()
pedata.load_seller(pe, ["DBN"], "pe")
pedata.compute_agg(True)

s2tr = pd.read_csv("s2tr.csv")
s2trdata = agg_dataset()
s2trdata.load_seller(s2tr, ["DBN"], "s2tr")
s2trdata.compute_agg(True)

sat = pd.read_csv("sat.csv")
satdata = agg_dataset()
satdata.load_seller(sat, ["DBN"], "sat")
satdata.compute_agg(True)

pro = pd.read_csv("Schools_Progress_Report_2012-2013.csv")
prodata = agg_dataset()
prodata.load_seller(pro, ["DBN"], "pro")
prodata.compute_agg(True)


spy = pd.read_csv("spy.csv")
spydata = agg_dataset()
spydata.load_seller(spy, ["Year"], "spy")
spydata.compute_agg(True)

transfer = pd.read_csv("transfer.csv")
transferdata = agg_dataset()
transferdata.load_seller(transfer, ["DBN"], "transfer")
transferdata.compute_agg(True)

yabc = pd.read_csv("yabc.csv")
yabcdata = agg_dataset()
yabcdata.load_seller(yabc, ["DBN"], "yabc")
yabcdata.compute_agg(True)

dm1 = pd.read_csv("other/datamart.socrata.data-cityofnewyork-us.22rr-ujq3")
dm1data = agg_dataset()
dm1data.load_seller(dm1, ["DBN"], "dm1")
dm1data.compute_agg(True)

dm2 = pd.read_csv("other/datamart.socrata.data-cityofnewyork-us.25aa-q86c")
dm2data = agg_dataset()
dm2data.load_seller(dm2, ["DBN"], "dm2")
dm2data.compute_agg(True)

dm3 = pd.read_csv("other/datamart.socrata.data-cityofnewyork-us.29bv-qqsy")
dm3data = agg_dataset()
dm3data.load_seller(dm3, ["DBN"], "dm3")
dm3data.compute_agg(True)

dm4 = pd.read_csv("other/datamart.socrata.data-cityofnewyork-us.29ry-u5bf")
dm4data = agg_dataset()
dm4data.load_seller(dm4, ["DBN"], "dm4")
dm4data.compute_agg(True)

dm5 = pd.read_csv("other/datamart.socrata.data-cityofnewyork-us.43qc-8vv8")
dm5data = agg_dataset()
dm5data.load_seller(dm5, [["DBN","Grade"]], "dm5")
dm5data.compute_agg(True)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  result = getattr(ufunc, method)(*inputs, **kwargs)


In [438]:
datamart.socrata.data-cityofnewyork-us.436j-ja87
gender

SyntaxError: invalid syntax (950661656.py, line 1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [306]:
esldata.data["cov:s:esl:logLevel 3+4 %"]

0        3.939638
1        2.856470
2        3.218876
3        3.793239
4        3.526361
           ...   
28460    1.280934
28461        -inf
28462        -inf
28463        -inf
28464    1.386294
Name: cov:s:esl:logLevel 3+4 %, Length: 28465, dtype: float64

In [62]:
%%time
# join_test = connect(gender_test, esldata, ("DBN","Grade"))
# join_train = connect(gender_train, esldata, ("DBN","Grade"))
join_test = connect(gender_test, crimedata, "DBN")
join_train = connect(gender_train, crimedata, "DBN")

CPU times: user 63.7 ms, sys: 437 µs, total: 64.1 ms
Wall time: 61.4 ms




In [59]:
crime = pd.read_csv("crime.csv")
crimedata = agg_dataset()
crimedata.load_seller(crime, ["DBN"], "crime", False)
crimedata.compute_agg(True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [264]:
parameter = linear_regression(gender_train.covariance, ["gender:Number Tested"], "gender:Mean Scale Score")

r2(gender_train.covariance, ["gender:Number Tested"], "gender:Mean Scale Score", parameter)

[[1.54759312e+04 8.19853509e+01]
 [8.19853509e+01 1.00000000e+00]] [24725.27444603   299.47019938]


0.010018145240012322

In [262]:
from sklearn.metrics import r2_score
reg = LinearRegression().fit(gender_train.data[["cov:s:gender:Number Tested"]], gender_train.data[["cov:s:gender:Mean Scale Score"]])
r2_score(gender_train.data[["cov:s:gender:Mean Scale Score"]],reg.predict(gender_train.data[["cov:s:gender:Number Tested"]]))

0.010018145239966136

In [256]:
np.square(gender_train.data[["cov:s:gender:Mean Scale Score"]] - reg.predict(gender_train.data[["cov:s:gender:Number Tested"]])).sum()

cov:s:gender:Mean Scale Score    1.279190e+07
dtype: float64

In [217]:
gender_train.data[["cov:s:gender:Mean Scale Score"]]

Unnamed: 0,cov:s:gender:Mean Scale Score
0,285.0
1,292.0
4,289.0
8,308.0
9,297.0
...,...
47727,273.0
47729,280.0
47730,274.0
47732,282.0


[[0.00076816]]


array([-34.07552038])

In [202]:
(esl[(esl["DBN"] == "01M015") & (esl["Grade"] == '3')]["sqLevel 3+4 %"] * esl[(esl["DBN"] == "01M015") & (esl["Grade"] == '3')]["sqLevel 3+4 %"]).sum()/6

1947484.4285833333

In [203]:
gender_train.data["cov:s:esl:sqLevel 3+4 %"] * gender_train.data["cov:s:esl:sqLevel 3+4 %"]

0        1.209600e+06
1        1.209600e+06
4        1.209600e+06
8        1.209600e+06
9        1.209600e+06
             ...     
47727    1.064821e+07
47729    1.064821e+07
47730    1.064821e+07
47732    1.064821e+07
47733    1.064821e+07
Name: cov:s:esl:sqLevel 3+4 %, Length: 37790, dtype: float64

In [142]:
gender_train.data[["cov:s:gender:Mean Scale Score"]]

Unnamed: 0,cov:s:gender:Mean Scale Score
0,285.0
1,292.0
4,289.0
8,308.0
9,297.0
...,...
47727,273.0
47729,280.0
47730,274.0
47732,282.0


In [311]:
gender_test.

AttributeError: 'agg_dataset' object has no attribute 'y'

In [440]:
sellers = [(crimedata, "DBN"), (apdata, "DBN"), (surveydata, "DBN"), 
           (basedata, "DBN"), (discdata, "DBN"), (mathdata, ("DBN","Grade")), 
           (ossdata, "DBN"), (pedata, "DBN"), (s2trdata, "DBN"), 
           (satdata, "DBN"), (prodata, "DBN"), (spydata, "Year"),
           (transferdata, "DBN"), (yabcdata, "DBN"), (dm1data, "DBN"),
           (dm2data, "DBN"), (dm3data, "DBN"), (dm4data, "DBN"), (dm5data, ("DBN","Grade"))]

In [499]:
bought

{'base', 'dm4'}

In [574]:
# find m best datasets to augment
bought = set()
m = 1

for i in range(m):
    best_seller = None
    best_seller_attrs = []
    best_dimension = None
    best_r2 = 0

    for sellerdata, dimension in sellers:
        # check if current seller has been bought
        if sellerdata.name in bought:
            continue

        # find the attributes and r2 of augmenting
        cur_atts, final_r2 = select_features(gender_train, gender_test, sellerdata, dimension,10)

        if final_r2 > best_r2:
            best_seller = sellerdata
            best_dimension = dimension
            best_seller_attrs = cur_atts
            best_r2 = final_r2


    print(best_seller.name, best_seller_attrs, best_r2)
    
#     # absorb the best seller
#     gender_train.absorb(best_seller, best_dimension, [x for x in best_seller_attrs if x in best_seller.X])
#     gender_test.absorb(best_seller, best_dimension, [x for x in best_seller_attrs if x in best_seller.X])
#     bought.add(best_seller.name)



0 0.07008556464513782 ['crime:logRegister']
1 0.09866969403492132 ['crime:logRegister', 'crime:sqID']
2 0.1278497351538167 ['crime:logRegister', 'crime:sqID', 'crime:ID']
3 0.1319255409651704 ['crime:logRegister', 'crime:sqID', 'crime:ID', 'crime:cbrID']
4 0.13487992475962263 ['crime:logRegister', 'crime:sqID', 'crime:ID', 'crime:cbrID', 'crime:sqRegister']
5 0.1374054725723043 ['crime:logRegister', 'crime:sqID', 'crime:ID', 'crime:cbrID', 'crime:sqRegister', 'crime:logGeographical District Code']
6 0.1607628817190735 ['crime:logRegister', 'crime:sqID', 'crime:ID', 'crime:cbrID', 'crime:sqRegister', 'crime:logGeographical District Code', 'crime:cbrGeographical District Code']
7 0.1626230556071261 ['crime:logRegister', 'crime:sqID', 'crime:ID', 'crime:cbrID', 'crime:sqRegister', 'crime:logGeographical District Code', 'crime:cbrGeographical District Code', 'crime:cbrRegister']
8 0.16428924045913862 ['crime:logRegister', 'crime:sqID', 'crime:ID', 'crime:cbrID', 'crime:sqRegister', 'crime:



0 0.37951612278217706 ['math:Level 4 %']
1 0.3905579075010066 ['math:Level 4 %', 'math:cbrLevel 1 %']
2 0.39209470082853326 ['math:Level 4 %', 'math:cbrLevel 1 %', 'math:logYear']
3 0.4270507498116056 ['math:Level 4 %', 'math:cbrLevel 1 %', 'math:logYear', 'math:cbrYear']
4 0.7315913422540855 ['math:Level 4 %', 'math:cbrLevel 1 %', 'math:logYear', 'math:cbrYear', 'math:sqLevel 2 #']
0 0.03225966457174956 ['oss:sqADMINISTRATIVE DISTRICT']
1 0.08158783453150942 ['oss:sqADMINISTRATIVE DISTRICT', 'oss:logADMINISTRATIVE DISTRICT']
2 0.11236067657666193 ['oss:sqADMINISTRATIVE DISTRICT', 'oss:logADMINISTRATIVE DISTRICT', 'oss:ADMINISTRATIVE DISTRICT']
3 0.16241665297085695 ['oss:sqADMINISTRATIVE DISTRICT', 'oss:logADMINISTRATIVE DISTRICT', 'oss:ADMINISTRATIVE DISTRICT', 'oss:cbrADMINISTRATIVE DISTRICT']
4 0.16733462344976235 ['oss:sqADMINISTRATIVE DISTRICT', 'oss:logADMINISTRATIVE DISTRICT', 'oss:ADMINISTRATIVE DISTRICT', 'oss:cbrADMINISTRATIVE DISTRICT', 'gender:Number Tested']
0 0.091549953

In [471]:
pro ['pro:sq2012-2013 PERFORMANCE CATEGORY SCORE', 'pro:cbr2012-2013 ADDITIONAL CREDIT', 'pro:DISTRICT', 'pro:logDISTRICT', 'pro:sqDISTRICT', 'pro:log2012-2013 PROGRESS CATEGORY SCORE', 'pro:2012-13 OVERALL PERCENTILE', 'pro:cbrDISTRICT', 'pro:2012-2013 PROGRESS CATEGORY SCORE', 'pro:sq2012-2013 ADDITIONAL CREDIT'] 0.434421723927944
math ['math:Level 4 %', 'pro:sq2012-2013 PERFORMANCE CATEGORY SCORE', 'pro:cbr2012-2013 ADDITIONAL CREDIT', 'math:cbrLevel 1 %', 'pro:DISTRICT', 'pro:logDISTRICT', 'pro:sqDISTRICT', 'pro:cbrDISTRICT', 'math:logYear', 'math:Year'] 0.5164984123513657
dm2 ['math:Level 4 %', 'pro:sq2012-2013 PERFORMANCE CATEGORY SCORE', 'dm2:sqMATH %Level 1', 'pro:cbr2012-2013 ADDITIONAL CREDIT', 'dm2:sq%Hispanic', 'dm2:logTotal Enrollment', 'dm2:logELA %Level 1', 'math:cbrLevel 1 %', 'dm2:sq%Male', 'dm2:logMATH %Level 2'] 0.555674640545412
spy ['math:Level 4 %', 'pro:sq2012-2013 PERFORMANCE CATEGORY SCORE', 'dm2:sqMATH %Level 1', 'spy:logYear Close ', 'pro:cbr2012-2013 ADDITIONAL CREDIT', 'dm2:sq%Hispanic', 'dm2:logTotal Enrollment', 'dm2:logELA %Level 1', 'spy:sqYear Low ', 'spy:Year High '] 0.5934589790270721
s2tr ['math:Level 4 %', 'pro:sq2012-2013 PERFORMANCE CATEGORY SCORE', 's2tr:sqSchool Pupil-Teacher Ratio', 'spy:logYear Close ', 'dm2:sqMATH %Level 1', 'pro:cbr2012-2013 ADDITIONAL CREDIT', 'dm2:logTotal Enrollment', 'dm2:sq%Hispanic', 'dm2:logELA %Level 1', 'spy:sqYear Low '] 0.6131388913970008


['pro:sq2012-2013 PERFORMANCE CATEGORY SCORE',
 'pro:cbr2012-2013 ADDITIONAL CREDIT',
 'dm2:MATH %Level 1',
 'dm2:sq%Hispanic',
 'dm2:logTotal Enrollment',
 'dm2:logMATH %Level 2',
 'dm2:logMATH #Level 1',
 'dm2:cbrMath #Test Takers',
 'dm2:log%Female',
 'dm2:logMATH #Level 2']

In [483]:
[x for x in best_seller_attrs if x in best_seller.X]

['s2tr:sqSchool Pupil-Teacher Ratio']

In [541]:
cur_atts, final_r2 = select_features(gender_train, gender_test, basedata, "DBN", 6)

0 0.0634373465939555 ['base:class']
1 0.0783280144349564 ['base:class', 'gender:Number Tested']
2 0.08911074552180187 ['base:class', 'gender:Number Tested', 'base:cbrTotal Parent Response Rate (%)']
3 0.09230259635298388 ['base:class', 'gender:Number Tested', 'base:cbrTotal Parent Response Rate (%)', 'base:cbrTotal Teacher Response Rate (%)']
4 0.09235922087700488 ['base:class', 'gender:Number Tested', 'base:cbrTotal Parent Response Rate (%)', 'base:cbrTotal Teacher Response Rate (%)', 'base:sqTotal Teacher Response Rate (%)']
5 367572176331.9454 ['base:class', 'gender:Number Tested', 'base:cbrTotal Parent Response Rate (%)', 'base:cbrTotal Teacher Response Rate (%)', 'base:sqTotal Teacher Response Rate (%)', 'base:sqclass']




In [96]:
join_test = connect(gender_test, mathdata, ("DBN","Grade"))
join_train = connect(gender_train, mathdata, ("DBN","Grade"))
join_train_cov = join_train.sum()
join_test_cov = join_test.sum()
parameter = linear_regression(join_train_cov,['math:Level 4 %', 'math:cbrLevel 1 %', 'math:logYear', 'math:cbrYear', 'math:sqLevel 2 #'], gender_test.name + ":" + gender_test.y)
cur_r2 = r2(join_train_cov, ['math:Level 4 %', 'math:cbrLevel 1 %', 'math:logYear', 'math:cbrYear', 'math:sqLevel 2 #'], gender_test.name + ":" + gender_test.y, parameter)
cur_r2



In [101]:
%%time
join_test = connect(gender_test, crimedata, "DBN")
join_train = connect(gender_train, crimedata, "DBN")
# join_train_cov = join_train.sum()
# join_test_cov = join_test.sum()
# parameter = linear_regression(join_train_cov,['math:Level 4 %', 'math:cbrLevel 1 %', 'math:logYear', 'math:cbrYear', 'math:sqLevel 2 #'], gender_test.name + ":" + gender_test.y)
# cur_r2 = r2(join_train_cov, ['math:Level 4 %', 'math:cbrLevel 1 %', 'math:logYear', 'math:cbrYear', 'math:sqLevel 2 #'], gender_test.name + ":" + gender_test.y, parameter)
# cur_r2

CPU times: user 11.1 ms, sys: 5.6 ms, total: 16.7 ms
Wall time: 13.6 ms




In [535]:
parameter * [1.44816449e+04, 1.30143190e+06, 5.38886889e+04, 6.20981809e+04,  9.69013908e+07, 1.44816449e+04, 1.44816449e+04]

array([ 2.41940268e+19,  3.16586369e+04,  2.40288804e+05,  4.10071360e+06,
       -8.07414166e+05, -2.41940268e+19,  8.11960236e+05])

In [550]:
square_error(join_test_cov, ['base:class', 'gender:Number Tested', 'base:cbrTotal Parent Response Rate (%)', 'base:cbrTotal Teacher Response Rate (%)', 'base:sqTotal Teacher Response Rate (%)', 'base:sqclass'], gender_test.name + ":" + gender_test.y, parameter)

837081094.0
9.851340018394425e+33 cov:Q:base:class,base:class
9.851340018394438e+33 cov:Q:base:class,gender:Number Tested
9.851340018394536e+33 cov:Q:base:class,base:cbrTotal Parent Response Rate (%)
9.851340018396206e+33 cov:Q:base:class,base:cbrTotal Teacher Response Rate (%)
9.851340018395878e+33 cov:Q:base:class,base:sqTotal Teacher Response Rate (%)
1.3996467065927122e+21 cov:Q:base:class,base:sqclass
1.4126909025472372e+21 cov:Q:gender:Number Tested,base:class
1.4126909025472372e+21 cov:Q:gender:Number Tested,gender:Number Tested
1.4126909025472374e+21 cov:Q:gender:Number Tested,base:cbrTotal Parent Response Rate (%)
1.412690902547243e+21 cov:Q:gender:Number Tested,base:cbrTotal Teacher Response Rate (%)
1.412690902547242e+21 cov:Q:gender:Number Tested,base:sqTotal Teacher Response Rate (%)
1.3996467065927167e+21 cov:Q:gender:Number Tested,base:sqclass
1.4977990279153857e+21 cov:Q:base:cbrTotal Parent Response Rate (%),base:class
1.497799027915386e+21 cov:Q:base:cbrTotal Parent R

-1.1529215059925187e+18

In [557]:
basedata.covariance

cov:s:base:Total Parent Response Rate (%)         55.863585
cov:s:base:logTotal Parent Response Rate (%)       3.883012
cov:s:base:sqTotal Parent Response Rate (%)     3726.582864
cov:s:base:cbrTotal Parent Response Rate (%)       3.715967
cov:s:base:Total Teacher Response Rate (%)        84.722097
                                                   ...     
cov:Q:base:logclass,base:sqclass                   0.000000
cov:Q:base:logclass,base:cbrclass                  0.000000
cov:Q:base:sqclass,base:sqclass                    0.412063
cov:Q:base:sqclass,base:cbrclass                   0.412063
cov:Q:base:cbrclass,base:cbrclass                  0.412063
Length: 91, dtype: float64

In [84]:
%%time
cur_atts, final_r2 = select_features(gender_train, gender_test, crimedata, "DBN",5)
# cur_atts, final_r2 = select_features(gender_train, gender_test, apdata, "DBN", 5)
# cur_atts, final_r2 = select_features(gender_train, gender_test, surveydata, "DBN", 5)
# cur_atts, final_r2 = select_features(gender_train, gender_test, basedata, "DBN", 5)
# cur_atts, final_r2 = select_features(gender_train, gender_test, discdata, "DBN", 5)
# cur_atts, final_r2 = select_features(gender_train, gender_test, mathdata, ("DBN","Grade"), 5)
# cur_atts, final_r2 = select_features(gender_train, gender_test, ossdata, "DBN", 5)
# cur_atts, final_r2 = select_features(gender_train, gender_test, pedata, "DBN", 5)
# cur_atts, final_r2 = select_features(gender_train, gender_test, s2trdata, "DBN", 5)
# cur_atts, final_r2 = select_features(gender_train, gender_test, satdata, "DBN", 5)
# cur_atts, final_r2 = select_features(gender_train, gender_test, prodata, "DBN", 5)
# cur_atts, final_r2 = select_features(gender_train, gender_test, spydata, "Year", 5)
# cur_atts, final_r2 = select_features(gender_train, gender_test, transferdata, "DBN", 5)
# cur_atts, final_r2 = select_features(gender_train, gender_test, yabcdata, "DBN", 5)
# cur_atts, final_r2 = select_features(gender_train, gender_test, dm1data, "DBN", 5)
# cur_atts, final_r2 = select_features(gender_train, gender_test, dm2data, "DBN", 5)
# cur_atts, final_r2 = select_features(gender_train, gender_test, dm3data, "DBN", 5)
# cur_atts, final_r2 = select_features(gender_train, gender_test, dm4data, "DBN", 5)
# cur_atts, final_r2 = select_features(gender_train, gender_test, dm5data, ("DBN","Grade"), 5)

0 0.06182805848629325 ['crime:Register']
1 0.07321133657410761 ['crime:Register', 'crime:ID']
2 0.07648151293567385 ['crime:Register', 'crime:ID', 'crime:# Schools']
3 0.07692918107396829 ['crime:Register', 'crime:ID', 'crime:# Schools', 'gender:Number Tested']
4 0.07710244197152538 ['crime:Register', 'crime:ID', 'crime:# Schools', 'gender:Number Tested', 'crime:Geographical District Code']
CPU times: user 71.8 ms, sys: 5.78 ms, total: 77.6 ms
Wall time: 73.4 ms




In [369]:
basedata.data

Unnamed: 0,cov:s:base:Total Parent Response Rate (%),cov:s:base:logTotal Parent Response Rate (%),cov:s:base:sqTotal Parent Response Rate (%),cov:s:base:cbrTotal Parent Response Rate (%),cov:s:base:Total Teacher Response Rate (%),cov:s:base:logTotal Teacher Response Rate (%),cov:s:base:sqTotal Teacher Response Rate (%),cov:s:base:cbrTotal Teacher Response Rate (%),cov:s:base:class,cov:s:base:logclass,...,"cov:Q:base:class,base:class","cov:Q:base:class,base:logclass","cov:Q:base:class,base:sqclass","cov:Q:base:class,base:cbrclass","cov:Q:base:logclass,base:logclass","cov:Q:base:logclass,base:sqclass","cov:Q:base:logclass,base:cbrclass","cov:Q:base:sqclass,base:sqclass","cov:Q:base:sqclass,base:cbrclass","cov:Q:base:cbrclass,base:cbrclass"
0,66,4.189655,4356,4.041240,100,4.605170,10000,4.641589,False,-inf,...,False,,0,0.0,inf,,,0,0.0,0.0
1,96,4.564348,9216,4.578857,97,4.574711,9409,4.594701,False,-inf,...,False,,0,0.0,inf,,,0,0.0,0.0
2,71,4.262680,5041,4.140818,79,4.369448,6241,4.290840,True,0.0,...,True,0.0,1,1.0,0.0,0.0,0.0,1,1.0,1.0
3,40,3.688879,1600,3.419952,88,4.477337,7744,4.447960,True,0.0,...,True,0.0,1,1.0,0.0,0.0,0.0,1,1.0,1.0
4,70,4.248495,4900,4.121285,100,4.605170,10000,4.641589,False,-inf,...,False,,0,0.0,inf,,,0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1769,99,4.595120,9801,4.626065,90,4.499810,8100,4.481405,False,-inf,...,False,,0,0.0,inf,,,0,0.0,0.0
1770,92,4.521789,8464,4.514357,73,4.290459,5329,4.179339,False,-inf,...,False,,0,0.0,inf,,,0,0.0,0.0
1771,77,4.343805,5929,4.254321,100,4.605170,10000,4.641589,False,-inf,...,False,,0,0.0,inf,,,0,0.0,0.0
1772,66,4.189655,4356,4.041240,100,4.605170,10000,4.641589,False,-inf,...,False,,0,0.0,inf,,,0,0.0,0.0


In [354]:
gender_train.X

['gender:Mean Scale Score',
 'gender:Number Tested',
 'crime:logRegister',
 'crime:sqID',
 'crime:ID',
 'crime:sqRegister',
 'crime:cbrID']

In [285]:
regr = MLPRegressor(random_state=1, max_iter=500).fit(X_train, y_train)
regr.predict(X_test[:2])

already absorbed this data




In [4]:
gender = pd.read_csv("gender.csv")

In [10]:
gender["Mean Scale Score"] = pd.to_numeric(gender["Mean Scale Score"],errors="coerce")

In [11]:
gender_processed = gender[~np.isnan(gender["Mean Scale Score"])][["DBN","School Name","Grade","Year","Category","Number Tested","Mean Scale Score"]]

In [85]:
crime

Unnamed: 0,ID,Building Code,DBN,Location Name,Location Code,Address,Borough,Geographical District Code,Register,Building Name,...,AvgOfOth N,AvgOfNoCrim N,AvgOfProp N,AvgOfVio N,New Georeferenced Column,Zip Codes,Community Districts,Borough Boundaries,City Council Districts,Police Precincts
0,288,K247,20K247,PS 247(BROOKLYN),K247,7000 21st AVENUE,K,20,883.0,,...,1.52,3.88,0.83,0.91,POINT (-76.974907 38.974981),10889,,,,
1,335,K281,21K281,IS 281(BS),K281,8787 24th AVENUE,K,21,1342.0,,...,2.79,6.71,1.48,1.47,POINT (-104.885878 39.751528),20975,,,,
2,359,K298,23K298,PS 298(BROOKLYN),K298,85 WATKINS STREET,K,23,205.0,,...,0.88,1.69,0.57,0.45,POINT (-73.908193 40.670936),17614,55.0,2.0,37.0,46.0
3,390,K318,14K318,JHS 318(BN),K318,101 WALTON STREET,K,14,1230.0,,...,2.55,6.10,1.29,1.48,POINT (-79.670906 34.623679),27087,,,,
4,400,K327,23K327,PS 327(BROOKLYN),K327,111 BRISTOL STREET,K,23,299.0,,...,1.03,2.22,0.55,0.58,POINT (-72.931594 41.315558),13460,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1914,1766,X174,,456 WHITE PLAINS ROAD CONSOLIDATED LOCATION,,456 WHITE PLAINS ROAD,X,8,1077.0,456 WHITE PLAINS ROAD CONSOLIDATED LOCATION,...,2.55,6.10,1.29,1.48,POINT (-81.729256 35.006166),23236,,,,
1915,1723,X149,07X221,SOUTH BRONX PREPARATORY:A COLLEGE BOARD SCHOOL...,X221,360 EAST 145 STREET,X,7,578.0,360 EAST 145 STREET CONSOLIDATED LOCATION,...,,,,,POINT (-73.232997 42.311995),12142,,,,
1916,1632,X100,08X100,PS 100(XE),X100,800 TAYLOR AVENUE,X,8,563.0,,...,1.13,2.41,0.66,0.57,POINT (-73.863098 40.821837),11611,58.0,5.0,31.0,26.0
1917,1499,X002,,1363 FULTON AVENUE CONSOLIDATED LOCATION,,1363 FULTON AVENUE,X,9,688.0,1363 FULTON AVENUE CONSOLIDATED LOCATION,...,1.13,2.41,0.66,0.57,POINT (-73.902403 40.833741),10934,34.0,5.0,42.0,25.0


In [12]:
join = pd.merge(gender_processed, pd.read_csv("crime.csv"), how='inner', left_on = 'DBN', right_on = 'DBN')
join

Unnamed: 0,DBN,School Name,Grade,Year,Category,Number Tested,Mean Scale Score,ID,Building Code,Location Name,...,AvgOfOth N,AvgOfNoCrim N,AvgOfProp N,AvgOfVio N,New Georeferenced Column,Zip Codes,Community Districts,Borough Boundaries,City Council Districts,Police Precincts
0,01M015,PS 015 ROBERTO CLEMENTE,3,2013,Female,12,285.0,641,M015,PS 15 ROBERTO CLEMENTE (MANHATTAN),...,0.88,1.69,0.57,0.45,POINT (-84.125408 44.656045),12819,,,,
1,01M015,PS 015 ROBERTO CLEMENTE,3,2013,Male,15,292.0,641,M015,PS 15 ROBERTO CLEMENTE (MANHATTAN),...,0.88,1.69,0.57,0.45,POINT (-84.125408 44.656045),12819,,,,
2,01M015,PS 015 ROBERTO CLEMENTE,3,2015,Female,7,289.0,641,M015,PS 15 ROBERTO CLEMENTE (MANHATTAN),...,0.88,1.69,0.57,0.45,POINT (-84.125408 44.656045),12819,,,,
3,01M015,PS 015 ROBERTO CLEMENTE,3,2015,Male,9,276.0,641,M015,PS 15 ROBERTO CLEMENTE (MANHATTAN),...,0.88,1.69,0.57,0.45,POINT (-84.125408 44.656045),12819,,,,
4,01M015,PS 015 ROBERTO CLEMENTE,3,2017,Female,13,308.0,641,M015,PS 15 ROBERTO CLEMENTE (MANHATTAN),...,0.88,1.69,0.57,0.45,POINT (-84.125408 44.656045),12819,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
43049,32K562,EVERGREEN MIDDLE SCHOOL FOR URBAN EXPLORATION,All Grades,2015,Male,161,274.0,356,K296,EVERGREEN MIDDLE SCHOOL FOR URBAN EXPLORATION(BN),...,,,,,POINT (-73.909921 40.688733),17214,42.0,2.0,37.0,53.0
43050,32K562,EVERGREEN MIDDLE SCHOOL FOR URBAN EXPLORATION,All Grades,2016,Female,125,286.0,356,K296,EVERGREEN MIDDLE SCHOOL FOR URBAN EXPLORATION(BN),...,,,,,POINT (-73.909921 40.688733),17214,42.0,2.0,37.0,53.0
43051,32K562,EVERGREEN MIDDLE SCHOOL FOR URBAN EXPLORATION,All Grades,2016,Male,141,282.0,356,K296,EVERGREEN MIDDLE SCHOOL FOR URBAN EXPLORATION(BN),...,,,,,POINT (-73.909921 40.688733),17214,42.0,2.0,37.0,53.0
43052,32K562,EVERGREEN MIDDLE SCHOOL FOR URBAN EXPLORATION,All Grades,2017,Female,138,289.0,356,K296,EVERGREEN MIDDLE SCHOOL FOR URBAN EXPLORATION(BN),...,,,,,POINT (-73.909921 40.688733),17214,42.0,2.0,37.0,53.0


In [91]:
%%time
kept = []
for att in join.columns:
    if att == 'Mean Scale Score':
        continue
#     join[att], k = try_to_numeric(join[att],0.6) 
#     mean_value = num_col.mean()
    join[att].fillna(value=0, inplace=True)
    if k:
        kept.append(att)

CPU times: user 47.6 ms, sys: 7.3 ms, total: 54.9 ms
Wall time: 50.7 ms


In [14]:
kept

['Grade',
 'Year',
 'Number Tested',
 'ID',
 'Geographical District Code',
 'Register',
 '# Schools',
 'Major N',
 'Oth N',
 'NoCrim N',
 'Prop N',
 'Vio N',
 'AvgOfMajor N',
 'AvgOfOth N',
 'AvgOfNoCrim N',
 'AvgOfProp N',
 'AvgOfVio N',
 'Zip Codes',
 'Community Districts',
 'Borough Boundaries',
 'City Council Districts',
 'Police Precincts']

In [29]:
%%time
# regr = MLPRegressor().fit(join[kept], join["Mean Scale Score"])
r2_score(join["Mean Scale Score"],regr.predict(join[kept]))

CPU times: user 59.8 ms, sys: 72.1 ms, total: 132 ms
Wall time: 43.9 ms


-0.5801590393520977

In [77]:


def try_to_numeric(col, nan_rate):
    num_col = pd.to_numeric(col,errors="coerce")
    nan_count = sum(np.isnan(num_col))
    if nan_count/len(col) < nan_rate:
#         mean_value = num_col.mean()
        num_col.fillna(value=mean_value, inplace=True)
        return num_col, True
    else:
        return col.astype("category"), False

In [94]:
%%time
c = pd.read_csv("crime.csv")

CPU times: user 22.3 ms, sys: 1.11 ms, total: 23.4 ms
Wall time: 21.4 ms


In [95]:
%%time
pd.merge(gender_processed, c, how='left', left_on = 'DBN', right_on = 'DBN')

CPU times: user 47.8 ms, sys: 0 ns, total: 47.8 ms
Wall time: 45.3 ms


Unnamed: 0,DBN,School Name,Grade,Year,Category,Number Tested,Mean Scale Score,ID,Building Code,Location Name,...,AvgOfOth N,AvgOfNoCrim N,AvgOfProp N,AvgOfVio N,New Georeferenced Column,Zip Codes,Community Districts,Borough Boundaries,City Council Districts,Police Precincts
0,01M015,PS 015 ROBERTO CLEMENTE,3,2013,Female,12,285.0,641.0,M015,PS 15 ROBERTO CLEMENTE (MANHATTAN),...,0.88,1.69,0.57,0.45,POINT (-84.125408 44.656045),12819.0,,,,
1,01M015,PS 015 ROBERTO CLEMENTE,3,2013,Male,15,292.0,641.0,M015,PS 15 ROBERTO CLEMENTE (MANHATTAN),...,0.88,1.69,0.57,0.45,POINT (-84.125408 44.656045),12819.0,,,,
2,01M015,PS 015 ROBERTO CLEMENTE,3,2015,Female,7,289.0,641.0,M015,PS 15 ROBERTO CLEMENTE (MANHATTAN),...,0.88,1.69,0.57,0.45,POINT (-84.125408 44.656045),12819.0,,,,
3,01M015,PS 015 ROBERTO CLEMENTE,3,2015,Male,9,276.0,641.0,M015,PS 15 ROBERTO CLEMENTE (MANHATTAN),...,0.88,1.69,0.57,0.45,POINT (-84.125408 44.656045),12819.0,,,,
4,01M015,PS 015 ROBERTO CLEMENTE,3,2017,Female,13,308.0,641.0,M015,PS 15 ROBERTO CLEMENTE (MANHATTAN),...,0.88,1.69,0.57,0.45,POINT (-84.125408 44.656045),12819.0,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
47265,32K562,EVERGREEN MIDDLE SCHOOL FOR URBAN EXPLORATION,All Grades,2015,Male,161,274.0,356.0,K296,EVERGREEN MIDDLE SCHOOL FOR URBAN EXPLORATION(BN),...,,,,,POINT (-73.909921 40.688733),17214.0,42.0,2.0,37.0,53.0
47266,32K562,EVERGREEN MIDDLE SCHOOL FOR URBAN EXPLORATION,All Grades,2016,Female,125,286.0,356.0,K296,EVERGREEN MIDDLE SCHOOL FOR URBAN EXPLORATION(BN),...,,,,,POINT (-73.909921 40.688733),17214.0,42.0,2.0,37.0,53.0
47267,32K562,EVERGREEN MIDDLE SCHOOL FOR URBAN EXPLORATION,All Grades,2016,Male,141,282.0,356.0,K296,EVERGREEN MIDDLE SCHOOL FOR URBAN EXPLORATION(BN),...,,,,,POINT (-73.909921 40.688733),17214.0,42.0,2.0,37.0,53.0
47268,32K562,EVERGREEN MIDDLE SCHOOL FOR URBAN EXPLORATION,All Grades,2017,Female,138,289.0,356.0,K296,EVERGREEN MIDDLE SCHOOL FOR URBAN EXPLORATION(BN),...,,,,,POINT (-73.909921 40.688733),17214.0,42.0,2.0,37.0,53.0


In [79]:
%%time
join = pd.merge(gender_processed, pd.read_csv("crime.csv"), how='left', left_on = 'DBN', right_on = 'DBN')

kept = []
for att in join.columns:
    if att == 'Mean Scale Score':
        continue
    join[att], k = try_to_numeric(join[att],0.6) 
    if k:
        kept.append(att)

CPU times: user 63.9 ms, sys: 0 ns, total: 63.9 ms
Wall time: 61.6 ms


In [35]:
clf = RandomForestClassifier(max_depth=5, random_state=0).fit(join[kept], join["Mean Scale Score"])
r2_score(join["Mean Scale Score"],clf.predict(join[kept]))

CPU times: user 5.26 s, sys: 985 ms, total: 6.24 s
Wall time: 6.24 s


0.31803782020694804

In [72]:
%%time
reg = LinearRegression().fit(join[kept], join["Mean Scale Score"])
r2_score(join["Mean Scale Score"],reg.predict(join[kept]))

CPU times: user 77.8 ms, sys: 34.8 ms, total: 113 ms
Wall time: 45.4 ms


0.19310387826597264

In [16]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPRegressor

In [None]:
Types of aggregation (sum, join many times?).
Types of transformations (also apply to y?).
How to impute?