In [1]:
import pandas as pd
import numpy as np
import random
import statistics
import math
from dateutil import parser
from datetime import datetime
from functools import reduce
import random
from random import choices
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score

In [2]:
def normalize(cov):
    cols = []
    
    if isinstance(cov, pd.DataFrame):
        cols = cov.columns
    else:
        cols = list(cov.axes[0])
        
    for col in cols:
        if col != 'cov:c':
            cov[col] = cov[col]/cov['cov:c']
    cov['cov:c'] = 1
    return cov

class agg_dataset:
    def set_meta(self, data, X, dimensions, name):
        self.data = data
        self.dimensions = dimensions
        self.X = X
        self.name = name
        
    def load_buyer(self, data, X, y, dimensions, name):
        self.data = data
        self.dimensions = dimensions
        self.X = X
        self.y = y
        self.name = name
        
        # don't impute y
        self.to_numeric(self.y, False)
        for x in self.X:
            self.to_numeric(x, False)
        
        self.X =  [self.y] + self.X
        
        dedup_dimensions = set()
        for d in dimensions:
            if isinstance(d, list):
                dedup_dimensions.update(d)
            else:
                dedup_dimensions.add(d)
        dedup_dimensions = list(dedup_dimensions)
        
        # project out attributes except x, y, dim
        self.data = self.data[self.X  + dedup_dimensions]
        
        
    
    def compute_agg(self, norm = False):
        self.lift(self.name, self.X)
        
        self.agg_dimensions = dict()
        
        for d in self.dimensions:
            if isinstance(d, list):
                self.agg_dimensions[tuple(d)] = self.data[list(filter(lambda col: col.startswith("cov:"), self.data.columns)) + d].groupby(d).sum()
            else:
                self.agg_dimensions[d] = self.data[list(filter(lambda col: col.startswith("cov:"), self.data.columns)) + [d]].groupby(d).sum()
            
        if norm:
            for d in self.agg_dimensions.keys():
                self.agg_dimensions[d] = normalize(self.agg_dimensions[d])
            
        self.covariance = normalize(self.data[list(filter(lambda col: col.startswith("cov:"), self.data.columns))].sum())
    
    def load_seller(self, data, dimensions, name):
        self.data = data
        self.dimensions = dimensions
        self.name = name
        
                
        dedup_dimensions = set()
        for d in dimensions:
            if isinstance(d, list):
                dedup_dimensions.update(d)
            else:
                dedup_dimensions.add(d)
        dedup_dimensions = list(dedup_dimensions)
        
        
        # find numeric attributes as features
        atts = []
        for att in self.data.columns:
            if att in dedup_dimensions:
                continue
            cond, col = self.is_numeric(att, 0.4, 2)
            if cond:
                self.data[att] = col
                self.data["log" + att] = np.log(self.data[att])
                self.data["sq" + att] = np.square(self.data[att])
                self.data["cbr" + att] = np.cbrt(self.data[att])
                atts.append(att)
                atts.append("log" + att)
                atts.append("sq" + att)
                atts.append("cbr" + att)

        self.X = atts

        
        # project out attributes except x, y, dim
        self.data = self.data[self.X + dedup_dimensions]

    def is_numeric(self, att, impute_rate, cardinality):
        col = pd.to_numeric(self.data[att],errors="coerce")
        nan_count = sum(np.isnan(col))
        unique_count = len(col.unique())
        if nan_count/len(self.data) < impute_rate and unique_count/len(self.data) < cardinality:
            mean_value = col.mean()
            col.fillna(value=mean_value, inplace=True)
            return True, col
        else:
            return False, col
    
    def to_numeric(self, att, impute=True, impute_rate = 1):
        # parse attribute to numeric
        self.data[att] = pd.to_numeric(self.data[att],errors="coerce")
        # count the number of nan
        nan_count = sum(np.isnan(self.data[att]))
        
        if impute:
            # impute error only if missing rate is not above threshold
            if nan_count/len(self.data) < impute_rate:
                mean_value=self.data[att].mean()
                self.data[att].fillna(value=mean_value, inplace=True)
                return True
            else:
                return False
        else:
            # else, remove records with missing value
            self.data = self.data[~np.isnan(self.data[att])]
    
    def lift(self, tablename, attributes):
        self.data['cov:c'] = 1

        for i in range(len(attributes)):
            for j in range(i, len(attributes)):
                self.data['cov:Q:' + tablename + ":" + attributes[i] + ","+ tablename + ":" + attributes[j]] = self.data[attributes[i]] * self.data[attributes[j]]

        for attribute in attributes:
            self.data= self.data.rename(columns = {attribute:'cov:s:' + tablename + ":" + attribute})
            
# return the coefficients of features and a constant 
def linear_regression(cov_matrix, features, result):
    a = np.empty([len(features) + 1, len(features) + 1])
    b = np.empty(len(features) + 1)
    
    for i in range(len(features)):
        for j in range(len(features)):
            if 'cov:Q:' + features[i] + ","+ features[j] in cov_matrix:
                a[i][j] = cov_matrix['cov:Q:' + features[i] + ","+ features[j]]
            else:
                a[i][j] = cov_matrix['cov:Q:' + features[j] + ","+ features[i]]
    
    for i in range(len(features)):
        a[i][len(features)] = cov_matrix['cov:s:' + features[i]]
        a[len(features)][i] = cov_matrix['cov:s:' + features[i]]
        if 'cov:Q:' + result + "," + features[i] in cov_matrix:
            b[i] = cov_matrix['cov:Q:' + result + "," + features[i]]
        else:
            b[i] = cov_matrix['cov:Q:' + features[i] + "," + result]
    
    b[len(features)] = cov_matrix['cov:s:' + result]
    
    a[len(features)][len(features)] = cov_matrix['cov:c']
    print(a,b)
    return np.linalg.solve(a, b)

def square_error(cov_matrix, features, result, parameter):
    se = cov_matrix['cov:Q:'  + result + "," + result]
    
    for i in range(len(features)):
        for j in range(len(features)):
            if 'cov:Q:'  + features[i] + "," + features[j] in cov_matrix:
                se += parameter[min(i,j)] * parameter[max(i,j)] * cov_matrix['cov:Q:'  + features[i] + "," + features[j]]
            else:    
                se += parameter[min(i,j)] * parameter[max(i,j)] * cov_matrix['cov:Q:'  + features[j] + "," + features[i]]
    
    for i in range(len(features)):
        se += 2 * parameter[i] * parameter[-1] * cov_matrix['cov:s:'  + features[i]]
        if 'cov:Q:' + result + "," + features[i] in cov_matrix:
            se -= 2 * parameter[i] *  cov_matrix['cov:Q:' + result + "," + features[i]]
        else:
            se -= 2 * parameter[i] *  cov_matrix['cov:Q:' + features[i] + "," + result]

    se -= 2 * parameter[-1] * cov_matrix['cov:s:'  + result]
    se += cov_matrix['cov:c'] * parameter[-1] * parameter[-1]

    return se

def total_sum_of_square(cov_matrix, result):
    return cov_matrix['cov:Q:'  + result + "," + result] - cov_matrix['cov:s:'  + result] * cov_matrix['cov:s:'  + result] / cov_matrix['cov:c']

def mean_squared_error(cov_matrix, features, result, parameter):
    return square_error(cov_matrix, features, result, parameter)/cov_matrix['cov:c']


def r2(cov_matrix, features, result, parameter):
    return 1 - square_error(cov_matrix, features, result, parameter)/total_sum_of_square(cov_matrix, result)

def adjusted_r2(cov_matrix, features, result, parameter):
    return 1 - (cov_matrix['cov:c']-1)*(1 - r2(cov_matrix, features, result, parameter))/(cov_matrix['cov:c'] - len(parameter) - 1)

def connect(aggdata1, aggdata2, dimension, left_inp = False):
    if isinstance(dimension, list):
        dimension = tuple(dimension)
    
    if left_inp:
        agg1 = aggdata1.data
    else:
        agg1 = aggdata1.agg_dimensions[dimension]
    agg2 = aggdata2.agg_dimensions[dimension]
    
    if left_inp:
        join = pd.merge(agg1, agg2, how='left', left_on=dimension, right_index=True)
    else:
        join = pd.merge(agg1, agg2, how='left', left_index=True, right_index=True)
#         join = pd.merge(agg1, agg2, how='inner', left_index=True, right_index=True)
    join = join.drop('cov:c_y', 1)
    join = join.rename(columns = {'cov:c_x':'cov:c'})
    
    left_attributes = aggdata1.X
    left_tablename = aggdata1.name
    right_attributes = aggdata2.X
    right_tablename = aggdata2.name
    
    right_cov = aggdata2.covariance
    
    # fill in nan
    for att2 in right_attributes:
        join['cov:s:' + right_tablename + ":" + att2].fillna(value=right_cov['cov:s:' + right_tablename + ":" + att2], inplace=True)
        join['cov:s:' + right_tablename + ":" + att2] *= join['cov:c']
    
    for i in range(len(right_attributes)):
        for j in range(i, len(right_attributes)):
            join['cov:Q:' + right_tablename + ":" + right_attributes[i] + ","+ right_tablename + ":" + right_attributes[j]].fillna(value=right_cov['cov:Q:' + right_tablename + ":" + right_attributes[i] + ","+ right_tablename + ":" + right_attributes[j]], inplace=True)
            join['cov:Q:' + right_tablename + ":" + right_attributes[i] + ","+ right_tablename + ":" + right_attributes[j]] *= join['cov:c']
    
    
    for att1 in left_attributes:
        for att2 in right_attributes:
            join['cov:Q:' + left_tablename + ":" + att1 + ","+ right_tablename + ":" + att2] = join['cov:s:' + left_tablename + ":" + att1] * join['cov:s:' + right_tablename + ":" + att2]/join['cov:c']
    
    return join



In [4]:
gender = pd.read_csv("gender.csv")
gender_train = agg_dataset()
gender_test = agg_dataset()
gender_train.load_buyer(gender, ["Number Tested"], "Mean Scale Score", ["DBN", ["DBN","Grade"], "School Name", "Year", "Category"], "gender")
msk = np.random.rand(len(gender_train.data)) < 0.8
gender_test.set_meta(gender_train.data[~msk], ["Mean Scale Score", "Number Tested"], ["DBN", ["DBN","Grade"], "School Name", "Year", "Category"], "gender")
gender_train.data = gender_train.data[msk]
gender_test.compute_agg()
gender_train.compute_agg()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
