In [1]:
import csv
import numpy as np

def import_data(filename):
    """Read a comma-separated file with a header row into NumPy arrays.

    The first row is treated as the header; every following row is data.
    Fields are split on ',' with quoting disabled (csv.QUOTE_NONE), so quote
    characters are kept as ordinary text.

    Parameters
    ----------
    filename : str or path-like
        Path of the CSV file to read.

    Returns
    -------
    labels : numpy.ndarray of str
        All header names, including the name of the last (target) column.
    X : numpy.ndarray of str
        Every data column except the last one, still as strings.
    y : numpy.ndarray of float
        The last data column converted to float.

    Raises
    ------
    ValueError
        If a value in the last column cannot be parsed as a float.
    """
    # The context manager closes the file even if parsing fails; the previous
    # version leaked the handle. newline='' lets the csv module do its own
    # line-ending handling, as its documentation recommends.
    with open(filename, 'rt', newline='') as raw_data:
        reader = csv.reader(raw_data, delimiter=',', quoting=csv.QUOTE_NONE)
        rows = list(reader)

    labels = np.array(rows[0])
    data = np.array(rows[1:])

    X = data[:, :-1]
    y = data[:, -1].astype('float')
    return labels, X, y

def square_error(a,b):
    """Return the sum of squared differences divided by the first-axis length.

    For one-dimensional inputs of equal length this is the mean squared
    error between `a` and `b`.
    """
    diff = a - b
    squared = diff * diff
    # Scale each term first and sum afterwards, keeping the original
    # order of floating-point operations.
    scaled = squared / squared.shape[0]
    return np.sum(scaled)

def R2_score(a,b):
    """Coefficient of determination of predictions `a` against targets `b`.

    Computed as 1 - SS_res / SS_tot, where SS_res is the sum of squared
    differences between `a` and `b`, and SS_tot is the sum of squared
    deviations of `b` from its own mean. `b` must expose `.shape`.
    """
    target_mean = np.sum(b) / b.shape[0]
    residual_ss = np.sum((a - b) ** 2)
    total_ss = np.sum((b - target_mean) ** 2)
    return 1 - (residual_ss / total_ss)

# Load the dataset, then drop the first two feature columns together with
# their header labels (presumably identifier columns - confirm against the CSV).
labels, X, y = import_data("BlackFriday.csv")
labels, X = labels[2:], X[:, 2:]

In [2]:
#Processing the data

# Tip: to inspect the distinct raw values of every column before encoding, run
#   [sorted(set(X[:, i])) for i in range(X.shape[1])]

def _encode_features(raw):
    """Turn the string-valued feature table into a float matrix.

    Columns are addressed by position:

    * 0 - gender: 'M' becomes 0, anything else becomes 1.
    * 1 - age bracket: replaced by a representative age; any bracket that
      is not listed below becomes 78.
    * 2, 5 - integer codes (occupation and marital status, going by the
      original notes for this cell).
    * 3 - city category: 'A', 'B', 'C' become 0, 1, 2. Other values are
      left as they are and must already be numeric text.
    * 4 - years in current city: '4+' becomes 4, otherwise parsed as int.
    * 6-8 - product categories: an empty field becomes 0, otherwise int.
    * 9 and beyond (if present) - parsed as float unchanged.

    The result is written into a fresh float array instead of back into the
    string array, so numeric codes can never be truncated to the string
    dtype's width, and the input is left unmodified.

    Raises ValueError when a field that must be numeric cannot be parsed.
    """
    raw = np.asarray(raw)
    encoded = np.empty(raw.shape, dtype=float)

    # gender
    encoded[:, 0] = np.where(raw[:, 0] == 'M', 0.0, 1.0)

    # age bracket -> representative age (78 for every unlisted bracket)
    age_by_bracket = {
        '0-17': 8.5,
        '18-25': 21.5,
        '26-35': 30.5,
        '36-45': 40.5,
        '46-50': 48.0,
        '51-55': 53.0,
    }
    age = np.full(raw.shape[0], 78.0)
    for bracket, representative_age in age_by_bracket.items():
        age[raw[:, 1] == bracket] = representative_age
    encoded[:, 1] = age

    # integer-coded columns; astype(int) rejects non-integer text like int() did
    encoded[:, 2] = raw[:, 2].astype(int)
    encoded[:, 5] = raw[:, 5].astype(int)

    # city category: swap the known letters for their codes, then parse
    city = raw[:, 3].copy()
    for code, letter in enumerate('ABC'):
        city[raw[:, 3] == letter] = str(code)
    encoded[:, 3] = city.astype(float)

    # years in current city: the open-ended '4+' bucket is stored as 4
    encoded[:, 4] = np.where(raw[:, 4] == '4+', '4', raw[:, 4]).astype(int)

    # product categories: a missing (empty) field counts as category 0
    categories = raw[:, 6:9]
    encoded[:, 6:9] = np.where(categories == '', '0', categories).astype(int)

    # any further columns are expected to hold numeric text already
    encoded[:, 9:] = raw[:, 9:].astype(float)
    return encoded

# Re-running this cell must not re-encode data that is already numeric:
# comparing floats with 'M' or an age bracket would silently turn every
# gender into 1 and every age into 78.
if X.dtype.kind != 'f':
    X = _encode_features(X)

In [3]:
# test-train split
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing. The split is seeded so that a
# "Restart & Run All" reproduces the same partition; without a seed every
# score printed by the cells below changes from run to run.
tr_X, tst_X, tr_y, tst_y = train_test_split(X, y, test_size=0.2, random_state=42)

#num_data = X.shape[0]
#perm = np.random.permutation(num_data)

#tr_X, tr_y = X[perm[:int(0.75*num_data)],:], y[perm[:int(0.75*num_data)]]
#tst_X, tst_y = X[perm[int(0.75*num_data):],:], y[perm[int(0.75*num_data):]]

#scale = np.max(tr_y)
#tr_y = tr_y/scale
#print(tr_X.shape, tr_y.shape, tst_X.shape, tst_y.shape)

In [18]:
#lasso regression (l_1 regularizer)
from sklearn.linear_model import LassoCV

# Lasso with cv=5 cross-validation; fit() hands back the fitted estimator,
# so construction and training can be chained.
lasso_reg = LassoCV(cv=5).fit(tr_X, tr_y)
y_pred = lasso_reg.predict(tst_X)

# Optional: uncomment to also print the square error on the test split.
#print(square_error(y_pred, tst_y))

# R2 score on the test split.
print(lasso_reg.score(tst_X, tst_y))

21399302.005880576
0.13816200741003515


In [19]:
#ridge regression (l_2 regularizer)
from sklearn.linear_model import RidgeCV

# 30 candidate regularization strengths: 1000, 1000/1.1, 1000/1.1**2, ...
ridge_reg = RidgeCV(alphas=[1000/1.1**k for k in range(30)]).fit(tr_X, tr_y)
y_pred = ridge_reg.predict(tst_X)

# Optional: uncomment to also print the square error on the test split.
#print(square_error(y_pred, tst_y))

# R2 score on the test split.
print(ridge_reg.score(tst_X, tst_y))

21397990.545834295
0.13821482530537266


In [20]:
#random forests regression
from sklearn.ensemble import RandomForestRegressor

# Forest of 100 trees, trained on the training split in one chained call.
rnd_forest = RandomForestRegressor(n_estimators=100).fit(tr_X, tr_y)
y_pred = rnd_forest.predict(tst_X)

# Optional: uncomment to also print the square error on the test split.
#print(square_error(y_pred, tst_y))

# R2 score on the test split.
print(rnd_forest.score(tst_X, tst_y))

9326014.609742649
0.6244029965128506


In [21]:
#k-means and averaging to do the predictions.
from sklearn.cluster import KMeans

num_clusters = 128

kmeans = KMeans(n_clusters=num_clusters)
kmeans = kmeans.fit(tr_X)

# cluster index of every training row
cluster_labels = kmeans.predict(tr_X)

# Per-cluster statistics of the training targets.
# np.bincount aggregates by label directly, so no dense
# (n_samples x num_clusters) scratch matrix is needed, and targets that are
# exactly 0 are counted like any other value instead of being mistaken for
# "row not in this cluster".
cluster_size = np.bincount(cluster_labels, minlength=num_clusters)
cluster_sum = np.bincount(cluster_labels, weights=tr_y, minlength=num_clusters)
occupied = cluster_size > 0

# A cluster without training rows has no mean of its own; fall back to the
# overall training mean so test rows assigned to it still get a finite value.
cluster_avg = np.full(num_clusters, np.mean(tr_y))
cluster_avg[occupied] = cluster_sum[occupied] / cluster_size[occupied]

# Within-cluster variance of the targets (NaN for empty clusters).
squared_dev = (tr_y - cluster_avg[cluster_labels]) ** 2
cluster_var = np.full(num_clusters, np.nan)
cluster_var[occupied] = (
    np.bincount(cluster_labels, weights=squared_dev, minlength=num_clusters)[occupied]
    / cluster_size[occupied]
)

#print(cluster_avg)
#print(np.nanmax(cluster_var))

# Predict each test row as the mean target of the cluster it falls into.
tst_cluster = kmeans.predict(tst_X)
y_pred = cluster_avg[tst_cluster]

# Uncomment to print the square error
#err = square_error(y_pred,tst_y)
#print(err)

# R2 score on the test split
r2score = R2_score(y_pred,tst_y)
print(r2score)

20377827.669578332
0.1793009843373654


In [22]:
#mini-batch Kmeans with averaging
from sklearn.cluster import MiniBatchKMeans

num_clusters = 128

m_kmeans = MiniBatchKMeans(n_clusters=num_clusters)
m_kmeans = m_kmeans.fit(tr_X)

# cluster index of every training row
cluster_labels = m_kmeans.predict(tr_X)

# Per-cluster statistics of the training targets.
# np.bincount aggregates by label directly, so no dense
# (n_samples x num_clusters) scratch matrix is needed, and targets that are
# exactly 0 are counted like any other value instead of being mistaken for
# "row not in this cluster".
cluster_size = np.bincount(cluster_labels, minlength=num_clusters)
cluster_sum = np.bincount(cluster_labels, weights=tr_y, minlength=num_clusters)
occupied = cluster_size > 0

# A cluster without training rows has no mean of its own; fall back to the
# overall training mean so test rows assigned to it still get a finite value.
cluster_avg = np.full(num_clusters, np.mean(tr_y))
cluster_avg[occupied] = cluster_sum[occupied] / cluster_size[occupied]

# Within-cluster variance of the targets (NaN for empty clusters).
squared_dev = (tr_y - cluster_avg[cluster_labels]) ** 2
cluster_var = np.full(num_clusters, np.nan)
cluster_var[occupied] = (
    np.bincount(cluster_labels, weights=squared_dev, minlength=num_clusters)[occupied]
    / cluster_size[occupied]
)

#print(cluster_avg)
#print(np.nanmax(cluster_var))

# Predict each test row as the mean target of the cluster it falls into.
tst_cluster = m_kmeans.predict(tst_X)
y_pred = cluster_avg[tst_cluster]

# Uncomment to print the square error
#err = square_error(y_pred,tst_y)
#print(err)

# R2 score on the test split
r2score = R2_score(y_pred,tst_y)
print(r2score)

20192987.21506892
0.18674527042763245


In [23]:
#k-nearest neighbour regression
from sklearn.neighbors import KNeighborsRegressor

# 5-neighbour regressor, constructed and fitted in one chained call.
knear_reg = KNeighborsRegressor(n_neighbors=5).fit(tr_X, tr_y)
y_pred = knear_reg.predict(tst_X)

# Optional: uncomment to also print the square error on the test split.
#print(square_error(y_pred, tst_y))

# R2 score on the test split.
print(knear_reg.score(tst_X, tst_y))

10835893.333951116
0.5635939641262555


In [5]:
#k-means to cluster the prediction data
from sklearn.cluster import KMeans

num_clusters = 256

# Cluster the training targets themselves (as a single-column matrix), which
# turns the regression target into num_clusters discrete classes.
tr_y_column = np.reshape(tr_y,(-1,1))
kmeans = KMeans(n_clusters=num_clusters)
kmeans = kmeans.fit(tr_y_column)

cluster_labels = kmeans.predict(tr_y_column)

# Per-cluster statistics of the training targets.
# np.bincount aggregates by label directly, so no dense
# (n_samples x num_clusters) scratch matrix is needed, and targets that are
# exactly 0 are counted like any other value instead of being mistaken for
# "row not in this cluster".
cluster_size = np.bincount(cluster_labels, minlength=num_clusters)
cluster_sum = np.bincount(cluster_labels, weights=tr_y, minlength=num_clusters)
occupied = cluster_size > 0

# A cluster without training rows has no mean of its own; fall back to the
# overall training mean so it still maps to a finite value.
cluster_avg = np.full(num_clusters, np.mean(tr_y))
cluster_avg[occupied] = cluster_sum[occupied] / cluster_size[occupied]

# Within-cluster variance of the targets (NaN for empty clusters).
squared_dev = (tr_y - cluster_avg[cluster_labels]) ** 2
cluster_var = np.full(num_clusters, np.nan)
cluster_var[occupied] = (
    np.bincount(cluster_labels, weights=squared_dev, minlength=num_clusters)[occupied]
    / cluster_size[occupied]
)

#print(cluster_avg)
#print(np.nanmax(cluster_var))

#random forest classifier: predict the target cluster from the features
from sklearn.ensemble import RandomForestClassifier

rf_cls = RandomForestClassifier(criterion = 'entropy')
rf_cls = rf_cls.fit(tr_X,cluster_labels)
y_pred_cls = rf_cls.predict(tst_X)
# predict() returns cluster ids, so they index cluster_avg directly
y_pred = cluster_avg[y_pred_cls]

# Uncomment to print the square error
#err = square_error(y_pred,tst_y)
#print(err)

# R2 score on the test split
r2score = R2_score(y_pred,tst_y)
print(r2score)

7942.7875




15189160.262432195
0.3891933175977019


In [9]:
from sklearn.linear_model import LogisticRegression

# NOTE: this cell reuses `cluster_labels` and `cluster_avg` produced by the
# cell that clusters the training targets; run that cell first.
log_reg = LogisticRegression(solver='sag', max_iter = 75, multi_class='multinomial')
log_reg = log_reg.fit(tr_X,cluster_labels)
pred = log_reg.predict_proba(tst_X)

# The columns of predict_proba follow log_reg.classes_, i.e. only the cluster
# ids that actually occur in the training labels. The argmax column therefore
# has to be translated back to a cluster id; using it directly is wrong as
# soon as some cluster id is missing from the training labels.
y_pred_cls = log_reg.classes_[np.argmax(pred, axis=1)]
y_pred = np.asarray(cluster_avg)[y_pred_cls]

# Uncomment to print the square error
#err = square_error(y_pred,tst_y)
#print(err)

# R2 score on the test split
r2score = R2_score(y_pred,tst_y)
print(r2score)



23198954.55685636
0.06709283309608782
