In [93]:
import numpy
from urllib.request import urlopen
import scipy.optimize
import random
import ast
from sklearn.decomposition import PCA
from collections import defaultdict
import networkx as nx
import matplotlib.pyplot as plt
from math import exp
from math import log

### Question 1

In [5]:
def parseData(fname):
  for l in open(fname):
    yield ast.literal_eval(l)

In [6]:
print("Reading data...")
data = list(parseData("beer_50000.json"))
print("done")

Reading data...
done


In [8]:
def feature(datum):
  feat = [1, datum['review/taste'], datum['review/appearance'], datum['review/aroma'], datum['review/palate'], datum['review/overall']]
  return feat

X = [feature(d) for d in data]
y = [d['beer/ABV'] >= 6.5 for d in data]

In [9]:
def inner(x,y):
  return sum([x[i]*y[i] for i in range(len(x))])

def sigmoid(x):
  return 1.0 / (1 + exp(-x))


In [52]:
##################################################
# Logistic regression by gradient ascent         #
##################################################

# NEGATIVE Log-likelihood
def f(theta, X, y, lam):
  loglikelihood = 0
  for i in range(len(X)):
    logit = inner(X[i], theta)
    loglikelihood -= log(1 + exp(-logit))
    if not y[i]:
      loglikelihood -= logit
  for k in range(len(theta)):
    loglikelihood -= lam * theta[k]*theta[k]
  # for debugging
  # print("ll =" + str(loglikelihood))
  return -loglikelihood

# NEGATIVE Derivative of log-likelihood
def fprime(theta, X, y, lam):
  dl = [0]*len(theta)
  for i in range(len(X)):
    logit = inner(X[i], theta)
    for k in range(len(theta)):
      dl[k] += X[i][k] * (1 - sigmoid(logit))
      if not y[i]:
        dl[k] -= X[i][k]
  for k in range(len(theta)):
    dl[k] -= lam*2*theta[k]
  return numpy.array([-x for x in dl])

ind_list = [i for i in range(len(X))]
random.shuffle(ind_list)
random_X = []
random_y = []

for i in ind_list:
    random_X.append(X[i])
    random_y.append(y[i])

X_train = random_X[:len(random_X)//3]
y_train = random_y[:len(random_y)//3]

X_test = random_X[len(random_X)//3 + 1:2*len(random_X)//3]
y_test = random_y[len(random_y)//3 + 1:2*len(random_y)//3]

X_val = random_X[2*len(random_X)//3:]
y_val = random_y[2*len(random_y)//3:]

In [56]:
##################################################
# Train                                          #
##################################################

def train(lam):
  theta,_,_ = scipy.optimize.fmin_l_bfgs_b(f, [0]*len(X[0]), fprime, pgtol = 10, args = (X_train, y_train, lam))
  return theta

In [73]:
##################################################
# Predict                                        #
##################################################

def performance(theta, X_test, y_test):
  scores = [inner(theta,x) for x in X_test]
  predictions = [s > 0 for s in scores]
  correct = [(a==b) for (a,b) in zip(predictions,y_test)]
  acc = sum(correct) * 1.0 / len(correct)
  return predictions, acc, scores

In [84]:
##################################################
# Validation pipeline                            #
##################################################

lam = 1.0

theta = train(lam)
train_predictions, train_acc, train_scores = performance(theta, X_train, y_train)
test_predictions, test_acc, test_scores = performance(theta, X_test, y_test)
val_predictions, val_acc, val_scores = performance(theta, X_val, y_val)
print("lambda = " + str(lam))
print("Train:\taccuracy=" + str(train_acc))
print("Test:\taccuracy=" + str(test_acc))
print("Validation:\taccuracy=" + str(val_acc))

lambda = 1.0
Train:	accuracy=0.7173886955478219
Test:	accuracy=0.7227289091563662
Validation:	accuracy=0.7157856842863143


### Question 2

In [75]:
tPos = 0
fPos = 0
tNeg = 0
fNeg = 0

for i in range(len(y_test)):
    if y_test[i] == True and test_predictions[i] == True:
        tPos += 1
    elif y_test[i] == False and test_predictions[i] == True:
        fPos += 1
    elif y_test[i] == False and test_predictions[i] == False:
        tNeg += 1
    else:
        fNeg += 1

print("True Positive: " + str(tPos))
print("False Positive: " + str(fPos))
print("True Negative: " + str(tNeg))
print("False Negative: " + str(fNeg))

True Positive: 9036
False Positive: 3274
True Negative: 3009
False Negative: 1347


### Question 3

In [76]:
test_predictions = list(zip(test_predictions, test_scores))
test_predictions = sorted(test_predictions, key=lambda x:x[1])


In [85]:
relevant = tPos + fNeg
retrieved = tPos + fPos

print(relevant)
print(retrieved)

10383
12310


### Question 4

### Question 5

In [116]:
### Network visualization ###

edges = set()
nodes = set()
for edge in open("egonet.txt"):
  x,y = edge.split()
  x,y = int(x),int(y)
  edges.add((x,y))
  edges.add((y,x))
  nodes.add(x)
  nodes.add(y)

G = nx.Graph()
for e in edges:
  G.add_edge(e[0],e[1])
nx.draw(G)
plt.show()
plt.clf()



AttributeError: 'set' object has no attribute 'sort'

In [106]:
connComponents = list(nx.connected_component_subgraphs(G))

In [111]:
for graph in connComponents:
    print(len(graph.nodes))

40
9
12


The graph has 3 connected components and the largest one has 40 nodes

### Question 6

In [112]:
largestCom = connComponents[0]

In [118]:
nodes

{697,
 703,
 708,
 713,
 719,
 729,
 745,
 747,
 753,
 769,
 772,
 774,
 776,
 798,
 800,
 803,
 804,
 805,
 810,
 811,
 819,
 823,
 825,
 828,
 830,
 840,
 856,
 857,
 858,
 859,
 861,
 862,
 863,
 864,
 865,
 866,
 867,
 868,
 869,
 870,
 871,
 872,
 873,
 874,
 876,
 877,
 878,
 879,
 880,
 881,
 882,
 884,
 885,
 886,
 887,
 888,
 889,
 890,
 893,
 894,
 895}

In [None]:
### Find all 3 and 4-cliques in the graph ###
cliques3 = set()
cliques4 = set()
for n1 in nodes:
  for n2 in nodes:
    if not ((n1,n2) in edges): continue
    for n3 in nodes:
      if not ((n1,n3) in edges): continue
      if not ((n2,n3) in edges): continue
      clique = [n1,n2,n3]
      clique.sort()
      cliques3.add(tuple(clique))
      for n4 in nodes:
        if not ((n1,n4) in edges): continue
        if not ((n2,n4) in edges): continue
        if not ((n3,n4) in edges): continue
        clique = [n1,n2,n3,n4]
        clique.sort()
        cliques4.add(tuple(clique))