### Importing Libraries

In [17]:
import numpy as np
import matplotlib.pyplot as plt

### Vectorizing Data Set

In [None]:
# Load the per-user activity table into a 2-D numpy array (one row per user).
# CSV column order: user_id, videos_watched, forum_posts, quizzes_done, git_commits, errors_fixed
# Only CSV columns 1-5 are read, so user_id is NOT part of the array:
# array column 0 is videos_watched and array column 4 is errors_fixed.
data = np.genfromtxt(
    "userdata.csv",
    usecols=(1, 2, 3, 4, 5),
    dtype=float,
    delimiter=",",
    skip_header=1,  # first line of the file is the column header
)
print(data.shape)
print(data)

In [None]:
# Row 0 of the array holds every feature recorded for user0.
print(data[0])

# Array column 2 is quizzes_done (user_id was not loaded, so indices start at videos_watched).
print(data[0, 2])

### Calculating cosine distance

In [20]:
def cosineDist(a, b):
    """Return the cosine distance ``1 - cos(theta)`` between vectors ``a`` and ``b``.

    The result depends only on the angle between the vectors, not on their
    lengths: 0 means same direction, 1 means orthogonal, 2 means opposite.

    Parameters
    ----------
    a, b : array_like
        1-D vectors of equal length.

    Returns
    -------
    numpy.float64
        Distance in the closed interval [0, 2].  If either vector has zero
        norm the angle is undefined; the similarity is then treated as 0 and
        the distance returned is 1.0 (instead of ``nan`` from a 0/0 division).
    """
    norm_product = np.linalg.norm(a) * np.linalg.norm(b)
    if norm_product == 0:
        # An all-zero vector has no direction; avoid 0/0 and report "no similarity".
        return np.float64(1.0)
    similarity = np.dot(a, b) / norm_product
    # Rounding can push the ratio marginally outside [-1, 1], which would
    # otherwise produce a tiny negative distance for parallel vectors.
    return 1 - np.clip(similarity, -1.0, 1.0)

# Cosine distance between user0 and user1 (rows 0 and 1 of `data`).
print(cosineDist(data[0], data[1]))

# Question: what does cosine distance represent in this example?

0.016604602435499327


### Calculating Minkowski distance

In [None]:
def minkowskiDistance(a, b, p):
    """Return the Minkowski distance of order ``p`` between ``a`` and ``b``.

    Computes ``(sum(|a_i - b_i| ** p)) ** (1 / p)``.  ``p = 1`` gives the
    Manhattan distance, ``p = 2`` the Euclidean distance, and ``p = inf``
    the Chebyshev distance (largest absolute coordinate difference).

    Parameters
    ----------
    a, b : array_like
        Vectors of equal length (lists or numpy arrays).
    p : float
        Order of the distance; must be greater than 0.  May be ``inf``.

    Returns
    -------
    numpy.float64
        The distance between ``a`` and ``b``.

    Raises
    ------
    ValueError
        If ``p`` is not greater than 0 (this also rejects ``nan``).
    """
    if not p > 0:
        raise ValueError(f"p must be greater than 0, got {p!r}")
    # Coerce to float arrays so plain lists work and unsigned integer
    # inputs cannot wrap around on subtraction.
    diff = np.abs(np.asarray(a, dtype=float) - np.asarray(b, dtype=float))
    if np.isinf(p):
        # The general formula degenerates for p = inf (x ** 0 == 1); the
        # limit of the p-norm is the maximum absolute difference.
        return diff.max(initial=0.0)
    return np.sum(diff ** p) ** (1 / p)

# With p = 2 the Minkowski distance is the Euclidean distance between user0 and user1.
print(minkowskiDistance(data[0], data[1], 2))

# Question: what does Euclidean distance represent in this example?

In [None]:
# Largest Euclidean distance (Minkowski with p = 2) from user0 to any user.
# user0 itself is included in the scan; its distance of 0 cannot be the
# maximum unless every row equals user0's row.
distances = [minkowskiDistance(data[0], row, 2) for row in data]
print(max(distances))

### Plotting Euclidean distance vs cosine distance

In [None]:
# Scatter user0's Euclidean distance against its cosine distance to every other user.

n = len(data)
u0 = data[0]

# Rows 1..n-1 are the other users; row 0 (user0 itself) is skipped.
euclidean_vals = np.array([minkowskiDistance(u0, data[k], 2) for k in range(1, n)])
cosine_vals = np.array([cosineDist(u0, data[k]) for k in range(1, n)])

# One point per user: x = Euclidean distance, y = cosine distance.
fig, ax = plt.subplots(figsize=(7, 5))
ax.scatter(euclidean_vals, cosine_vals, alpha=0.6)
ax.set_xlabel("Euclidean Distance from User0")
ax.set_ylabel("Cosine Distance from User0")
ax.set_title("Euclidean Distance vs Cosine Distance Between Users with User0")
ax.grid(True)
plt.show()


### Jaccard Similarity

In [None]:
# Jaccard similarity compares sets, so every feature has to become present/absent.
# Rule used here: a feature is present (1) when its value is at least 3, otherwise absent (0).

binary_data = np.greater_equal(data, 3).astype(int)
print(binary_data)
def jaccard_similarity(a, b):
    """Return the Jaccard similarity of two binary (0/1) integer arrays.

    The score is the number of positions set in both arrays divided by the
    number of positions set in at least one of them.  When neither array has
    any position set the ratio is undefined and 0.0 is returned.
    """
    shared = np.sum(a & b)
    combined = np.sum(a | b)
    if combined == 0:
        return 0.0
    return shared / combined

# Jaccard similarity of every user w.r.t. user 0, computed for all rows at once.
u0 = binary_data[0]

# Broadcasting u0 against each row: count the features present in both
# (intersection) and in at least one (union) for every user.
intersection = np.sum(binary_data & u0, axis=1)
union = np.sum(binary_data | u0, axis=1)

# A user with no present features compared against a user0 with none gives
# union == 0.  Plain `intersection / union` would emit a RuntimeWarning and
# produce nan there; instead leave those entries at 0.0, the same convention
# jaccard_similarity() uses for an empty union.
jaccard_scores = np.divide(
    intersection,
    union,
    out=np.zeros(union.shape, dtype=float),
    where=union != 0,
)

print(jaccard_scores)

[1.         1.         1.         0.66666667 0.75       1.
 1.         1.         0.75       1.         0.2        0.2
 0.2        0.2        0.2        0.2        0.2        0.2
 0.2        0.2        0.33333333 0.         0.33333333 0.
 0.33333333 0.         0.33333333 0.         0.         0.33333333
 0.4        0.5        0.4        0.4        0.4        0.66666667
 0.4        0.5        0.4        0.4        1.         0.5
 1.         0.5        1.         0.5        0.75       0.66666667
 0.75       1.         0.6        0.6        0.6        0.6
 0.6        0.6        0.6        0.6        0.6        0.6       ]
