### Question - 1
A Social Media Influencer collected data on Facebook friend requests and used
a supervised algorithm to predict whether a user would accept a friend request or
not. Dataset This is the Dataset You can use this dataset for this question. Note : Use
only Dask and Use MLflow

In [1]:
import dask.dataframe as dd
from dask_ml.model_selection import train_test_split
from dask_ml.linear_model import LogisticRegression
import mlflow
import mlflow.sklearn
import networkx as nx

In [2]:
import zipfile

# Unzip the file
with zipfile.ZipFile("/content/drive/MyDrive/ML/ADVANCE/ADV_ML_Q1/train.csv.zip", "r") as zip_ref:
    zip_ref.extractall("/content/")


In [3]:
mlflow.start_run(nested=True)

<ActiveRun: >

In [4]:
df = dd.read_csv("/content/train.csv")

In [5]:
df.head()

Unnamed: 0,source_node,destination_node
0,1,690569
1,1,315892
2,1,189226
3,2,834328
4,2,1615927


In [6]:
# Convert the source and destination nodes to Dask arrays
source_nodes = df['source_node'].to_dask_array(lengths=True)
destination_nodes = df['destination_node'].to_dask_array(lengths=True)

# Create a directed graph using NetworkX
G = nx.DiGraph()
edges = [(source, destination) for source, destination in zip(source_nodes.compute(), destination_nodes.compute())]
G.add_edges_from(edges)

In [7]:
# Calculate Common Neighbors
def common_neighbors_score(G, u, v):
    common_neighbors = set(G.successors(u)).intersection(G.successors(v))
    return len(common_neighbors)

In [8]:
# Calculate Jaccard's Coefficient
def jaccard_coefficient_score(G, u, v):
    u_neighbors = set(G.successors(u))
    v_neighbors = set(G.successors(v))
    intersection = u_neighbors.intersection(v_neighbors)
    union = u_neighbors.union(v_neighbors)
    return len(intersection) / len(union)


In [9]:
# Calculate Preferential Attachment
def preferential_attachment_score(G, u, v):
    u_neighbors = set(G.successors(u))
    v_neighbors = set(G.successors(v))
    return len(u_neighbors) * len(v_neighbors)

In [None]:
# Calculate scores for all pairs of nodes
for u, v in G.edges():
    common_neighbors = common_neighbors_score(G, u, v)
    jaccard_coefficient = jaccard_coefficient_score(G, u, v)
    preferential_attachment = preferential_attachment_score(G, u, v)
    
    # Log scores and other relevant information using MLflow
    with mlflow.start_run(nested=True):
        mlflow.log_param("Source Node", u)
        mlflow.log_param("Destination Node", v)
        mlflow.log_metric("Common Neighbors Score", common_neighbors)
        mlflow.log_metric("Preferential Attachment Score", preferential_attachment)
    
    print(f"Nodes {u} and {v}:")
    print(f"Common Neighbors Score: {common_neighbors}")
    print(f"Jaccard's Coefficient Score: {jaccard_coefficient}")
    print(f"Preferential Attachment Score: {preferential_attachment}")
    print("-------------")

# End the MLflow run
mlflow.end_run()

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Nodes 946366 and 84:
Common Neighbors Score: 1
Jaccard's Coefficient Score: 0.023255813953488372
Preferential Attachment Score: 483
-------------
Nodes 946366 and 1572098:
Common Neighbors Score: 5
Jaccard's Coefficient Score: 0.15151515151515152
Preferential Attachment Score: 345
-------------
Nodes 946366 and 995833:
Common Neighbors Score: 1
Jaccard's Coefficient Score: 0.03571428571428571
Preferential Attachment Score: 138
-------------
Nodes 946366 and 1117552:
Common Neighbors Score: 1
Jaccard's Coefficient Score: 0.03225806451612903
Preferential Attachment Score: 207
-------------
Nodes 946366 and 1202832:
Common Neighbors Score: 2
Jaccard's Coefficient Score: 0.07142857142857142
Preferential Attachment Score: 161
-------------
Nodes 164496 and 20582:
Common Neighbors Score: 0
Jaccard's Coefficient Score: 0.0
Preferential Attachment Score: 72
-------------
Nodes 164496 and 913164:
Common Neighbors Score: 0
Jaccard'