In [None]:
# Install extra dependencies for this notebook: gdown (pinned with ==3), used by
# the download cells below, and plotly, which the imports cell loads.
!pip install gdown==3
!pip install plotly

In [None]:
import gdown
# Shareable Drive link for the file fetched below:
# https://drive.google.com/file/d/1lGeT7uzyLF1a5nL3rFuKPaSSP3ZIDAKK/view?usp=sharing
url = 'https://drive.google.com/uc?id=1lGeT7uzyLF1a5nL3rFuKPaSSP3ZIDAKK'
output = 'merged_data.csv'
# Download the Drive file to `output`; a later cell reads merged_data.csv with pd.read_csv.
gdown.download(url, output, quiet=False)

In [None]:

import glob
import warnings
import numpy as np 
import pandas as pd
import plotly as py
import seaborn as sns
import statistics as stat
import plotly.express as px
import plotly.graph_objs as go
warnings.filterwarnings("ignore")
pd.set_option('display.max_columns', None)
from plotly.offline import init_notebook_mode
init_notebook_mode(connected = True)
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
#@title Get Data
# Earlier command-line form of the same download, kept for reference:
# !gdown "https://drive.google.com/file/d/1QGYjTxrsRwMjQ73R_k02E7rrN5YCWT0x/view?usp=sharing"
import gdown

# `url` and `output` are reassigned here; they previously pointed at merged_data.csv.
url = 'https://drive.google.com/uc?id=1QGYjTxrsRwMjQ73R_k02E7rrN5YCWT0x'
output = 'data.zip'
# Download the archive to data.zip; the next cell unzips it.
gdown.download(url, output, quiet=False)

In [None]:
!unzip 'data.zip'

In [None]:
# Install causalnex, which provides the NOTEARS structure learning and plotting helpers used below.
# NOTE(review): the install is unpinned, while the plotting cells call
# `.draw(format='png')` on the object returned by plot_structure — confirm the
# installed release still supports that call and pin the version once verified.
!pip install causalnex

In [None]:
from sklearn.preprocessing import LabelEncoder

def check_numeric(df: pd.DataFrame) -> tuple:
    """Copy a DataFrame and list its non-numeric columns.

    The list of non-numeric column labels is also printed.

    Args:
        df (pd.DataFrame): DataFrame to be checked for non-numeric columns.
            It is not modified; all work happens on a copy.

    Returns:
        tuple: ``(struct_data, non_numeric_columns)`` where ``struct_data``
        (pd.DataFrame) is a copy of ``df`` and ``non_numeric_columns`` (list)
        holds the labels of the columns whose dtype is not a NumPy numeric
        type.
    """
    struct_data = df.copy()

    # Every column whose dtype is not a subtype of np.number (e.g. object
    # columns holding strings) is reported as non-numeric.
    non_numeric_columns = list(struct_data.select_dtypes(exclude=[np.number]).columns)
    print(non_numeric_columns)
    return struct_data, non_numeric_columns


def label_encode(struct_data: pd.DataFrame, non_numeric_columns: list) -> pd.DataFrame:
    """Replace the listed columns with their ``LabelEncoder`` codes.

    The columns are overwritten on ``struct_data`` itself, so the DataFrame
    passed in is modified in place and that same object is returned.

    Args:
        struct_data (pd.DataFrame): DataFrame whose columns are encoded.
        non_numeric_columns (list): Labels of the (non-numeric) columns to
            encode.

    Returns:
        pd.DataFrame: ``struct_data`` with every listed column label encoded.
    """
    encoder = LabelEncoder()
    for column_name in non_numeric_columns:
        # fit_transform refits the shared encoder, so each column gets its
        # own independent mapping.
        encoded_values = encoder.fit_transform(struct_data[column_name])
        struct_data[column_name] = encoded_values
    return struct_data


In [None]:
# Copy the district engagement data, list its non-numeric columns, then label encode them.
# NOTE(review): `district_engagement` is not defined in any visible cell above —
# confirm where it is loaded, otherwise this cell raises NameError on a fresh kernel.
df, non_numeric_cols = check_numeric(district_engagement)
df = label_encode(df, non_numeric_cols)

In [None]:
from causalnex.plots import plot_structure, NODE_STYLE, EDGE_STYLE
from causalnex.structure import notears
from causalnex.structure.notears import from_pandas, from_pandas_lasso

def construct_structural_model(df: pd.DataFrame, notears=from_pandas_lasso, tabu_parent_nodes=None,
                               beta=0.8, w_threshold=1.0):
    """Constructs structural model to be used to draw causal graph

    Args:
        df (pd.DataFrame): Preprocessed DataFrame that will construct structural model
        notears (callable, optional): Structure learning function, called as
            ``notears(df, ...)``. Defaults to from_pandas_lasso. Inside this
            function the name refers to this callable, not to the
            ``causalnex.structure.notears`` module.
        tabu_parent_nodes (list) : List of features to not be the causes
        beta (float, optional): Forwarded as ``beta`` to the learner, but only
            when the learner accepts it. Defaults to 0.8.
        w_threshold (float, optional): Forwarded as ``w_threshold`` to the
            learner. Defaults to 1.0.

    Returns:
        The structural model returned by the ``notears`` callable.
    """
    # Local import keeps the helper self-contained within this cell.
    import inspect

    learner_kwargs = {"w_threshold": w_threshold, "tabu_parent_nodes": tabu_parent_nodes}

    # `beta` is specific to the lasso variant; passing it unconditionally makes
    # learners without that parameter (such as from_pandas) fail with TypeError.
    try:
        learner_params = inspect.signature(notears).parameters.values()
    except (TypeError, ValueError):
        # Signature cannot be inspected: keep the historical behaviour.
        takes_beta = True
    else:
        takes_beta = any(
            param.name == "beta" or param.kind is inspect.Parameter.VAR_KEYWORD
            for param in learner_params
        )
    if takes_beta:
        learner_kwargs["beta"] = beta

    structural_model = notears(df, **learner_kwargs)
    return structural_model


def draw_graph(structural_model, path, prog="dot"):
    """Draws Causal graph and writes it to disk as a PNG

    Args:
        structural_model: Structural model of causalnex, as accepted by
            ``plot_structure``.
        path: Destination file for the PNG image; opened in binary write mode.
        prog (str, optional): Graphviz layout program used to render the
            pygraphviz graph. Defaults to "dot".

    Returns:
        image (IPython.display.Image) : Causal graph img
    """
    # Imported here so the function does not depend on a later import cell
    # having been executed first.
    from IPython.display import Image

    viz = plot_structure(
        structural_model,
        graph_attributes={"scale": "2", "size": "2.5"},
        all_node_attributes=NODE_STYLE.WEAK,
        all_edge_attributes=EDGE_STYLE.WEAK,
    )
    # Forward `prog` so the requested layout program is actually applied;
    # previously the argument was accepted but never used.
    img = Image(viz.draw(format="png", prog=prog))

    # TODO convert print log to use logger
    print("writing graph image")
    with open(str(path), "wb") as png:
        png.write(img.data)

    return img

In [None]:
from IPython.display import Image
from causalnex.plots import plot_structure, NODE_STYLE, EDGE_STYLE
from causalnex.structure.notears import from_pandas, from_pandas_lasso
import pygraphviz

In [None]:
# Load merged_data.csv (downloaded at the top of the notebook) into a DataFrame.
merged=pd.read_csv("merged_data.csv")

In [None]:
# Copy `merged`, list its non-numeric columns, and label encode them.
# label_encode returns the frame it was given, so after this cell `merged` and
# `encoded` both refer to the encoded copy rather than the raw CSV contents.
encoded, non_numeric_cols = check_numeric(merged)
merged = label_encode(encoded, non_numeric_cols)

In [None]:
# Preview the first rows of the label-encoded frame.
merged.head()

In [None]:
# Only the last expression of a cell is displayed, so a bare `merged.shape` on
# the first line was evaluated and then dropped; print it explicitly.
print(merged.shape)
list(merged.columns)

In [None]:
# Take an independent copy of the columns of `merged` that the following cells work with.
select = merged[
    [
        'engagement_index',
        'pct_black/hispanic',
        'pct_free/reduced',
        'pp_total_raw',
        'pct_access',
        'Sector(s)',
        'primary_function_main',
        'primary_function_sub',
        'Provider/Company Name',
        'locale',
    ]
].copy()

In [None]:
# Preview the first rows of the selected columns.
select.head()

In [None]:
# Check the selected columns for non-numeric dtypes and label encode any that
# are found. The result is stored in `encoded`; `select` itself is left
# untouched because check_numeric works on a copy.
encoded, non_numeric_cols = check_numeric(select)
encoded= label_encode(encoded, non_numeric_cols)

In [None]:
# Build the modelling frame from `encoded` (the label-encoded selection produced
# by the previous cell) rather than from `select`, so that encoding step is not
# silently discarded. Only the first 3,000,000 rows are kept.
model_df = encoded.iloc[:3_000_000]
# Keep only the rows that have an engagement_index value.
model_df = model_df[model_df['engagement_index'].notnull()]
# Count of remaining nulls per column.
model_df.isnull().sum()

In [None]:
# Learn a structure from the first 1000 rows of `model_df` (w_threshold=0.8),
# then render it as a PNG shown as the cell output.
sm_1000 = from_pandas(model_df.iloc[:1000], w_threshold=0.8)
viz = plot_structure(
    sm_1000,
    all_edge_attributes=EDGE_STYLE.WEAK,
    all_node_attributes=NODE_STYLE.WEAK,
    graph_attributes=dict(scale="2.0", size=2.5),
)
Image(viz.draw(format="png"))

In [None]:
# Install Graphviz and the development packages listed below via apt (-y is passed to apt).
# NOTE(review): `import pygraphviz` and the `viz.draw(...)` call appear in
# earlier cells, so on a fresh kernel this install probably has to run before
# them — confirm and move this cell up.
! apt install python3-dev graphviz libgraphviz-dev pkg-config -y

In [None]:
# Install the pygraphviz Python package.
# NOTE(review): an earlier cell already runs `import pygraphviz`, so on a fresh
# kernel this install probably needs to come before that import — confirm and
# move this cell up.
!pip install pygraphviz

In [None]:
# Preview the first rows of the modelling frame.
model_df.head()