# Cleaning Data

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import networkx as nx
import random
import pickle
import seaborn as sns

In [None]:
clean = pd.read_csv('data.csv')

In [None]:
clean.head()

In [None]:
clean.info()

In [None]:
most_common_symbol = clean['symbol'].value_counts()
most_common_symbol

In [None]:
# Ask which ticker to analyse. Surrounding whitespace is stripped so that a
# stray space does not make the symbol filter that follows match nothing.
user_choice1 = input("Enter the company stock you would like to check: ").strip()

In [None]:
# Keep only the rows of the requested symbol. `.copy()` detaches the result
# from the full table so that later column assignments act on an independent
# frame instead of triggering pandas' SettingWithCopyWarning.
clean = clean[clean['symbol'] == user_choice1].copy()
if clean.empty:
    # Fail here with a clear message rather than later on an empty date range.
    raise ValueError(f"No rows found for symbol {user_choice1!r}")
clean

In [None]:
# Rows whose f_date occurs more than once; every occurrence is kept here so
# the repeated dates can be inspected.
is_repeated_date = clean['f_date'].duplicated(keep=False)
duplicates = clean[is_repeated_date]

# Keep only the first row seen for each f_date.
clean = clean[~clean.duplicated(subset='f_date')]

In [None]:
# Parse f_date into real datetimes. The column is replaced through `assign`
# rather than `clean.loc[:, 'f_date'] = ...`: on pandas 2.x the `.loc[:, col]`
# form writes into the existing object-dtype column (so the dtype stays
# `object`), and on a filtered frame it raises SettingWithCopyWarning.
clean = clean.assign(f_date=pd.to_datetime(clean['f_date']))
clean['f_date'].info()
clean.head()

In [None]:
# Index the rows by date and insert a row for every calendar day between the
# first and last date; days that were absent become all-NaN rows.
clean = clean.set_index('f_date')
clean.index = pd.to_datetime(clean.index)
first_day = clean.index.min()
last_day = clean.index.max()
all_dates = pd.date_range(start=first_day, end=last_day, freq='D')
# infer_objects is called here, but a FutureWarning is reportedly still shown.
clean = clean.reindex(all_dates).infer_objects()
clean

In [None]:
# Remove the columns that are not used in the rest of the analysis.
clean = clean.drop(columns=["Unnamed: 0", "sector", "percent_change"])

In [None]:
clean.head()

In [None]:
# Daily average price: row-wise mean of the open, high, low and close columns.
ohlc = clean[['open', 'high', 'low', 'close']]
clean['avg'] = ohlc.mean(axis='columns')
clean.head()

In [None]:
# Forward-fill: every NaN takes the most recent non-NaN value above it in the
# same column (this applies to all columns, not only the prices).
clean = clean.ffill()
clean.head()

In [None]:
clean.reset_index(inplace=True)

In [None]:
clean.head()

In [None]:
# Rename the column called 'index' to f_date.
clean = clean.rename(columns={'index': 'f_date'})
clean

# Creation of Visibility Graph

In [None]:
# Build one flat feature row per sliding window: for each start position the
# window's avg, volume, open, high, low and close values are concatenated in
# that order.
rows = []

window_size = int(input("Enter the sliding window days: "))  # window length in rows (days)
if window_size < 1:
    raise ValueError("window_size must be a positive integer")
if window_size >= len(clean):
    # No window would be produced and every later step would run on an
    # empty table.
    raise ValueError(
        f"window_size ({window_size}) must be smaller than the number of rows ({len(clean)})"
    )

# Pull each column out as an array once instead of re-slicing the DataFrame
# six times per iteration.
feature_order = ['avg', 'volume', 'open', 'high', 'low', 'close']
feature_arrays = [clean[name].to_numpy() for name in feature_order]

# The loop stops one window short of the end, so a following day always exists
# after each window. The disabled future-day target needed that day:
#     target = clean['avg'].iloc[i + window_size]
for i in range(len(clean) - window_size):
    row = []
    for values in feature_arrays:
        row.extend(values[i:i + window_size])
    rows.append(row)

In [None]:
rows

In [None]:
# Column names: every window position (numbered from 1) of one feature, then
# the next feature, in the order avg, volume, open, high, low, close.
feature_names = ('avg', 'volume', 'open', 'high', 'low', 'close')
columns = [
    f'{feature}_{position}'
    for feature in feature_names
    for position in range(1, window_size + 1)
]
data = pd.DataFrame(rows, columns=columns)
print(data)

In [None]:
data['volume_2'] # verification of existence

In [None]:
data.head()

In [None]:
# Choose which feature's window columns are explored from here on.
column_prefix = input("Enter the field you would like to explore:\n close, open, high, avg, volume, low: \n").strip().lower()
valid_prefixes = ('close', 'open', 'high', 'avg', 'volume', 'low')
if column_prefix not in valid_prefixes:
    # An unknown prefix would otherwise surface as a KeyError on data[...].
    raise ValueError(f"Unknown field {column_prefix!r}; expected one of {valid_prefixes}")
column_names = [f'{column_prefix}_{i}' for i in range(1, window_size + 1)]
print(data[column_names])

In [None]:
from tqdm import tqdm  # for progress tracking
data[column_names]

In [None]:
data[column_names].info()

In [None]:
column_names

In [None]:
data[column_names].values

In [None]:
# Build the visibility graph for a single window (one row of the table).

def for_graph(row):
    """Return the visibility graph of the selected feature's values in `row`.

    Only the columns listed in the module-level `column_names` are read, in
    that order; their values form the series given to `nx.visibility_graph`.

    Two post-processing steps were tried and are left disabled:
    relabelling the nodes with the row values instead of their positions
    (`nx.relabel_nodes(G, {i: val for i, val in enumerate(series)})`) and
    removing self loops (`G.remove_edges_from(nx.selfloop_edges(G))`).
    """
    window_series = row[column_names].values
    return nx.visibility_graph(window_series)

# Batch size passed to process_in_batches. The name is romanised Nepali,
# roughly "whatever number you write is fine" (translation approximate).
jati_lekhdaa_ni_huncha = 100

In [None]:
# Build the visibility graphs batch by batch so that progress is reported per
# batch. Every graph is still kept in the returned list, so batching by itself
# does not lower peak memory.

def process_in_batches(df, batch_size = jati_lekhdaa_ni_huncha):
    """Create one visibility graph per row of `df`.

    Parameters
    ----------
    df : DataFrame
        Each row is passed to `for_graph`.
    batch_size : int
        Number of rows handled per progress-bar step; must be at least 1.

    Returns
    -------
    list
        One graph per row of `df`, in row order.

    Raises
    ------
    ValueError
        If `batch_size` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")

    visibility_graphs = []

    # Ceiling division. `len(df) // batch_size + 1` would schedule an extra,
    # empty batch whenever len(df) is an exact multiple of batch_size.
    num_batches = -(-len(df) // batch_size)

    for i in tqdm(range(num_batches)):
        batch_df = df.iloc[i * batch_size : (i + 1) * batch_size]

        # one graph per row of the current batch
        visibility_graphs.extend(for_graph(row) for _, row in batch_df.iterrows())

    return visibility_graphs

In [None]:
# generating visibility graphs for the entire dataset (in batches)
visibility_graphs = process_in_batches(data, batch_size = jati_lekhdaa_ni_huncha)

In [None]:
# save visibility graphs for future use (serialization)
with open('visibility_graphs.pkl', 'wb') as f:
    pickle.dump(visibility_graphs, f)

In [None]:
len(visibility_graphs)

In [None]:
visibility_graphs

In [None]:
sampled_graphs = visibility_graphs[:5]  # the first five graphs (indices 0 to 4), plotted in order
# to plot all of them, use the whole `visibility_graphs` list instead of the [:5] slice

# NOTE: the loop variable G stays bound to the last plotted graph after the
# loop finishes and is read again by a later cell.
for i, G in enumerate(sampled_graphs):

    # Nodes are laid out on a circle. Alternatives that were considered:
    # nx.shell_layout(G) and nx.spring_layout(G, k = 0.8, iterations = 50).
    # (An unused `layout_params` dict that used to be built here was removed.)
    pos = nx.circular_layout(G)

    plt.figure(figsize=(8, 8))
    plt.xlabel("Time", fontsize = 14)
    plt.ylabel("Value", fontsize = 14)
    plt.margins(0.20)

    nx.draw_networkx_nodes(G, pos, alpha = 0.3, node_size = 300, node_color = 'green')
    nx.draw_networkx_labels(G, pos, font_size = 10, font_color = 'black', font_weight = 'normal')
    nx.draw_networkx_edges(G, pos, alpha = 0.6, width = 1, edge_color='blue', style = 'solid', arrows = True, arrowstyle = '-')

    plt.title(f"Visibility Graph {i}", fontsize = 12)
    plt.axis("equal")
    plt.tight_layout()
    plt.show()
    plt.close()  # close the figure after displaying it

In [None]:
# Read the "value" attribute of every node of G. G is whichever graph was
# bound to that name most recently (the last graph of the plotting loop if
# that cell ran last).
labels = nx.get_node_attributes(G, "value")
labels

# Classifying the Feature

In [None]:
# A graph has been built for every row of data[column_names].
# (The notes below use the avg feature and a window size of 20 as the example.)
# The avg_1 column does not contain the last 20 values of the series.
# The avg_20 column does not contain the first 19 values of the series.

# avg_1 holds the day-1 average and avg_20 holds the day-20 average.
# Directly below the first row's avg_20 (day 20) sits the day-21 average, the value an "avg_21" column would hold (this follows from how the sliding window works).
# So the class label records whether day 21 went up compared with day 20.
# increased = 2, same = 1, decreased = 0
# Accordingly, row 0 (yes, 0: the first row) is labelled 2, 1 or 0 based on the day-21 value, and the row then belongs to that class:
# the increasing class, the equal class, or the decreasing class.

In [None]:
# Work on an independent copy of the selected columns. A 'class' column is
# added and filled in afterwards, and those writes must not target a slice of
# `data` (which triggers SettingWithCopyWarning).
work_on = data[column_names].copy()
work_on

In [None]:
# Start every row in class 0. Plain column assignment always creates an
# integer column; `work_on.loc[:, ['class']] = 0` on a column that does not
# exist yet can create it as float64 on older pandas releases.
work_on['class'] = 0
work_on.head()

In [None]:
# Column the labels are derived from: the last window position of the chosen
# feature. It is taken from `column_names` directly rather than by position
# (`work_on.columns[-2]`), which is only right once 'class' has been appended
# as the final column.
work = column_names[-1]
work_on[work]

In [None]:
work_on.loc[1, work]

In [None]:
# Label each row by how the NEXT row's value in column `work` compares with
# this row's value (the day right after the window versus the window's last
# day):  2 = increased, 1 = unchanged, 0 = decreased.
# The row-by-row loop this replaces compared the two values the other way
# round, so 2 was assigned when the value fell.
# The final row has no successor: both comparisons are False there, so it
# keeps class 0.
current_value = work_on[work]
next_value = current_value.shift(-1)
work_on['class'] = np.select(
    [next_value > current_value, next_value == current_value],
    [2, 1],
    default=0,
)

In [None]:
work_on

In [None]:
verify_gara = work_on[[work, 'class']]
verify_gara

In [None]:
verify_gara['class'].value_counts()

In [None]:
# Pair every visibility graph with the class label (0, 1 or 2) of its row so
# that graph and label are stored together as (graph, label) tuples.
graph_dataset = list(zip(visibility_graphs, work_on['class']))
graph_dataset

In [None]:
# Save the labelled graph dataset as well, under a different file name so it
# does not collide with the previously saved file.
with open('graph_dataset_class_labeled.pkl', 'wb') as f: # 'wb' = open for writing in binary mode
    pickle.dump(graph_dataset, f)

In [None]:
# To load the dataset again for further analysis, use the following.
# NOTE: unpickling can execute arbitrary code; only load pickle files that
# come from a trusted source.
with open('graph_dataset_class_labeled.pkl', 'rb') as f: # 'rb' = open for reading in binary mode
    graph_dataset = pickle.load(f)

In [None]:
# Sanity-check the freshly loaded dataset by summarising its first entries.
# Slicing (instead of indexing 0..4) also works when fewer than five graphs
# exist, where the index-based loop raised IndexError.
for i, (graph, label) in enumerate(graph_dataset[:5]):
    print(f"Graph {i+1}: Class Label = {label}")
    print(f"Number of Nodes: {graph.number_of_nodes()}")
    print(f"Number of Edges: {graph.number_of_edges()}\n")

# Statistical Analysis

In [None]:
def extract_graph_features(graph):
    """Summarise a graph with a few structural statistics.

    Parameters
    ----------
    graph : networkx graph
        Must be connected when it has more than one node; networkx raises
        for disconnected graphs because the distances are then infinite.

    Returns
    -------
    dict
        Keys '#nodes', '#edges', 'diameter', 'center' (list of centre nodes)
        and 'radius'. For graphs with at most one node the distance-based
        entries are 0, [] and 0 respectively.
    """
    nodes = graph.number_of_nodes()
    edges = graph.number_of_edges()

    if nodes > 1:
        # Compute the eccentricities once and share them; calling diameter,
        # center and radius without `e` recomputes them three times.
        eccentricity = nx.eccentricity(graph)
        diameter = nx.diameter(graph, e=eccentricity)
        center = nx.center(graph, e=eccentricity)
        radius = nx.radius(graph, e=eccentricity)
    else:
        diameter, center, radius = 0, [], 0

    return {
        '#nodes': nodes,
        '#edges': edges,
        'diameter': diameter,
        'center': center,
        'radius': radius
    }

In [None]:
# One feature dict per (graph, label) pair, with the label stored under the
# 'class' key.
feature_data = [
    {**extract_graph_features(graph), 'class': label}
    for graph, label in graph_dataset
]

# Collect the per-graph features into a table.
features_df = pd.DataFrame(feature_data)
features_df

In [None]:
# Overlay one density estimate of the graph radius per class, then save and
# show the figure.
plt.figure(figsize=(10, 10))
for class_id, legend_label, line_color in ((0, 'Class 0', 'blue'), (1, 'Class 1', 'green'), (2, 'Class 2', 'red')):
    class_values = features_df.loc[features_df['class'] == class_id, 'radius']
    sns.kdeplot(class_values, label=legend_label, fill=False, color=line_color, alpha=0.9)

plt.title('Distribution of Radius by Class')
plt.xlabel('Radius')
plt.ylabel('Density')
plt.legend()
plt.grid(True)
plt.savefig('Distribution of Radius by Class', dpi=300, bbox_inches='tight')
plt.show()

In [None]:
# Overlay one density estimate of the graph diameter per class, then save and
# show the figure.
plt.figure(figsize=(10, 10))
for class_id, legend_label, line_color in ((0, 'Class 0', 'blue'), (1, 'Class 1', 'green'), (2, 'Class 2', 'red')):
    class_values = features_df.loc[features_df['class'] == class_id, 'diameter']
    sns.kdeplot(class_values, label=legend_label, fill=False, color=line_color, alpha=0.9)

plt.title('Distribution of Diameter by Class')
plt.xlabel('Diameter')
plt.ylabel('Density')
plt.legend()
plt.grid(True)
plt.savefig('Distribution of Diameter by Class', dpi=300, bbox_inches='tight')
plt.show()

In [None]:
# Count the centre nodes of every graph; each count is also printed.
number_of_centers = []

for center_nodes in features_df['center']:
    center_count = len(center_nodes)
    print(center_count)
    number_of_centers.append(center_count)

In [None]:
features_df['number_of_centers'] = number_of_centers
features_df.head()

In [None]:
# Overlay one density estimate of the number of centre nodes per class, then
# save and show the figure.
plt.figure(figsize=(10, 10))
for class_id, legend_label, line_color in ((0, 'Class 0', 'blue'), (1, 'Class 1', 'green'), (2, 'Class 2', 'red')):
    class_values = features_df.loc[features_df['class'] == class_id, 'number_of_centers']
    sns.kdeplot(class_values, label=legend_label, fill=False, color=line_color, alpha=0.9)

plt.title('Distribution of Number of Centers by Class')
plt.xlabel('Number of Centers')
plt.ylabel('Density')
plt.legend()
plt.grid(True)
plt.savefig('Distribution of Number of Centers by Class', dpi=300, bbox_inches='tight')
plt.show()

In [None]:
# Overlay one density estimate of the edge count per class, then save and show
# the figure.
plt.figure(figsize=(10, 10))
for class_id, legend_label, line_color in ((0, 'Class 0', 'blue'), (1, 'Class 1', 'green'), (2, 'Class 2', 'red')):
    class_values = features_df.loc[features_df['class'] == class_id, '#edges']
    sns.kdeplot(class_values, label=legend_label, fill=False, color=line_color, alpha=0.9)

plt.title('Distribution of Number of Edges by Class')
plt.xlabel('Number of Edges')
plt.ylabel('Density')
plt.legend()
plt.grid(True)
plt.savefig('Distribution of Number of Edges by Class', dpi=300, bbox_inches='tight')
plt.show()