This notebook needs internet access to install pyspark.

Advantage of pyspark: we can load all the csv files at once and query them in SQL style. This makes it easy to filter the data, and it stays efficient when the data is updated.  

Package Documentation: https://spark.apache.org/docs/latest/api/python/index.html

In [None]:
!pip install -q pyspark
!apt-get install -y openjdk-11-jdk-headless -qq > /dev/null

In [None]:
# Import from the public IPython.display API and bring `display` in explicitly
# instead of relying on the name IPython injects into interactive sessions.
from IPython.display import HTML, display

# Force <pre> blocks to keep their whitespace unwrapped -- presumably so that
# wide text tables printed later are not broken across lines.
display(HTML("<style>pre { white-space: pre !important; }</style>"))

In [None]:
import numpy as np
import pandas as pd

import time
import os

from tqdm.notebook import tqdm

In [None]:
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-11-openjdk-amd64"

!java -version

In [None]:
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.types import StringType, IntegerType, DoubleType, StructField, StructType, DateType
from pyspark.sql.functions import array, col, explode, lit, create_map
import pyspark.sql.functions as F
from pyspark.sql.window import Window
from pyspark.sql.functions import pandas_udf, PandasUDFType

from itertools import chain

In [None]:
def count_na(data, normalize=False):
    """Count the missing values in every column of a Spark DataFrame.

    A value counts as missing when it is null or, for columns whose type is
    checked with ``isnan`` (float, double and string), when it is NaN.

    Parameters
    ----------
    data : pyspark.sql.DataFrame
        Frame to inspect; ``data.dtypes`` decides which check each column gets.
    normalize : bool, default False
        When True, every count is divided by ``data.count()`` so the result
        holds the fraction of missing rows instead of the absolute number.

    Returns
    -------
    pyspark.sql.DataFrame
        Frame produced by a single ``select`` of aggregates, with one column
        per input column, each named after the column it describes.
    """
    # Types that get the extra `isnan` test.  float/double can hold NaN;
    # string stays in the list so string columns are checked exactly as
    # before.  Integer-like and decimal columns can never be NaN, so skipping
    # the test does not change their result, while types that `isnan` does
    # not accept (date, timestamp, boolean, binary, array, map, struct, ...)
    # now fall back to a plain null check instead of failing.
    nan_checked_types = ('float', 'double', 'string')

    # Only pay for the row-count job when a fraction was requested.
    total_rows = data.count() if normalize else None

    aggregates = []
    for c, t in data.dtypes:
        is_missing = F.col(c).isNull()
        if t in nan_checked_types:
            is_missing = F.isnan(c) | is_missing
        # `when` without `otherwise` yields null for non-matching rows, and
        # `count` skips nulls, so this counts the rows flagged as missing.
        n_missing = F.count(F.when(is_missing, c))
        if normalize:
            n_missing = n_missing / total_rows
        aggregates.append(n_missing.alias(c))
    return data.select(aggregates)

In [None]:
# Show a sample of the input files: at most ten paths from each directory.
for folder, _subdirs, names in os.walk('/kaggle/input/DailyNAV'):
    sample_paths = [os.path.join(folder, name) for name in names[:10]]
    for path in sample_paths:
        print(path)

In [None]:
# Create (or reuse) a Spark session whose master is "local[*]".
spark = (
    SparkSession.builder
    .master("local[*]")
    .getOrCreate()
)

# Keep a handle on the underlying SparkContext and show it as the cell output.
sc = spark.sparkContext
sc

## Loading Dataset

In [None]:
# Explicit schema for the DailyNAV csv files: (column name, Spark type) pairs
# in file order, each declared with nullable=False.
data_schema = StructType([
    StructField(column_name, column_type, False)
    for column_name, column_type in (
        ('Scheme_Code', IntegerType()),
        ('Scheme_Name', StringType()),
        ('ISIN Div Payout/ISIN Growth', StringType()),
        ('ISIN Div Reinvestment', StringType()),
        ('NAV', DoubleType()),
        ('Repurchase Price', DoubleType()),
        ('Sale Price', DoubleType()),
        ('Date', DateType()),
    )
])

# Read the csv input directory with that schema (first line of each file is a
# header), mark the resulting frame for caching, then print its schema.
data = spark.read.csv('/kaggle/input/DailyNAV', header=True, schema=data_schema).cache()
data.printSchema()

In [None]:
# Number of partitions of the RDD behind `data`, shown as the cell output.
data.rdd.getNumPartitions()

In [None]:
# Print a preview of the loaded rows.
data.show()

### Validating All Days

In [None]:
%%time
# Bring the distinct dates, sorted ascending, to the driver as a pandas Series
# and convert them to pandas datetimes in one step.
unique_date = pd.to_datetime(
    data.select('Date')
    .distinct()
    .orderBy('Date')
    .toPandas()['Date']
)
unique_date

In [None]:
# First and last observed date.
min_date = unique_date.min()
max_date = unique_date.max()

# Every calendar day between the first and the last observed date.
date_range = pd.date_range(min_date, max_date, freq='1D')

# The observed dates must be exactly the full daily range.  The comparison is
# still set-based (same pass/fail condition as a plain set equality), but the
# two differences are kept so a failure names the offending days instead of
# raising a bare AssertionError.
expected_days = set(date_range)
observed_days = set(unique_date)
missing_days = sorted(expected_days - observed_days)
unexpected_days = observed_days - expected_days
missing_preview = ', '.join(day.strftime('%Y-%m-%d') for day in missing_days[:5])
assert not missing_days and not unexpected_days, (
    f'{len(missing_days)} day(s) missing from the data (first few: {missing_preview}); '
    f'{len(unexpected_days)} observed value(s) fall outside the daily range'
)

# From here on the bounds are kept as formatted strings.
min_date = min_date.strftime("%Y-%m-%d")
max_date = max_date.strftime("%Y-%m-%d")

print(f'Data contains all days between {min_date} and {max_date}')

### Filter for a date

In [None]:
%%time
# Keep only the rows whose Date equals 2020-12-05 and print a preview.
data_single_day = data.filter('Date == "2020-12-05"')
data_single_day.show()

### Filter on Latest Data

In [None]:
%%time
# Aggregate the maximum Date, take the first collected row and format the
# date as an ISO string.
latest_row = data.selectExpr('max(Date) as Date').collect()[0]
latest_date = latest_row['Date'].strftime("%Y-%m-%d")
print(f'latest_date: {latest_date}')

# Rows recorded on that most recent date.
latest_data = data.filter(f'Date == "{latest_date}"')
latest_data.show()

### Filter for a Mutual Fund

In [None]:
%%time
# All rows of one scheme, matched on its exact name and ordered by Date.
axis_long_eq = (
    data
    .filter('Scheme_Name == "Axis Long Term Equity Fund - Direct Plan - Growth Option"')
    .orderBy('Date')
)
axis_long_eq.show()

### Convert into Pandas

CAUTION: pyspark doesn't keep the data in memory, but pandas does, so the command below is memory-intensive.

In [None]:
%%time
# Materialise the single-fund Spark frame as a pandas DataFrame.
pandas_data = axis_long_eq.toPandas()

# Replace the Date column with its pandas-datetime version.
parsed_dates = pd.to_datetime(pandas_data['Date'])
pandas_data['Date'] = parsed_dates
pandas_data

In [None]:
# Print pandas' summary of the converted frame.
pandas_data.info()

In [None]:
# Rows whose NAV is exactly 0, selected with a boolean mask.
pandas_data[pandas_data['NAV'] == 0]

In [None]:
# A NAV of exactly 0 is treated as a bad reading: overwrite it with NaN so
# the value counts as missing (the row itself is kept).
zero_nav_mask = pandas_data['NAV'] == 0
pandas_data.loc[zero_nav_mask, 'NAV'] = np.nan

In [None]:
# Plot NAV against Date on a 20x10 figure; the trailing ';' suppresses the
# textual repr of the returned object.
pandas_data.set_index('Date')['NAV'].plot(figsize=(20, 10));