In [1]:
import csv
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

You can find all the required data [here](https://drive.google.com/drive/folders/1lzP-lq_JaVupDBBOqxL-6WYd2LkHZZ2t?usp=sharing)

In [2]:
## create this two folders
IN_DIR = "data_from_sol_22_updated"
OUT_DIR = "frctal_applied_from_sol_22"

In [3]:
## this contains the data for all the years and month to form a continous flow of data combining multiple dtaa sources like from different satelites
## the data is chosen based on number_of_missing values in a given data source.
df_data = pd.read_csv("continuous_data.csv")
df_data

Unnamed: 0,file_names,satellite,year,month,total_rows,missing_values
0,1986/g05_xrs_1m_19860101_19860131.csv,goes05,1986,1,44640,617
1,1986/g05_xrs_1m_19860201_19860228.csv,goes05,1986,2,40320,203
2,1986/g05_xrs_1m_19860301_19860331.csv,goes05,1986,3,44640,2686
3,1986/g05_xrs_1m_19860401_19860430.csv,goes05,1986,4,43200,3646
4,1986/g05_xrs_1m_19860501_19860531.csv,goes05,1986,5,44640,517
...,...,...,...,...,...,...
403,2019/g14_xrs_1m_20191001_20191031.csv,goes14,2019,10,44640,986
404,2019/g14_xrs_1m_20191101_20191130.csv,goes14,2019,11,43200,161
405,2019/g15_xrs_1m_20191201_20191231.csv,goes15,2019,12,44640,36
406,2020/g15_xrs_1m_20200101_20200131.csv,goes15,2020,1,44640,584


In [4]:
def create_folder(year: int):
    #create folder to hold the data
    os.makedirs(f"{OUT_DIR}/{year}", exist_ok = True)

In [5]:
## the logic here is to fill the data using ffill method if total missing in a file is less than or equal 50 
for index, row in df_data.iterrows():
    if row.missing_values <= 50:
        df = pd.read_csv(os.path.join(IN_DIR, row.file_names))
        create_folder(row.year)
        before_filling = (df['xl'] == -99999).sum()
        
        #ffill data where missing_value count <=50
        df.replace(-99999, pd.NA, inplace=True)
        df.fillna(method='ffill', inplace=True)
        
        after_filling = (df['xl'] == -99999).sum()
        # save updated data
        df.to_csv(os.path.join(OUT_DIR, row.file_names), index = False)
        

In [6]:
df_data[df_data['missing_values'] <= 50]

Unnamed: 0,file_names,satellite,year,month,total_rows,missing_values
28,1988/g07_xrs_1m_19880501_19880531.csv,goes07,1988,5,44640,50
34,1988/g07_xrs_1m_19881101_19881130.csv,goes07,1988,11,43200,32
40,1989/g07_xrs_1m_19890501_19890531.csv,goes07,1989,5,44640,49
41,1989/g07_xrs_1m_19890601_19890630.csv,goes07,1989,6,43200,34
58,1990/g07_xrs_1m_19901101_19901130.csv,goes07,1990,11,43200,28
...,...,...,...,...,...,...
381,2017/g15_xrs_1m_20171201_20171231.csv,goes15,2017,12,44640,44
386,2018/g14_xrs_1m_20180501_20180531.csv,goes14,2018,5,44640,46
393,2018/g14_xrs_1m_20181201_20181231.csv,goes14,2018,12,44640,32
400,2019/g14_xrs_1m_20190701_20190731.csv,goes14,2019,7,44640,40


In [7]:
# it holds the data which files  missing values exceeds more than 50 rows
df_missing_info = df_data[df_data['missing_values'] > 50].reset_index(drop = True)
df_missing_info

Unnamed: 0,file_names,satellite,year,month,total_rows,missing_values
0,1986/g05_xrs_1m_19860101_19860131.csv,goes05,1986,1,44640,617
1,1986/g05_xrs_1m_19860201_19860228.csv,goes05,1986,2,40320,203
2,1986/g05_xrs_1m_19860301_19860331.csv,goes05,1986,3,44640,2686
3,1986/g05_xrs_1m_19860401_19860430.csv,goes05,1986,4,43200,3646
4,1986/g05_xrs_1m_19860501_19860531.csv,goes05,1986,5,44640,517
...,...,...,...,...,...,...
335,2019/g15_xrs_1m_20190901_20190930.csv,goes15,2019,9,43200,1962
336,2019/g14_xrs_1m_20191001_20191031.csv,goes14,2019,10,44640,986
337,2019/g14_xrs_1m_20191101_20191130.csv,goes14,2019,11,43200,161
338,2020/g15_xrs_1m_20200101_20200131.csv,goes15,2020,1,44640,584


In [8]:
import numpy as np
import scipy, random 
import matplotlib.pyplot as plt
def d( x ):
    # expects an enumerable, subtracts the right endpoint from the left
    return float(x[-1]-x[0])


def an( x, i ):
    # the (0,0) element in the rotation matrix of the iterated function system (IFS)
    return ( x[i] - x[i-1] )/d(x)


def dn( x, i ):
    # the (0) element in the translation vector of the IFS
    return ( x[-1]*x[i-1] - x[0]*x[i] )/d(x)


def cn( x, y, i, sn ):
    # the (1,0) element in the rotation matrix of the IFS
    return ( y[i] - y[i-1] )/d(x) - sn*( y[-1] - y[0] )/d(x)


def en( x, y, i, sn ):
    # the (1) element in the translation vector of the IFS
    return ( x[-1]*y[i-1] - x[0]*y[i])/d(x) - sn*( x[-1]*y[0] - x[0]*y[-1] )/d(x)

def Wn( X, U, i, sn ):
    '''
    the iterated function sytem
      R is the rotation matrix
      T is the translation vector
    computes
      R*X + T
    '''
    # rotation matrix
    R = np.matrix([[ an(U[:,0],i), 0 ],\
                   [ cn(U[:,0],U[:,1],i,sn), sn ]])
    # transalation vector
    T = np.matrix([[ dn(U[:,0],i) ],\
                   [ en(U[:,0],U[:,1],i,sn) ]])
    # calculate R*X + T
    tmp = R * np.matrix(X).T + T
    # return the new points
    xp, yp = np.array( tmp.T )[0]
    return xp, yp
    
def FIF( U, nth, sn, balance=False ):
    # the fractal interpolating function
    X = U.copy()
    x, y = list( X[:,0] ), list( X[:,1] )
    M = U.shape[0]

    for i in range(nth):
        # call an IFS for each segment
        for j in range( 1,M ):
            xp, yp = Wn( X[i], U, j, sn )
            x.append( xp )
            y.append( yp )
            if balance:
                xp, yp = Wn( X[i], U, j, -sn )
                x.append( xp )
                y.append( yp )
    x = np.array(x)
    y = np.array(y)
    # this puts the interpolated
    # data points at the bottom of X
    X = np.vstack((x,y)).T
    X = X[ X[:,0].argsort() ]
    # these two lines rearrage X so that the interpolated
    # data points are between the original data points
    null, indices = np.unique( X[:,0], return_index=True )
    X = X[ indices ]
    return X


In [9]:
def select_nth_values_and_build_df(df_with_missing, nth = 10):
    df_with_missing["xs_copy"] = df_with_missing["xs"]
    df_with_missing["xl_copy"] = df_with_missing["xl"]

    # Get the last row of the DataFrame
    last_row = df_with_missing.iloc[-1]

    # Create a new index for the new row
    new_index = len(df_with_missing)

    # Add the new row to the DataFrame with a different index
    # this extra row helps to keep the same length after the interpolation
    df_with_missing.loc[new_index] = last_row

    # bfill and ffill all the missing rows.
    # the idea here is to interpolate data on all the filled values and 
    # then take only the missing values in the final df
    df_with_missing['xs_copy'] = df_with_missing['xs_copy'].replace(-99999, method='bfill')
    df_with_missing['xl_copy'] = df_with_missing['xl_copy'].replace(-99999, method='bfill')
    df_with_missing['xs_copy'] = df_with_missing['xs_copy'].replace(-99999, method='ffill')
    df_with_missing['xl_copy'] = df_with_missing['xl_copy'].replace(-99999, method='ffill')

    ## there is also some data that are corrupted and contains values 32700
    df_with_missing['xs_copy'] = df_with_missing['xs_copy'].replace(32700, method='bfill')
    df_with_missing['xl_copy'] = df_with_missing['xl_copy'].replace(32700, method='bfill')
    df_with_missing['xs_copy'] = df_with_missing['xs_copy'].replace(32700, method='ffill')
    df_with_missing['xl_copy'] = df_with_missing['xl_copy'].replace(32700, method='ffill')

    df_nth_taken = df_with_missing[::nth].reset_index(drop = True)
    return df_nth_taken

In [10]:
def apply_interpolation_and_get_data(df_nth_taken, df_with_missing, nth):
    xs = df_nth_taken["xs_copy"].tolist()
    xl = df_nth_taken["xl_copy"].tolist()
    indexs = df_nth_taken.index.tolist()

    np_xs = np.array(xs)
    np_xl = np.array(xl)
    np_indexs = np.array(indexs)

    stacked_xs = np.vstack((np_indexs, np_xs)).T
    stacked_xl = np.vstack((np_indexs, np_xl)).T

    ## 0.01 is our vertical scaling factor. You can control it to get new interpolated data
    interpolated_xs = FIF(stacked_xs, nth, 0.01)
    interpolated_xl = FIF(stacked_xl, nth, 0.01)

    df_with_missing["xs_interpolated"] = interpolated_xs[:, 1]
    df_with_missing["xl_interpolated"] = interpolated_xl[:, 1]

    ## filtering out the rows based on the logic like:
    ## if the original value is presented them we are taking them
    ## if it is not presented them we will take newly interpolated values.
    df_with_missing['xs'] = df_with_missing.apply(lambda row: row['xs_interpolated'] if row['xs'] == -99999 else row['xs'], axis=1)
    df_with_missing['xl'] = df_with_missing.apply(lambda row: row['xl_interpolated'] if row['xl'] == -99999 else row['xs'], axis=1)

    df_interpolated = df_with_missing[["time_tag", "xs", "xl"]]
    
    return df_interpolated

In [11]:
file_names = df_missing_info["file_names"].tolist()
for i, file_name in enumerate(file_names):
    year = file_name.split("/")[0]
    create_folder(year)
    print("path:", os.path.join(IN_DIR, file_name))
    df_with_missing = pd.read_csv(os.path.join(IN_DIR, file_name))

    ## Here the idea is to take each tenth values of the given file
    ## and then interpolate the in between data and after that filted them
    df_nth_taken = select_nth_values_and_build_df(df_with_missing, nth = 10)
    
    df_interpolated = apply_interpolation_and_get_data(df_nth_taken, df_with_missing, nth = 10)
    
    ## dropping it as we have added it extra in our process
    df_interpolated = df_interpolated.drop(df_interpolated.index[-1])
    
    df_interpolated.to_csv(os.path.join(OUT_DIR, file_name), index = False)

path: data_from_sol_22_updated/1986/g05_xrs_1m_19860101_19860131.csv
path: data_from_sol_22_updated/1986/g05_xrs_1m_19860201_19860228.csv
path: data_from_sol_22_updated/1986/g05_xrs_1m_19860301_19860331.csv
path: data_from_sol_22_updated/1986/g05_xrs_1m_19860401_19860430.csv
path: data_from_sol_22_updated/1986/g05_xrs_1m_19860501_19860531.csv
path: data_from_sol_22_updated/1986/g05_xrs_1m_19860601_19860630.csv
path: data_from_sol_22_updated/1986/g05_xrs_1m_19860701_19860731.csv
path: data_from_sol_22_updated/1986/g05_xrs_1m_19860801_19860831.csv
path: data_from_sol_22_updated/1986/g05_xrs_1m_19860901_19860930.csv
path: data_from_sol_22_updated/1986/g05_xrs_1m_19861001_19861031.csv
path: data_from_sol_22_updated/1986/g05_xrs_1m_19861101_19861130.csv
path: data_from_sol_22_updated/1986/g05_xrs_1m_19861201_19861231.csv
path: data_from_sol_22_updated/1987/g06_xrs_1m_19870101_19870131.csv
path: data_from_sol_22_updated/1987/g06_xrs_1m_19870201_19870228.csv
path: data_from_sol_22_updated/198

## Simple data quality checks

In [12]:
##  checking if there is any more missing values in our data
file_names = df_data["file_names"].tolist()
for i, file_name in enumerate(file_names):
    df = pd.read_csv(os.path.join(OUT_DIR, file_name))
    s = (df['xl'] == -99999).sum()
    print(file_name, s)

1986/g05_xrs_1m_19860101_19860131.csv 0
1986/g05_xrs_1m_19860201_19860228.csv 0
1986/g05_xrs_1m_19860301_19860331.csv 0
1986/g05_xrs_1m_19860401_19860430.csv 0
1986/g05_xrs_1m_19860501_19860531.csv 0
1986/g05_xrs_1m_19860601_19860630.csv 0
1986/g05_xrs_1m_19860701_19860731.csv 0
1986/g05_xrs_1m_19860801_19860831.csv 0
1986/g05_xrs_1m_19860901_19860930.csv 0
1986/g05_xrs_1m_19861001_19861031.csv 0
1986/g05_xrs_1m_19861101_19861130.csv 0
1986/g05_xrs_1m_19861201_19861231.csv 0
1987/g06_xrs_1m_19870101_19870131.csv 0
1987/g06_xrs_1m_19870201_19870228.csv 0
1987/g06_xrs_1m_19870301_19870331.csv 0
1987/g07_xrs_1m_19870401_19870430.csv 0
1987/g07_xrs_1m_19870501_19870531.csv 0
1987/g07_xrs_1m_19870601_19870630.csv 0
1987/g07_xrs_1m_19870701_19870731.csv 0
1987/g07_xrs_1m_19870801_19870831.csv 0
1987/g06_xrs_1m_19870901_19870930.csv 0
1987/g07_xrs_1m_19871001_19871031.csv 0
1987/g07_xrs_1m_19871101_19871130.csv 0
1987/g07_xrs_1m_19871201_19871231.csv 0
1988/g07_xrs_1m_19880101_19880131.csv 0


In [13]:
## merge all dataframe to get a continuous data file
dataframes = []
for file_name in file_names:
    df = pd.read_csv(os.path.join(OUT_DIR, file_name))
    dataframes.append(df)

# Concatenate the list of DataFrames into one DataFrame
concatenated_df = pd.concat(dataframes, ignore_index=True)
concatenated_df.to_csv("sol_22_23_24_data_no_missing_v2.0.csv", index = False)

In [14]:
concatenated_df.describe()

Unnamed: 0,xs,xl
count,17879040.0,17879040.0
mean,5.574662,5.574662
std,426.9196,426.9196
min,-1.157632e-08,-1.131665e-08
25%,1.16e-09,1.55e-09
50%,4.6469e-09,6.89e-09
75%,1.69e-08,3.56e-08
max,32700.0,32700.0


In [16]:
## As there is a buf in the data that is why it is showing the max value 32700
max_value = concatenated_df['xs'].max()
min_value = concatenated_df['xs'].min()

print(f"Max value of column xs: {max_value}")
print(f"Min value of column xs: {min_value}")

Max value of column xs: 32700.0
Min value of column xs: -1.1576319354838711e-08
