In [None]:
import matplotlib.pyplot as plt
import numpy as np
import glob
import os
import datetime
import time

# Number of cross-validation folds used when splitting the training regions.
cross_val = 10

#### Select all file names from the los folder. This folder contains all the rescaled images

In [None]:
# Collect every .dat file in the `los` folder. glob returns paths in arbitrary
# (filesystem-dependent) order, so sort them to make the listing reproducible.
ALL_FILES = sorted(glob.glob("../shared/Data/HMI_LOS_SHARPS/valid_magnetograms/los/*.dat"))
len(ALL_FILES)

#### Select all active region names. Check the number of active regions

In [None]:
# Split every file name at its first underscore: the text before it is the
# active-region identifier, the text after it is kept in REST.
# Plain lists are filled first and converted once, which avoids both the
# quadratic cost of np.append in a loop and mixing an integer dtype with strings.
_region_ids = []
_remainders = []

for F in ALL_FILES:
    # File name without the directory part.
    FILE_NAME = os.path.split(F)[-1]

    # Unpacking raises ValueError when the name has no '_', like str.index did.
    ACTIVE_REGION, rest = FILE_NAME.split('_', 1)

    _region_ids.append(ACTIVE_REGION)
    _remainders.append(rest)

ACTIVE = np.array(_region_ids, dtype=str)
REST = np.array(_remainders, dtype=str)

# Unique active-region identifiers (sorted as strings by np.unique).
UNIQUE_ACTIVE = np.unique(ACTIVE)
print(len(UNIQUE_ACTIVE)) # number of active regions

#### Within an active region, keep the first file and select only those files whose time difference from it is a whole number of hours

In [None]:
# A file is kept when its time offset from the region's first file is a
# multiple of this many minutes.
# NOTE(review): 60 keeps hourly samples; use 360 for six-hourly sampling.
SAMPLING_INTERVAL_MINUTES = 60


def _file_timestamp(path):
    """Return the datetime encoded between the first and second '_' of the file name.

    Raises ValueError when the name has fewer than two underscores or the
    text between them does not match the '%Y%m%dT%H%M' format.
    """
    name = os.path.split(path)[-1]
    start = name.index('_')             # first '_'
    end = name.index('_', start + 1)    # second '_'
    return datetime.datetime.strptime(name[start + 1:end], '%Y%m%dT%H%M')


_selected_paths = []

# Iterate over each active region.
for region in UNIQUE_ACTIVE:

    # All file paths for this region. glob order is arbitrary; after sorting,
    # PATHS[0] is the earliest file because the fixed-width timestamp directly
    # follows the common "<region>_" prefix.
    PATHS = sorted(glob.glob("../shared/Data/HMI_LOS_SHARPS/valid_magnetograms/los/%s_*.dat" % region))
    if not PATHS:
        continue

    # The first file of the region is always kept and is the time reference.
    TIME0 = _file_timestamp(PATHS[0])
    _selected_paths.append(PATHS[0])

    # Iterate over the rest of the files in this region.
    for path in PATHS[1:]:
        # Offset in minutes, computed from the datetimes directly so the result
        # does not depend on the local timezone or daylight-saving changes.
        DIFF = (TIME0 - _file_timestamp(path)).total_seconds() / 60
        if DIFF % SAMPLING_INTERVAL_MINUTES == 0:
            _selected_paths.append(path)

# All selected file paths, as a string array.
ALL_PATHS = np.array(_selected_paths, dtype=str)

#### Check the number of active regions for the reduced file names

In [None]:
# Re-derive the active-region identifiers from the reduced path list.
# Lists are filled first and converted once; this avoids np.append in a loop
# and avoids appending strings to an integer-typed array.
_region_ids = []
_remainders = []

for F in ALL_PATHS:
    FILE_NAME = os.path.split(F)[-1]
    # "<region>_<rest>"; raises ValueError when the name has no '_'.
    ACTIVE_REGION, rest = FILE_NAME.split('_', 1)
    _region_ids.append(ACTIVE_REGION)
    _remainders.append(rest)

ACTIVE = np.array(_region_ids, dtype=str)
REST = np.array(_remainders, dtype=str)

UNIQUE_ACTIVE = np.unique(ACTIVE)
print(len(UNIQUE_ACTIVE))

#### Separate the train and test regions. An active region with max(year) > 2018 belongs in the test data

In [None]:
# Regions whose latest file year is greater than this value go to the test set.
TEST_YEAR_THRESHOLD = 2018

# Latest year seen for each active region. The year is the four characters
# that follow the first '_' of the file name.
# The years must be collected per region: taking the maximum over every path
# for every region would give all regions the same value and put them all in
# one set.
_latest_year = {}
for path in ALL_PATHS:
    FILE_NAME = os.path.split(path)[-1]
    start = FILE_NAME.index('_')
    region = FILE_NAME[:start]
    year = int(FILE_NAME[start + 1:start + 1 + 4])
    _latest_year[region] = max(year, _latest_year.get(region, year))

_train_regions = []
_test_regions = []
for region in UNIQUE_ACTIVE:
    # A KeyError here means the region has no file in ALL_PATHS.
    if _latest_year[str(region)] <= TEST_YEAR_THRESHOLD:
        _train_regions.append(region)
    else:
        _test_regions.append(region)

TRAIN_REGIONS = np.array(_train_regions, dtype=str)
TEST_REGIONS = np.array(_test_regions, dtype=str)

# Every region must land in exactly one of the two sets.
assert(len(UNIQUE_ACTIVE) == (len(TRAIN_REGIONS) + len(TEST_REGIONS)))

#### Get the train and test file names from the train and test active regions

In [None]:
# Sets give constant-time membership tests and avoid comparing a string with
# an empty (float) array when one of the region lists is empty.
_train_region_set = set(str(region) for region in TRAIN_REGIONS)
_test_region_set = set(str(region) for region in TEST_REGIONS)

_training_files = []
_test_files = []

for path in ALL_PATHS:
    # Classify each file by its OWN active region (text before the first '_').
    FILE_NAME = os.path.split(path)[-1]
    start = FILE_NAME.index('_')
    ACTIVE_REGION = FILE_NAME[0:start]
    if ACTIVE_REGION in _train_region_set:
        _training_files.append(path)
    if ACTIVE_REGION in _test_region_set:
        _test_files.append(path)

FINAL_TRAINING_FILES = np.array(_training_files, dtype=str)
FINAL_TEST_FILES = np.array(_test_files, dtype=str)

print(len(FINAL_TRAINING_FILES))

#### Crosscheck the final train and test years from the train and test file names

In [None]:
def _years_of(paths):
    """Return the year of every path as an integer array.

    The year is read from the four characters that follow the first '_' of
    each file name. Raises ValueError when a name has no '_' or those
    characters are not a number.
    """
    years = []
    for path in paths:
        name = os.path.split(path)[-1]
        start = name.index('_')
        years.append(int(name[start + 1:start + 1 + 4]))
    return np.array(years, dtype=int)


# Years are stored as integers, matching the declared dtype, instead of
# appending year strings to an integer array.
TEMP_TRAINING_YEARS = _years_of(FINAL_TRAINING_FILES)
TRAINING_YEARS = np.unique(TEMP_TRAINING_YEARS)

TEMP_TEST_YEARS = _years_of(FINAL_TEST_FILES)
TEST_YEARS = np.unique(TEMP_TEST_YEARS)

print(TRAINING_YEARS)
print(TEST_YEARS)

#### Separate flaring and nonflaring train regions

In [None]:
def _pad_to_multiple(regions, multiple):
    """Repeat the last element of `regions` until its length is a multiple of `multiple`.

    An array whose length is already a multiple (including an empty one) is
    returned unchanged.
    """
    shortfall = -len(regions) % multiple
    if shortfall:
        regions = np.append(regions, np.repeat(regions[-1], shortfall))
    return regions


_flaring_ids = []
_nonflaring_ids = []

for PATH in FINAL_TRAINING_FILES:
    NAME = os.path.split(PATH)[-1]

    # Fifth character from the end of the name, i.e. the one right before a
    # four-character extension such as ".dat"; '1' is treated as flaring.
    FLAG = NAME[-5]

    start = NAME.index('_')
    ACTIVE_REGION = NAME[0:start]

    if FLAG == '1':
        _flaring_ids.append(ACTIVE_REGION)
    else:
        _nonflaring_ids.append(ACTIVE_REGION)

TEMP_FLARING_TRAIN_REGIONS = np.array(_flaring_ids, dtype=str)
TEMP_NONFLARING_TRAIN_REGIONS = np.array(_nonflaring_ids, dtype=str)

# A region is listed in a group when at least one of its files carries the
# corresponding flag, so one region can appear in both groups.
FLARING_TRAIN_REGIONS = np.unique(TEMP_FLARING_TRAIN_REGIONS)
NONFLARING_TRAIN_REGIONS = np.unique(TEMP_NONFLARING_TRAIN_REGIONS)

# np.split needs lengths divisible by cross_val, so pad with copies of the
# last region. The padded entries are duplicates of an existing region.
FLARING_TRAIN_REGIONS = _pad_to_multiple(FLARING_TRAIN_REGIONS, cross_val)
NONFLARING_TRAIN_REGIONS = _pad_to_multiple(NONFLARING_TRAIN_REGIONS, cross_val)

## Important
#### `Use this block if the flaring and nonflaring regions are available from the beginning =====>`

In [None]:
# Hard-coded flaring active-region identifiers (as strings).
# NOTE(review): the name says VALIDATION, but this list sits next to
# NONFLARING_TRAIN_REGIONS in the same cell; confirm whether these are meant
# to be the flaring TRAIN regions.
FLARING_VALIDATION_REGIONS = ['1028', '1041', '1066', '115', '1256', '1350', '1449', '1461',
       '1464', '1582', '1603', '1621', '1722', '1724', '1750', '1834',
       '1879', '1946', '1996', '2040', '211', '2137', '2186', '2191',
       '2193', '2220', '2227', '2362', '245', '2693', '2739', '2748',
       '2760', '2790', '2878', '2887', '2920', '3048', '3056', '3258',
       '3263', '3291', '3295', '3321', '3341', '3364', '3366', '3376',
       '3437', '345', '3497', '3535', '3563', '3587', '362', '3686',
       '3688', '3721', '3740', '3766', '377', '3779', '3784', '3793',
       '3804', '3813', '3836', '3856', '3879', '394', '4000', '401',
       '407', '4071', '4097', '4138', '415', '4186', '4197', '4231',
       '4294', '437', '4639', '4698', '4781', '4874', '49', '4920', '495',
       '4955', '5011', '5026', '5107', '514', '5144', '5233', '5298',
       '54', '5415', '5446', '5447', '5526', '5637', '5653', '5673',
       '5885', '667', '746', '753', '814', '856', '878', '892']

# Hard-coded non-flaring training active-region identifiers (as strings).
# This assignment replaces any NONFLARING_TRAIN_REGIONS value computed earlier.
NONFLARING_TRAIN_REGIONS = ['1', '1001', '1019', '1021', '1026', '104', '1062', '1075', '1079',
       '1080', '1090', '1092', '1093', '1113', '1119', '1120', '1124',
       '114', '1149', '1186', '12', '1210', '1221', '1237', '1249',
       '1275', '1278', '128', '1300', '1303', '1309', '1312', '1313',
       '1318', '1338', '1339', '1342', '1348', '135', '1367', '1390',
       '1391', '1396', '1399', '1405', '1422', '1424', '1425', '1447',
       '1455', '1457', '1465', '1484', '1488', '1514', '1520', '1528',
       '156', '1573', '1578', '1596', '1628', '1632', '1634', '1642',
       '1644', '1653', '1662', '1677', '1688', '1705', '1711', '1715',
       '1727', '1795', '1819', '1832', '1863', '1866', '187', '1873',
       '1886', '1892', '1903', '1931', '1951', '1970', '1979', '1990',
       '2007', '2011', '2017', '2021', '2028', '2039', '2044', '2047',
       '2059', '2061', '2069', '2098', '2106', '2109', '2112', '2130',
       '218', '2181', '220', '224', '2240', '2245', '226', '2262', '2291',
       '2306', '2314', '2337', '2338', '2341', '2342', '2344', '2352',
       '2353', '2360', '2380', '2387', '241', '2411', '2414', '2433',
       '2450', '2460', '2469', '2489', '2492', '2501', '2504', '2511',
       '252', '2520', '2541', '2557', '256', '2571', '2573', '2581',
       '2583', '2585', '2587', '259', '2597', '26', '2625', '2634',
       '2651', '2661', '2672', '2677', '2685', '2691', '2696', '2718',
       '2727', '2732', '2733', '2737', '274', '2749', '2750', '2758',
       '2779', '279', '2822', '2832', '284', '2852', '2904', '2912',
       '2952', '2964', '2966', '2968', '297', '2981', '2984', '3012',
       '3019', '3022', '3031', '3032', '3049', '3066', '3068', '3082',
       '3097', '3098', '3114', '3115', '3119', '3122', '3129', '3149',
       '3154', '3194', '3195', '3199', '3217', '3220', '323', '3240',
       '3244', '3247', '3248', '325', '3259', '3273', '3293', '3330',
       '3336', '3368', '3371', '3400', '3432', '3448', '3457', '3461',
       '3473', '3474', '3483', '3515', '3542', '3560', '3586', '3601',
       '3608', '3620', '3631', '3647', '3668', '367', '3711', '3753',
       '38', '3823', '3824', '3826', '3843', '3845', '3848', '3901',
       '3907', '3912', '3921', '3926', '3957', '3978', '3982', '3985',
       '3996', '4025', '4038', '4040', '4042', '4073', '4075', '4076',
       '4088', '4108', '4111', '4123', '4131', '4133', '4156', '4166',
       '4190', '4201', '4205', '421', '4218', '4252', '4256', '4265',
       '4272', '4287', '429', '4296', '4328', '4335', '4351', '4375',
       '4379', '438', '4383', '4390', '4398', '4399', '443', '4438',
       '4440', '4447', '4448', '4455', '4469', '4478', '4502', '451',
       '4530', '4536', '4539', '4543', '4551', '4552', '4556', '4566',
       '4574', '4576', '4579', '4580', '4591', '4603', '4616', '4640',
       '466', '4667', '4673', '4702', '4704', '4711', '4724', '4734',
       '475', '4751', '4760', '4761', '4764', '4767', '4783', '4799',
       '480', '4802', '4851', '4862', '4864', '4868', '4872', '4882',
       '4888', '4889', '4908', '4921', '4932', '4954', '4962', '4963',
       '4973', '4978', '5005', '5012', '5028', '5036', '5039', '504',
       '5073', '5075', '5103', '5111', '5112', '5118', '5135', '5140',
       '5151', '5152', '5183', '5198', '5212', '5229', '5230', '5246',
       '5249', '5265', '5284', '5315', '532', '5337', '5342', '5347',
       '5351', '5354', '5366', '5374', '5385', '540', '5456', '5462',
       '5467', '5472', '5484', '5490', '5492', '5500', '5518', '5534',
       '5535', '5537', '5545', '5549', '5559', '556', '5571', '5586',
       '5596', '5598', '5627', '5635', '5644', '5658', '5677', '57',
       '5724', '5739', '5758', '5783', '5789', '5807', '5808', '5811',
       '5818', '5823', '5848', '5852', '587', '5890', '5894', '5908',
       '5919', '5927', '598', '602', '605', '606', '610', '622', '639',
       '643', '650', '661', '681', '684', '685', '686', '695', '702',
       '71', '713', '714', '759', '764', '765', '798', '803', '805',
       '812', '843', '847', '850', '851', '869', '875', '903', '909',
       '92', '921', '926', '927', '932', '948', '956', '970', '971',
       '973', '975', '976', '982', '997']

# Build both training-region arrays from the hard-coded lists of this cell so
# that the cell works on a fresh kernel. The flaring list is stored under the
# name FLARING_VALIDATION_REGIONS.
# NOTE(review): confirm that FLARING_VALIDATION_REGIONS really holds the
# flaring TRAIN regions; it is the only flaring list defined in this cell.
FLARING_TRAIN_REGIONS = np.array(FLARING_VALIDATION_REGIONS)
NONFLARING_TRAIN_REGIONS = np.array(NONFLARING_TRAIN_REGIONS)

# np.split needs lengths divisible by cross_val, so pad each array with copies
# of its last region. The padded entries are duplicates of an existing region.
_shortfall = -len(FLARING_TRAIN_REGIONS) % cross_val
if _shortfall:
    FLARING_TRAIN_REGIONS = np.append(
        FLARING_TRAIN_REGIONS, np.repeat(FLARING_TRAIN_REGIONS[-1], _shortfall))

_shortfall = -len(NONFLARING_TRAIN_REGIONS) % cross_val
if _shortfall:
    NONFLARING_TRAIN_REGIONS = np.append(
        NONFLARING_TRAIN_REGIONS, np.repeat(NONFLARING_TRAIN_REGIONS[-1], _shortfall))

ALL_REGIONS = np.concatenate([FLARING_TRAIN_REGIONS, NONFLARING_TRAIN_REGIONS])

# Candidate files: every .dat file in the `los` folder, sorted because glob
# returns paths in arbitrary order.
FINAL_TRAINING_FILES = sorted(glob.glob('../shared/Data/HMI_LOS_SHARPS/valid_magnetograms/los/*.dat'))

In [None]:



# Hard-coded active-region identifiers (as strings).
# NOTE(review): no other code shown in this notebook reads `validationRegions`;
# confirm whether it is still needed.
validationRegions = ['1209', '1321', '1500', '1638', '1806', '1807', '1907', '1930',
       '1993', '1999', '2372', '2491', '2519', '2546', '2635', '2636',
       '2673', '2716', '2809', '3311', '3344', '3520', '3580', '3730',
       '384', '3877', '3894', '392', '393', '3941', '3999', '4344',
       '4396', '4817', '4941', '5127', '5186', '5541', '5692', '5738',
       '5745', '637', '750', '8', '833', '899', '902', '940', '1038', '1046', '107', '1089', '1126', '1133', '116', '1168',
       '1171', '1183', '1271', '1345', '1353', '1389', '1410', '145',
       '146', '1471', '1483', '1492', '1527', '1549', '1557', '1558',
       '1574', '1611', '1613', '1658', '1669', '1672', '1697', '1701',
       '1744', '175', '1756', '1845', '185', '1877', '1893', '190',
       '1949', '1959', '1962', '198', '2026', '2037', '2110', '2117',
       '2121', '2123', '2131', '2143', '2166', '2173', '2178', '2203',
       '2270', '2358', '2366', '2400', '2420', '2502', '2522', '2533',
       '2543', '2560', '2598', '2599', '2605', '2619', '2663', '2711',
       '2735', '2825', '2861', '2875', '2922', '2945', '2954', '2955',
       '2999', '3028', '3103', '318', '3205', '3246', '3252', '3267',
       '327', '3286', '3288', '3309', '3323', '3326', '3415', '3420',
       '3481', '3490', '3513', '355', '3604', '3635', '364', '3648',
       '3700', '3703', '3719', '3741', '3785', '3821', '3874', '3942',
       '3965', '3974', '4011', '403', '4065', '4092', '4093', '414',
       '4228', '4284', '4288', '43', '4315', '4321', '4337', '4397',
       '4424', '444', '4454', '4466', '4477', '4505', '4523', '4541',
       '4549', '4559', '46', '4610', '4623', '4655', '4661', '4678',
       '4718', '4726', '4792', '4800', '4900', '4942', '4943', '4969',
       '4991', '4995', '5002', '5004', '5022', '5051', '5054', '51',
       '5113', '5163', '5208', '5275', '5293', '5355', '5375', '538',
       '5387', '5413', '5422', '5521', '5543', '5544', '5550', '5577',
       '5618', '5678', '5710', '5718', '5750', '576', '5772', '580',
       '5820', '5831', '5856', '5865', '5880', '5916', '618', '625',
       '640', '652', '662', '674', '705', '712', '725', '740', '794',
       '824', '853', '854', '86', '867', '900', '913', '918', '925',
       '950', '986']




#### Separate the flaring and nonflaring train regions into `cross_val` equal parts. In each fold, all parts except the last are the TRAIN regions and the last part is the VALIDATION regions. The parts are rotated between folds to give `cross_val`-fold cross-validation

In [None]:
def _build_folds(regions, n_folds):
    """Return (train, validation) 2-D arrays with one row of region ids per fold.

    For fold i the region array is rotated by (i + 1) * (len(regions) // n_folds)
    positions and split into `n_folds` equal parts; the last part is the
    validation row and the remaining parts, concatenated, are the train row.
    The rows keep the dtype of `regions`, so string ids stay comparable with
    str values (a bytes dtype such as 'S8' is not on Python 3).

    Raises ValueError (from np.split) when len(regions) is not divisible by
    `n_folds`.
    """
    step = len(regions) // n_folds
    train_rows = []
    validation_rows = []
    for fold in range(n_folds):
        parts = np.split(np.roll(regions, (fold + 1) * step), n_folds)
        train_rows.append(np.concatenate(parts[:-1]))
        validation_rows.append(parts[-1])
    return np.stack(train_rows), np.stack(validation_rows)


# NOTE: when the region arrays were padded with duplicated entries, the same
# region may end up in both the train and the validation row of a fold.
R_FL_TRAIN, R_FL_VALIDATION = _build_folds(FLARING_TRAIN_REGIONS, cross_val)
R_NFL_TRAIN, R_NFL_VALIDATION = _build_folds(NONFLARING_TRAIN_REGIONS, cross_val)

# Number of train / validation regions per fold for each class.
FL = R_FL_TRAIN.shape[1]
FLV = R_FL_VALIDATION.shape[1]
NL = R_NFL_TRAIN.shape[1]
NLV = R_NFL_VALIDATION.shape[1]

print(R_FL_TRAIN)

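#### Assign every training file to the flaring/nonflaring train or validation set of each fold, then shuffle and merge them into the per-fold TRAIN and VALIDATION file lists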

In [None]:
# Fixed seed so that the shuffled file lists are reproducible between runs.
RANDOM_SEED = 0
_rng = np.random.RandomState(RANDOM_SEED)


def _region_set(row):
    """Return the region ids of one fold row as a set of str.

    Byte strings (rows created with an 'S' dtype) are decoded so that they
    compare equal to the str ids parsed from file names.
    """
    return {r.decode() if isinstance(r, bytes) else str(r) for r in row}


# Active region of every candidate file (text before the first '_'), parsed once.
_file_regions = []
for _path in FINAL_TRAINING_FILES:
    _name = os.path.split(_path)[-1]
    _file_regions.append(_name[0:_name.index('_')])

# Per-fold file lists, keyed by the fold index as a string ('0', '1', ...).
CV_F_TRAIN = {}
CV_NF_TRAIN = {}
CV_F_VALIDATION = {}
CV_NF_VALIDATION = {}

TRAIN = {}
VALIDATION = {}

for j in range(cross_val):
    key = str(j)

    fl_train = _region_set(R_FL_TRAIN[j])
    nfl_train = _region_set(R_NFL_TRAIN[j])
    fl_validation = _region_set(R_FL_VALIDATION[j])
    nfl_validation = _region_set(R_NFL_VALIDATION[j])

    f_train_files = []
    nf_train_files = []
    f_validation_files = []
    nf_validation_files = []

    # Each file goes to the first matching group, in this priority order.
    for path, region in zip(FINAL_TRAINING_FILES, _file_regions):
        if region in fl_train:
            f_train_files.append(path)
        elif region in nfl_train:
            nf_train_files.append(path)
        elif region in fl_validation:
            f_validation_files.append(path)
        elif region in nfl_validation:
            nf_validation_files.append(path)

    CV_F_TRAIN[key] = np.array(f_train_files, dtype=str)
    CV_NF_TRAIN[key] = np.array(nf_train_files, dtype=str)
    CV_F_VALIDATION[key] = np.array(f_validation_files, dtype=str)
    CV_NF_VALIDATION[key] = np.array(nf_validation_files, dtype=str)

    # Shuffle every fold (all cross_val of them), in place.
    _rng.shuffle(CV_NF_TRAIN[key])
    _rng.shuffle(CV_F_TRAIN[key])
    _rng.shuffle(CV_NF_VALIDATION[key])
    _rng.shuffle(CV_F_VALIDATION[key])

    # Merge flaring and non-flaring files and shuffle the merged lists.
    # No oversampling of the flaring class is applied.
    TRAIN[key] = np.append(CV_F_TRAIN[key], CV_NF_TRAIN[key])
    _rng.shuffle(TRAIN[key])
    VALIDATION[key] = np.append(CV_F_VALIDATION[key], CV_NF_VALIDATION[key])
    _rng.shuffle(VALIDATION[key])

#### Save the per-fold train and validation file lists

In [None]:
# TRAIN and VALIDATION are dicts, so np.save stores each one as a pickled
# 0-d object array; read them back with np.load(path, allow_pickle=True).item().
np.save('train_all.npy', TRAIN)
np.save('validation_all.npy', VALIDATION)

#### Compute the pixel mean and standard deviation over the fold-0 train and validation images

In [None]:
# Files of fold 0 (train + validation). Note that this reuses the name ALL_PATHS.
ALL_PATHS = np.append(TRAIN['0'], VALIDATION['0'])

if len(ALL_PATHS) == 0:
    raise ValueError("fold 0 contains no files; cannot compute mean and std")

# NOTE(review): np.load only unpickles files when allow_pickle=True is passed;
# confirm how the .dat files were written.

# Pass 1: global pixel mean. The pixel count is taken from the data instead of
# assuming 256x256 images, so images of any (even mixed) size are handled.
_pixel_total = 0.0
N = 0
for F in ALL_PATHS:
    IMAGE = np.load(F)
    _pixel_total += np.sum(IMAGE, dtype=np.float64)
    N += np.size(IMAGE)

mean = np.float64(_pixel_total / N)

# Pass 2: sum of squared deviations from the mean. The accumulator is not
# called `sum`, so the builtin stays usable in later cells.
_squared_deviation_total = 0.0
for F in ALL_PATHS:
    IMAGE = np.load(F)
    _squared_deviation_total += ((IMAGE - mean)**2).sum()

# Population standard deviation over all pixels.
std = np.sqrt(_squared_deviation_total / N)

In [None]:
## temp section

# NOTE(review): `regions` is not defined anywhere in the visible notebook; it
# must be set to an iterable of active-region ids before this cell is run.
# Running this cell overwrites `mean` and `std`.
_collected_paths = []
for activeRegion in regions:
    pathsString = "../shared/Data/HMI_LOS_SHARPS/valid_magnetograms/los2/%s_*.dat" %activeRegion
    paths = glob.glob(pathsString)
    # Accumulate the matches; np.concatenate(a, b) would treat `b` as the axis
    # argument and its result was never stored.
    _collected_paths.extend(paths)

allPaths = np.array(_collected_paths, dtype=str)

if len(allPaths) == 0:
    raise ValueError("no files matched the requested regions; cannot compute mean and std")

# Pass 1: global pixel mean, with the pixel count taken from the data instead
# of assuming 256x256 images.
_pixel_total = 0.0
N = 0
for F in allPaths:
    IMAGE = np.load(F)
    _pixel_total += np.sum(IMAGE, dtype=np.float64)
    N += np.size(IMAGE)

mean = np.float64(_pixel_total / N)

# Pass 2: sum of squared deviations (accumulator not named `sum`, so the
# builtin is not shadowed).
_squared_deviation_total = 0.0
for F in allPaths:
    IMAGE = np.load(F)
    _squared_deviation_total += ((IMAGE - mean)**2).sum()

std = np.sqrt(_squared_deviation_total / N)

#### Save the mean and standard deviation

In [None]:
# Write the two statistics to disk. `dump` pickles the NumPy value, so the
# files must be read back with pickle or np.load(..., allow_pickle=True).
mean.dump('mean_all.dat')
std.dump('std_all.dat')

In [None]:
ALL_FILES = glob.glob("../shared/Data/HMI_LOS_SHARPS/valid_magnetograms/los2/*.dat")
for i in range()