# 0. Import dependencies

In [2]:
import pandas as pd
import numpy as np
import os
import torch
from torch.utils.data import Dataset, DataLoader
from sklearn.preprocessing import RobustScaler


# 1. Data preprocessing

Forming folder tree

In [3]:
# Root folder of the raw dataset plus the class-folder names and
# patient numbers that the rest of the notebook iterates over.
rootdir = "NeuroScience Data"
classes_tuple = ("a15", "a25", "a40", "a45", "a55", "a60", "a75", "a85")
patients_tuple = tuple(range(1, 21))

# This is the main object with data
classes_data = {}                       # Dict with lists


def form_data_tree(rootdir: str, classes_data: dict):
    """
        Walks `rootdir` and fills `classes_data` with a nested mapping
        {class folder name: {patient number: [paths to .csv files]}}.

        Only folders whose name is listed in `classes_tuple` are indexed.
        Every such folder gets a fresh entry with one (possibly empty)
        list per patient from `patients_tuple`, so calling the function
        again on the same dict re-indexes the folder from scratch.

        The patient number is the second '_'-separated token of the
        file name. A number outside `patients_tuple` raises KeyError.

        Returns the same `classes_data` object that was passed in.
    """
    for dirs, subdirs, files in os.walk(rootdir):
        # basename works with any path separator, unlike split('/')
        key = os.path.basename(os.path.normpath(dirs))
        if key not in classes_tuple:
            continue

        classes_data[key] = {patient: [] for patient in patients_tuple}

        # os.walk returns files in arbitrary, filesystem-dependent order;
        # sorting keeps the lists (and the folds cut from them) reproducible
        for file in sorted(files):
            # hidden / service files such as .DS_Store are not trials
            if not file.endswith('.csv'):
                continue
            patient_number = int(file.split('_')[1])
            data_link = os.path.join(dirs, file)
            classes_data[key][patient_number].append(data_link)
    return classes_data


classes_data = form_data_tree(rootdir=rootdir, classes_data=classes_data)


In [4]:
len(classes_data)

8

In [5]:
len(classes_data["a15"])

20

We've packed our filenames into a nested dictionary. In total we will have 8 * 20 = 160 lists of filenames.

In [6]:
# Labels of classification
classes = {"a15": 0, "a25": 1, "a40": 2, "a45": 3, "a55": 4, "a60": 5, "a75": 6, "a85": 7}

In [7]:
def prefix_df(df: pd.DataFrame):
    """
        Repairs the layout of a raw trial without adding padding rows.

        The frame is transposed, the former column labels are moved
        into a new leading column 0 and converted to numbers, and the
        remaining columns are numbered 1..n. Labels that cannot be
        parsed as numbers become NaN; every NaN is then replaced by
        the mean of its column.

        NOTE(review): presumably the labels are the first data row
        that pd.read_csv took for a header - confirm against the files.
    """
    flipped = df.T
    flipped.columns = [number for number in range(1, flipped.shape[1] + 1)]

    # errors='coerce' turns an unparsable label into NaN,
    # which is handled by the fillna below
    recovered = pd.to_numeric(flipped.index.to_series(), errors='coerce')

    repaired = flipped.reset_index(drop=True)
    repaired.insert(0, 0, recovered.to_numpy())

    return repaired.fillna(repaired.mean())

**Exploratory Data Analysis** of original (Not fixed) dataset.

In [8]:
# Calculating mean length of the dataframe (NOT fixed)

def calc_len(folder_tree):
    """
        Reads every .csv referenced by the nested tree and returns
        a flat list with the last element of each DataFrame's shape
        (the number of columns of the file as read by pandas).
    """
    return [
        pd.read_csv(csv_path).shape[-1]
        for patients in folder_tree.values()
        for csv_paths in patients.values()
        for csv_path in csv_paths
    ]

length_list = calc_len(classes_data)

len(length_list)

4000

In [9]:
print(f'Mean trial length in Original Dataset - {np.mean(length_list)}')

Mean trial length in Original Dataset - 753.51575


So, the average length of a trial is a bit more than 750.

In [10]:
length_df = pd.DataFrame(length_list)
length_df.describe()

Unnamed: 0,0
count,4000.0
mean,753.51575
std,92.310043
min,523.0
25%,686.0
50%,726.0
75%,795.0
max,1438.0


**So, using maxlen of 1600 was a MISTAKE!** 1500 is enough!

In [11]:
# Analyzing mean value

def analyze_mean(classes_data: dict, is_fixed: bool = False):
    """
        Collects the per-file mean of each of the first 31 columns.

        classes_data - nested tree {class: {patient: [csv paths]}}.
        is_fixed     - set to True when the files are already stored
                       in fixed layout (for example after scale()).
                       With the default False every file is passed
                       through prefix_df() first. Applying prefix_df()
                       to an already fixed file transposes it again
                       and turns its header into column 0.

        Returns a dict {column position 0..30: [mean for every file]}.
        A file with fewer than 31 columns raises IndexError.
    """
    n_columns = 31
    mean_value_dict = {key: [] for key in range(n_columns)}
    for patients in classes_data.values():
        for paths in patients.values():
            for path in paths:
                dataframe = pd.read_csv(path)
                if not is_fixed:
                    dataframe = prefix_df(dataframe)
                for k in range(n_columns):
                    # positional access: columns of a fixed file come back
                    # from read_csv with string labels, not integers
                    mean_value_dict[k].append(dataframe.iloc[:, k].mean())
    return mean_value_dict

mean_value_dict = analyze_mean(classes_data = classes_data)

len(mean_value_dict[0])

4000

In [12]:
mean_value_df = pd.DataFrame.from_dict(mean_value_dict)
mean_value_df.shape

(4000, 31)

In [13]:
mean_value_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4000 entries, 0 to 3999
Data columns (total 31 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   0       4000 non-null   float64
 1   1       4000 non-null   float64
 2   2       4000 non-null   float64
 3   3       4000 non-null   float64
 4   4       4000 non-null   float64
 5   5       4000 non-null   float64
 6   6       4000 non-null   float64
 7   7       4000 non-null   float64
 8   8       4000 non-null   float64
 9   9       4000 non-null   float64
 10  10      4000 non-null   float64
 11  11      4000 non-null   float64
 12  12      4000 non-null   float64
 13  13      4000 non-null   float64
 14  14      4000 non-null   float64
 15  15      4000 non-null   float64
 16  16      4000 non-null   float64
 17  17      4000 non-null   float64
 18  18      4000 non-null   float64
 19  19      4000 non-null   float64
 20  20      4000 non-null   float64
 21  21      4000 non-null   float64
 22  

In [14]:
# Description of MEAN value df
mean_value_df.describe()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,21,22,23,24,25,26,27,28,29,30
count,4000.0,4000.0,4000.0,4000.0,4000.0,4000.0,4000.0,4000.0,4000.0,4000.0,...,4000.0,4000.0,4000.0,4000.0,4000.0,4000.0,4000.0,4000.0,4000.0,4000.0
mean,0.154133,0.631962,0.384527,0.681922,-0.407398,-0.382884,-0.389077,-0.779942,-0.823101,-0.992245,...,-0.583999,-0.590345,-0.10111,-0.499566,-0.558685,-0.258905,0.113703,0.153763,-0.052569,-0.192344
std,9.364976,7.981514,12.179935,7.196085,6.166092,4.658057,7.538777,7.686071,13.33226,15.63749,...,5.863318,5.92576,5.536659,5.969458,7.21181,5.770017,6.889838,10.554924,6.538321,4.284639
min,-293.335012,-82.055819,-698.053474,-71.907215,-87.980678,-32.716988,-110.848309,-79.581653,-133.750726,-186.96009,...,-65.48256,-53.723935,-42.206361,-49.380365,-78.978398,-64.818762,-98.29594,-422.916369,-88.489108,-53.459643
25%,-1.63716,-1.550245,-1.860815,-2.086,-2.814175,-2.638467,-2.891382,-3.296282,-3.51826,-4.174506,...,-2.403102,-2.878707,-3.103266,-2.890701,-2.579325,-1.924929,-2.261241,-2.532849,-2.30843,-1.875514
50%,0.440034,0.523899,0.640158,0.605557,-0.352208,-0.374415,-0.39643,-0.508,-0.46216,-0.617167,...,-0.369361,-0.468896,-0.129581,-0.336268,-0.523236,-0.084015,0.141025,0.323373,0.039213,-0.060393
75%,2.562277,2.719544,3.124651,3.211678,1.981446,1.854122,2.044512,2.159529,2.494097,2.76001,...,1.563198,1.755516,2.754337,1.904056,1.539822,1.727683,2.366314,3.076322,2.419318,1.710071
max,81.392843,153.51017,53.053846,105.128061,86.275331,49.514409,160.80127,81.392324,227.266051,267.523862,...,51.216221,63.942665,53.16438,70.863807,183.730361,55.679172,106.659832,199.294338,104.151937,54.283138


Let's apply **Robust Scaler** to the original data

In [15]:
scaled_path = 'Scaled NeuroScience Data'

scaled_classes_data = {}
scaled_classes_data = form_data_tree(rootdir = scaled_path, classes_data = scaled_classes_data)
scaler = RobustScaler()

def scale(data_tree: dict, is_fixed: bool, scaler):
    '''
        Applies scaler to whole folder
    of .csv files, overwriting every file in place.

    data_tree - nested tree {class: {patient: [csv paths]}}.
    is_fixed  - False means the file is still in raw layout and is
                passed through prefix_df() before scaling.
    scaler    - object with a fit_transform() method. It is re-fitted
                on every file, so each file is scaled independently.

    The scaled frame is written to a temporary file next to the
    original and then moved over it, so an error while writing
    cannot destroy the original data.
    '''
    for patients in data_tree.values():
        for paths in patients.values():
            for path in paths:
                dataframe = pd.read_csv(path)
                if not is_fixed:
                    dataframe = prefix_df(dataframe)
                scaled = pd.DataFrame(scaler.fit_transform(dataframe))

                tmp_path = path + '.tmp'
                try:
                    scaled.to_csv(path_or_buf=tmp_path, index=False)
                    # atomic swap: the original stays until the new file is complete
                    os.replace(tmp_path, path)
                finally:
                    # only present if something failed before the swap
                    if os.path.exists(tmp_path):
                        os.remove(tmp_path)

# scale(data_tree = scaled_classes_data, is_fixed = False, scaler = scaler)         

In [16]:
# Analyzing mean value

mean_value_dict = analyze_mean(classes_data=scaled_classes_data)

len(mean_value_dict[0])

4000

In [17]:
mean_value_df = pd.DataFrame.from_dict(mean_value_dict)
mean_value_df.describe()

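# Column [0] is constant (mean 15.0, std 0.0). Most likely this is because analyze_mean()
# runs prefix_df() on files that are already stored in fixed layout: the frame is transposed
# again and the header labels 0..30 (whose mean is 15) become column 0.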

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,21,22,23,24,25,26,27,28,29,30
count,4000.0,4000.0,4000.0,4000.0,4000.0,4000.0,4000.0,4000.0,4000.0,4000.0,...,4000.0,4000.0,4000.0,4000.0,4000.0,4000.0,4000.0,4000.0,4000.0,4000.0
mean,15.0,0.001077,0.002685,0.004679,0.006253,0.006939,0.006724,0.005784,0.004119,0.00153,...,-0.00764,-0.008264,-0.006234,-0.002764,0.000609,0.002989,0.004364,0.005225,0.006159,0.00741
std,0.0,0.518557,0.517185,0.517267,0.517031,0.515743,0.51412,0.512756,0.511732,0.511282,...,0.504817,0.507003,0.509735,0.50967,0.507161,0.506125,0.50773,0.509196,0.508796,0.508575
min,15.0,-2.713557,-2.5573,-2.538022,-2.550379,-2.449986,-2.353582,-2.637953,-2.661359,-3.64048,...,-2.261268,-2.570341,-3.096574,-3.523317,-3.193648,-2.455635,-2.539223,-2.517105,-2.386607,-2.210557
25%,15.0,-0.328713,-0.329448,-0.317832,-0.320112,-0.315101,-0.317002,-0.317101,-0.309497,-0.316514,...,-0.332715,-0.329579,-0.332729,-0.32901,-0.323612,-0.323711,-0.320464,-0.322545,-0.324867,-0.314301
50%,15.0,0.003951,0.006885,0.003547,-0.001856,-0.005377,0.000526,0.001841,0.004634,0.002888,...,-0.010647,-0.012151,0.00178,0.000676,0.006782,0.015306,0.00848,0.015968,0.009785,0.010236
75%,15.0,0.339922,0.328249,0.330558,0.336866,0.337414,0.333875,0.333623,0.326776,0.325235,...,0.321372,0.318422,0.322607,0.32617,0.318626,0.326093,0.32179,0.326925,0.329795,0.32567
max,15.0,3.094014,2.983935,2.778666,2.746369,2.344475,2.415844,2.273476,2.21906,2.374896,...,2.469884,3.203322,3.945725,4.494943,4.683987,4.472529,3.99987,3.525868,3.273199,3.282234


Unifying the length of the dataframes.

In [18]:
# Rereading dataframe
df = pd.read_csv(classes_data["a15"][1][0])

# This constants have been calculated before
MAX_LEN = 1500
LONGEST_FILE_PATH = "NeuroScience Data/a15/a15_14_101.csv"


In [19]:
# function for making zero lists of n length

def zerolistmaker(n):
    """Returns a new list that holds n integer zeros."""
    return [0 for _ in range(n)]

# main preprocessing function


def fix_df(df: pd.DataFrame, zero_list_function, MAX_LEN: int):
    """
        Main preprocessing function: repairs the layout of a raw
        trial and pads it with zero rows up to MAX_LEN rows.

        df                 - raw frame as returned by pd.read_csv.
        zero_list_function - callable(n) that returns a list of n zeros.
        MAX_LEN            - number of rows of the returned frame.

        Layout repair: the frame is transposed, the former column
        labels become the numeric column 0, other columns are numbered
        1..n, unparsable cells become NaN and are replaced by the
        column mean.

        A frame that already has MAX_LEN rows or more is returned
        without padding (it is NOT truncated).
    """

    # transposing and setting correct index names
    df = df.T
    df.columns = [i + 1 for i in range(len(df.columns))]

    # In case of errors='coerce', error-cell will
    # be turned into nan. Then it will be handled
    first_column = pd.to_numeric(df.index.to_series(), errors='coerce')

    df = df.reset_index(drop=True)
    df.insert(0, 0, first_column.to_numpy())
    df = df.fillna(df.mean())

    # ADD EXTRA ROWS HERE
    missing_rows = MAX_LEN - len(df)
    if missing_rows > 0:
        # pad with the real width of the frame instead of a hard-coded 31
        width = df.shape[1]
        foundation = pd.DataFrame(
            [zero_list_function(width) for _ in range(missing_rows)],
            columns=df.columns)
        # DataFrame.append was removed in pandas 2.0, concat is the replacement
        df = pd.concat([df, foundation], ignore_index=True)

    return df


In [20]:
# df = fix_df(df, zerolistmaker, MAX_LEN)

df = prefix_df(df)
df.shape

(696, 31)

# 2. Dividing exp_trials into fractions

### Data splitting:
* Divide by 2/3/5/10.
* Filter out empty (or almost empty) files.

In [21]:
MAX_LEN = 1500

data_split_2 = 'NeuroScience Data split/data split 2'
data_split_3 = 'NeuroScience Data split/data split 3'
data_split_5 = 'NeuroScience Data split/data split 5'
data_split_10 = 'NeuroScience Data split/data split 10'

classes_split_2 = {}
classes_split_3 = {}
classes_split_5 = {}
classes_split_10 = {}


In [22]:
def split(a, n):
    """
        Cuts `a` into n consecutive slices whose lengths differ by
        at most one (the longer slices come first) and returns them
        as a generator.
    """
    base, extra = divmod(len(a), n)

    def _pieces():
        start = 0
        for position in range(n):
            # the first `extra` pieces receive one additional element
            stop = start + base + (1 if position < extra else 0)
            yield a[start:stop]
            start = stop

    return _pieces()


def split_df(df, n,  path, node_list):
    '''
        Cuts df into n consecutive fractions with split() and saves
        each of them next to the source file as <name>_<k>.csv.

        df        - frame to cut (column 0 is used by the filter).
        n         - number of fractions.
        path      - path of the source .csv, it is deleted at the end.
        node_list - list that receives the paths of the saved fractions.

        A fraction whose column 0 has a mean of exactly 0.0 is
        skipped (not saved and not registered).
    '''
    stem = path.split('.csv')[0]

    for part_number, fraction in enumerate(split(df, n)):
        # VERY SIMPLE filtering
        if fraction[0].mean() == 0.0:
            continue
        new_path = stem + '_' + str(part_number) + '.csv'
        fraction.to_csv(path_or_buf=new_path, index=False)
        node_list.append(new_path)

    os.remove(path)


In [23]:
def split_folder(folder, n):
    """
        Splits every trial referenced by the tree into n fractions.

        For each path: the file is read, repaired and padded with
        fix_df(), then cut and saved by split_df(), which also
        deletes the source file and appends the new paths to the
        same list.

        When a list has been processed, the paths of the deleted
        source files are removed from it, so the list references
        only the fractions that exist on disk. The lists are
        updated in place.
    """
    for patients in folder.values():
        for paths in patients.values():
            # real copy: split_df appends to `paths` while we iterate,
            # an alias would make the cleanup below skip elements
            original_paths = list(paths)

            for path in original_paths:
                df = pd.read_csv(path)
                df = fix_df(df, zerolistmaker, MAX_LEN)
                split_df(df=df, n=n, path=path, node_list=paths)

            # drop the sources (already removed from disk by split_df)
            processed = set(original_paths)
            paths[:] = [path for path in paths if path not in processed]


In [24]:
def count_dfs(folder_tree):
    """Returns the total number of paths stored in the nested tree."""
    return sum(
        1
        for patients in folder_tree.values()
        for paths in patients.values()
        for _ in paths
    )

In [25]:
# Index the files that are currently stored under data_split_2
classes_split_2 = form_data_tree(
    classes_data=classes_split_2, rootdir=data_split_2)

# Disabled: split_folder rewrites the files on disk
# split_folder(folder=classes_split_2, n=2)

# Index the folder again, so the tree reflects the files on disk
classes_split_2 = form_data_tree(
    classes_data=classes_split_2, rootdir=data_split_2)

count = count_dfs(classes_split_2)

print(f'Number of .csv files in split 2 BEFORE CLEANING: {count}')

Number of .csv files in split 2 BEFORE CLEANING: 4000


In [26]:
# Removing .csv, that are likely be ALMOST 0

def remove_extra(folder_tree, pattern):
    """
        Deletes from disk every file of the tree whose last
        '_'-separated part of the path starts with `pattern`
        (a single character). The lists of the tree are not changed.
    """
    for patients in folder_tree.values():
        for paths in patients.values():
            for path in paths:
                last_part = path.split('_')[-1]
                if last_part[0] == pattern:
                    os.remove(path)


In [27]:
# Disabled: remove_extra deletes files from disk
# remove_extra(classes_split_2, '1')

# Index the folder again, so the tree reflects the files on disk
classes_split_2 = form_data_tree(
    classes_data=classes_split_2, rootdir=data_split_2)

count_2 = count_dfs(classes_split_2)

# 20*8 = number of path lists in the tree (20 patients x 8 classes)
print(f'Number of .csv files in split 2 AFTER CLEANING: {count_2}')
print(f'Average length of a path list in split 2: {count_2 / (20*8)}')
print(f'Length of a .csv in split 2: {int(MAX_LEN/2)}')

Number of .csv files in split 2 AFTER CLEANING: 4000
Average length of a path list in split 2: 25.0
Length of a .csv in split 2: 750


In [28]:
# Index the files that are currently stored under data_split_3
classes_split_3 = form_data_tree(
    classes_data=classes_split_3, rootdir=data_split_3)

# Disabled: split_folder rewrites the files on disk
# split_folder(folder=classes_split_3, n=3)

# classes_split_3 = form_data_tree(
#     classes_data=classes_split_3, rootdir=data_split_3)

count_3 = count_dfs(classes_split_3)

# 20*8 = number of path lists in the tree (20 patients x 8 classes)
print(f'Number of .csv files in split 3 BEFORE CLEANING: {count_3}')
print(f'Average length of a path list in split 3: {count_3 / (20*8)}')
print(f'Length of a .csv in split 3: {int(MAX_LEN/3)}')


Number of .csv files in split 3 BEFORE CLEANING: 8057
Average length of a path list in split 3: 50.35625
Length of a .csv in split 3: 500


In [29]:
# Index the files that are currently stored under data_split_5
classes_split_5 = form_data_tree(
    classes_data=classes_split_5, rootdir=data_split_5)

# Disabled: split_folder rewrites the files on disk
# split_folder(folder=classes_split_5, n=5)

# classes_split_5 = form_data_tree(
#     classes_data=classes_split_5, rootdir=data_split_5)

count_5 = count_dfs(classes_split_5)

# 20*8 = number of path lists in the tree (20 patients x 8 classes)
print(f'Number of .csv files in split 5 BEFORE CLEANING: {count_5}')
print(f'Average length of a path list in split 5: {count_5 / (20*8)}')
print(f'Length of a .csv in split 5: {int(MAX_LEN/5)}')


Number of .csv files in split 5 BEFORE CLEANING: 12410
Average length of a path list in split 5: 77.5625
Length of a .csv in split 5: 300


In [30]:
# Index the files that are currently stored under data_split_10
classes_split_10 = form_data_tree(
    classes_data=classes_split_10, rootdir=data_split_10)

# Disabled: split_folder rewrites the files on disk
# split_folder(folder=classes_split_10, n=10)

# classes_split_10 = form_data_tree(
#     classes_data=classes_split_10, rootdir=data_split_10)

count_10 = count_dfs(classes_split_10)

# 20*8 = number of path lists in the tree (20 patients x 8 classes)
print(f'Number of .csv files in split 10 BEFORE CLEANING: {count_10}')
print(f'Average length of a path list in split 10: {count_10 / (20*8)}')
# the label used to say "split 5" although the value is MAX_LEN/10
print(f'Length of a .csv in split 10: {int(MAX_LEN/10)}')


Number of .csv files in split 10 BEFORE CLEANING: 22003
Average length of a path list in split 10: 137.51875
Length of a .csv in split 5: 150


### Scale divided data

In [31]:
scaled_split_2 = 'Scaled NeuroScience Data split/data split 2'
scaled_split_3 = 'Scaled NeuroScience Data split/data split 3'
scaled_split_5 = 'Scaled NeuroScience Data split/data split 5'
scaled_split_10 = 'Scaled NeuroScience Data split/data split 10'

classes_scaled_2 = {}
classes_scaled_3 = {}
classes_scaled_5 = {}
classes_scaled_10 = {}



In [32]:
# 2

classes_scaled_2 = form_data_tree(rootdir=scaled_split_2, classes_data=classes_scaled_2)

# scale(data_tree = classes_scaled_2, is_fixed = True, scaler = scaler)

In [33]:
# 3

classes_scaled_3 = form_data_tree(rootdir=scaled_split_3, classes_data=classes_scaled_3)

# scale(data_tree = classes_scaled_3, is_fixed = True, scaler = scaler)

In [34]:
# 5

classes_scaled_5 = form_data_tree(rootdir=scaled_split_5, classes_data=classes_scaled_5)

# scale(data_tree = classes_scaled_5, is_fixed = True, scaler = scaler)

In [35]:
# 10

classes_scaled_10 = form_data_tree(rootdir=scaled_split_10, classes_data=classes_scaled_10)

# scale(data_tree = classes_scaled_10, is_fixed = True, scaler = scaler)

# 3. Custom Cross-Validation and splitting.

**Making custom 5-fold cross-validation(CV).**

In [36]:
def divide_to_folds(folder_tree, files_in_a_tree):
    """
        Distributes the paths of the tree over 5 folds.

        folder_tree     - nested tree {class: {patient: [csv paths]}}.
        files_in_a_tree - total number of files in the tree. It is
                          divided by 20*8 (number of path lists) to
                          get the average list length, and a fifth of
                          that (rounded down) is the slice size `part`.

        From every list the first four folds receive `part`
        consecutive paths each, and the fifth fold receives the whole
        remainder. `part` is the same for all lists, so with lists of
        unequal length the last fold absorbs the difference.

        Returns a tuple of 5 lists of paths.
    """
    number_of_folds = 5
    elements_per_list = files_in_a_tree / (20 * 8)
    part = int(elements_per_list // number_of_folds)

    folds = tuple([] for _ in range(number_of_folds))

    for patients in folder_tree.values():
        for filenames in patients.values():
            for fold_number in range(number_of_folds - 1):
                begin = fold_number * part
                folds[fold_number].extend(filenames[begin:begin + part])
            # the last fold takes everything that is left
            folds[-1].extend(filenames[(number_of_folds - 1) * part:])

    return folds



In [37]:
# Build the 5 folds for the original dataset and for splits 2, 3 and 5.
# 4000 is the number of files in the original dataset (see len(length_list) above).
folds_slice = divide_to_folds(classes_data, 4000)
folds_slice_2 = divide_to_folds(classes_split_2, count_2)
folds_slice_3 = divide_to_folds(classes_split_3, count_3)
folds_slice_5 = divide_to_folds(classes_split_5, count_5)

# Print the size of every fold, one dataset after another
for i in folds_slice:
    print(len(i))

for i in folds_slice_2:
    print(len(i))

for i in folds_slice_3:
    print(len(i))

for i in folds_slice_5:
    print(len(i))

800
800
800
800
800
800
800
800
800
800
1600
1600
1600
1600
1657
2400
2400
2400
2400
2810


Let's check whether the distribution OF PATIENTS is even IN EVERY ITERATION of the CUSTOM CV.

In [38]:
class CV_iteration:
    """
        Plain container for one cross-validation iteration:
        a display name and the train / valid / test lists.
    """

    def __init__(self, name: str, train_set: list, valid_set: list, test_set: list):
        self.name, self.train_set = name, train_set
        self.valid_set, self.test_set = valid_set, test_set


# EXAMPLE OF 5 FOLD CROSS-VALIDATION.
# LATER, IT WILL BE WRAPPED INTO A FUNCTION.

# iteration_1 = CV_iteration('iteration_1', train_set=fold_1+fold_2 +
#                            fold_3, valid_set=fold_4, test_set=fold_5)
# 
# iteration_2 = CV_iteration('iteration_2', train_set=fold_2+fold_3 +
#                            fold_4, valid_set=fold_5, test_set=fold_1)
# 
# iteration_3 = CV_iteration('iteration_3', train_set=fold_3+fold_4 +
#                            fold_5, valid_set=fold_1, test_set=fold_2)
# 
# iteration_4 = CV_iteration('iteration_4', train_set=fold_4+fold_5 +
#                            fold_1, valid_set=fold_2, test_set=fold_3)
# 
# iteration_5 = CV_iteration('iteration_5', train_set=fold_5+fold_1 +
#                            fold_2, valid_set=fold_3, test_set=fold_4)


In [39]:
# One CV iteration per dataset, all with the same fold assignment:
# folds 0-2 -> train, fold 3 -> valid, fold 4 -> test.
iteration_original = CV_iteration('original', train_set=folds_slice[0]+folds_slice[1] +
                                  folds_slice[2], valid_set=folds_slice[3], test_set=folds_slice[4])

iteration_split_2 = CV_iteration('split 2', train_set=folds_slice_2[0]+folds_slice_2[1] +
                                 folds_slice_2[2], valid_set=folds_slice_2[3], test_set=folds_slice_2[4])

iteration_split_3 = CV_iteration('split 3', train_set=folds_slice_3[0]+folds_slice_3[1] +
                                 folds_slice_3[2], valid_set=folds_slice_3[3], test_set=folds_slice_3[4])

iteration_split_5 = CV_iteration('split 5', train_set=folds_slice_5[0]+folds_slice_5[1] +
                                 folds_slice_5[2], valid_set=folds_slice_5[3], test_set=folds_slice_5[4])


In [40]:
def evaluate_iteration(iteration: CV_iteration):
    """
        Prints, for the train, valid and test sets of the iteration,
        how many file paths belong to each patient of patients_tuple.
    """
    def check_distribution(part_of_dataset):
        # one counter per known patient, all starting from zero
        counts = dict.fromkeys(patients_tuple, 0)
        for patient in counts:
            counts[patient] += sum(
                1
                for filename in part_of_dataset
                # the patient number is the second '_'-separated token
                if int(filename.split('_')[1]) == patient
            )
        return counts
    print(f'''Distribution of patients among train set of {iteration.name} is \n {check_distribution(iteration.train_set)} \n
              Distribution of patients among valid set of {iteration.name} is \n {check_distribution(iteration.valid_set)} \n
              Distribution of patients among test set of {iteration.name} is \n {check_distribution(iteration.test_set)} \n ''')


As we can see, now patients have equal distribution among datasets

In [41]:
evaluate_iteration(iteration_original)
evaluate_iteration(iteration_split_2)
evaluate_iteration(iteration_split_3)
evaluate_iteration(iteration_split_5)



Distribution of patients among train set of original is 
 {1: 120, 2: 120, 3: 120, 4: 120, 5: 120, 6: 120, 7: 120, 8: 120, 9: 120, 10: 120, 11: 120, 12: 120, 13: 120, 14: 120, 15: 120, 16: 120, 17: 120, 18: 120, 19: 120, 20: 120} 

              Distribution of patients among valid set of original is 
 {1: 40, 2: 40, 3: 40, 4: 40, 5: 40, 6: 40, 7: 40, 8: 40, 9: 40, 10: 40, 11: 40, 12: 40, 13: 40, 14: 40, 15: 40, 16: 40, 17: 40, 18: 40, 19: 40, 20: 40} 

              Distribution of patients among test set of original is 
 {1: 40, 2: 40, 3: 40, 4: 40, 5: 40, 6: 40, 7: 40, 8: 40, 9: 40, 10: 40, 11: 40, 12: 40, 13: 40, 14: 40, 15: 40, 16: 40, 17: 40, 18: 40, 19: 40, 20: 40} 
 
Distribution of patients among train set of split 2 is 
 {1: 120, 2: 120, 3: 120, 4: 120, 5: 120, 6: 120, 7: 120, 8: 120, 9: 120, 10: 120, 11: 120, 12: 120, 13: 120, 14: 120, 15: 120, 16: 120, 17: 120, 18: 120, 19: 120, 20: 120} 

              Distribution of patients among valid set of split 2 is 
 {1: 40, 2: 40,

# 4. Making custom train/test/valid dataloaders

Helpful links:

1. Very good thread from forum about LSTM input - https://discuss.pytorch.org/t/understanding-lstm-input/31110 . This link provides a code for implementing custom sequence length (window size).

2. Pytorch LSTM documentation - https://pytorch.org/docs/stable/generated/torch.nn.LSTM.html .

In [42]:
print("PyTorch Version: ", torch.__version__)

PyTorch Version:  1.8.2


In [43]:
class CustomDataset(Dataset):
    """
        Dataset that loads one trial per item from a .csv file.

        filenames - list of paths to the .csv files.
        classes   - dict {class name: integer label}.
        is_fixed  - True when the files are already stored in fixed
                    layout. With False every file is repaired and
                    padded to MAX_LEN rows by fix_df() on load.

        Each item is a tuple (float32 tensor with the file content,
        integer label). The class name is the part of the file name
        before the first '_'.
    """

    def __init__(self, filenames: list, classes: dict, is_fixed: bool):
        self.filenames = filenames
        self.classes = classes
        self.is_fixed = is_fixed

    def __len__(self):
        return len(self.filenames)

    def __getitem__(self, idx):
        # indexing the list raises IndexError past the end, which is
        # what plain iteration over the dataset relies on to stop
        path = self.filenames[idx]

        data = pd.read_csv(path)

        if not self.is_fixed:
            data = fix_df(data, zerolistmaker, MAX_LEN)
        data = torch.from_numpy(np.asarray(data)).float()

        # take the label from the file name itself, so the result does
        # not depend on how deep the file is nested or on the separator
        class_label = os.path.basename(path).split('_')[0]
        label = self.classes[class_label]

        return data, label


Check dataset and dataloader

In [44]:
check_dataset = CustomDataset(filenames=iteration_split_2.train_set, classes=classes, is_fixed=True)
for i, (data, label) in enumerate(check_dataset):
    print(f"Our dataset returns {type(data)} and it's example is {data.shape}")
    print(f"Our label is {label} and it's type is {type(label)}")
    if i == 0: break

Our dataset returns <class 'torch.Tensor'> and it's example is torch.Size([750, 31])
Our label is 3 and it's type is <class 'int'>


In [45]:
dataloader = DataLoader(check_dataset, batch_size=1, shuffle=True)

for i, (data, label) in enumerate(dataloader):
    print(
        f"Our DataLoader returns {type(data)} and it's example is {data.shape}")
    print(f'Our DataLoader looks like {data}')
    print(f"Our label is {label} and it's type is {type(label)}")
    if i == 0:
        break


Our DataLoader returns <class 'torch.Tensor'> and it's example is torch.Size([1, 750, 31])
Our DataLoader looks like tensor([[[-4.3962e+01, -6.6454e+01,  7.0529e+01,  ...,  2.7046e+02,
           1.6381e+02,  6.6089e+01],
         [-4.9975e+01, -3.5423e+01,  6.1607e+01,  ...,  2.6968e+02,
           1.3404e+02,  5.8772e+01],
         [-2.9372e+01,  9.8600e-02,  5.3626e+01,  ...,  2.3977e+02,
           1.0061e+02,  4.5122e+01],
         ...,
         [ 4.0653e+01,  9.6157e+01,  2.7864e+00,  ..., -2.6335e+01,
          -3.0744e+01,  7.9611e+01],
         [ 3.3367e+01,  8.5261e+01,  3.8243e+00,  ..., -8.7090e+00,
          -2.1066e+01,  6.7197e+01],
         [ 2.9253e+01,  6.7275e+01,  1.5440e+01,  ...,  1.8715e+01,
           8.3794e+00,  4.8261e+01]]])
Our label is tensor([1]) and it's type is <class 'torch.Tensor'>


In this example:

1 - **Batch size**.

750 - **Sequence Length**.

31 - **Input size**.

# 5. LSTM pipeline

**Useful links**:

1. PyTorch LSTM implementation can be taken from: https://www.kaggle.com/code/purplejester/a-simple-lstm-based-time-series-classifier/notebook

* In their case - Input == 10, Output == 3.

* In my case - Input == 31, Output == 8.

2. Reading multiple CSV files in PyTorch: https://biswajitsahoo1111.github.io/post/reading-multiple-csv-files-in-pytorch/#sec_2


In [46]:
from torch import nn
from torch.nn import functional as F
from torchmetrics.classification import F1Score


class LSTMClassifier(nn.Module):
    """
    Very simple implementation of LSTM-based time-series classifier.

    input_dim  - number of features per time step.
    hidden_dim - size of the LSTM hidden state.
    layer_dim  - number of stacked LSTM layers.
    output_dim - number of classes (size of the returned logits).

    forward() expects a batch-first tensor (batch, sequence, input_dim)
    and returns raw logits of shape (batch, output_dim), computed from
    the LSTM output of the last time step.
    """

    def __init__(self, input_dim, hidden_dim, layer_dim, output_dim):
        super().__init__()
        self.hidden_dim = hidden_dim
        self.layer_dim = layer_dim
        self.rnn = nn.LSTM(input_dim, hidden_dim, layer_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, output_dim)
        # not used by the visible code, kept for compatibility
        self.batch_size = None
        self.hidden = None

    def forward(self, x):
        h0, c0 = self.init_hidden(x)
        out, (hn, cn) = self.rnn(x, (h0, c0))
        # NOTE(review): only the last time step is classified; for
        # zero-padded sequences that step may be padding - confirm
        out = self.fc(out[:, -1, :])
        return out

    def init_hidden(self, x):
        """
        Returns [h0, c0]: zero initial states of shape
        (layer_dim, batch, hidden_dim), created on the device and with
        the dtype of x, so the model works on CPU as well as on GPU.
        """
        shape = (self.layer_dim, x.size(0), self.hidden_dim)
        return [torch.zeros(*shape, device=x.device, dtype=x.dtype)
                for _ in range(2)]


  warn(f"Failed to load image Python extension: {e}")


Making dataloaders for our pipeline

In [47]:
# Original
train_dl_original = DataLoader(CustomDataset(
    filenames=iteration_original.train_set, classes=classes, is_fixed=False), batch_size=20, shuffle=True)
val_dl_original = DataLoader(CustomDataset(
    filenames=iteration_original.valid_set, classes=classes, is_fixed=False), batch_size=20, shuffle=True)

# split_2
train_2 = DataLoader(CustomDataset(
    filenames=iteration_split_2.train_set, classes=classes, is_fixed=True), batch_size=2, shuffle=True)
val_2 = DataLoader(CustomDataset(
    filenames=iteration_split_2.valid_set, classes=classes, is_fixed=True), batch_size=2, shuffle=True)

# split_3
train_3 = DataLoader(CustomDataset(
    filenames=iteration_split_3.train_set, classes=classes, is_fixed=True), batch_size=20, shuffle=True)
val_3 = DataLoader(CustomDataset(
    filenames=iteration_split_3.valid_set, classes=classes, is_fixed=True), batch_size=20, shuffle=True)

# split_5
train_5 = DataLoader(CustomDataset(
    filenames=iteration_original.train_set, classes=classes, is_fixed=True), batch_size=20, shuffle=True)
val_5 = DataLoader(CustomDataset(
    filenames=iteration_original.valid_set, classes=classes, is_fixed=True), batch_size=20, shuffle=True)


In [51]:
# Model dimensions
input_dim = 31     # Input size: number of columns of one trial
hidden_dim = 256   # Size of the LSTM hidden state
layer_dim = 6      # Number of LSTM layers
output_dim = 8     # Number of labels in `classes`


# Training settings
lr = 0.05                 # Learning rate passed to the optimizer
n_epochs = 50             # Upper limit of training epochs
best_f1 = 0               # Best validation score so far (the training loop computes accuracy)
patience, trials = 15, 0  # Early stopping: allowed / current number of epochs without improvement


In [52]:
# Build the classifier and move it to the GPU (a CUDA device is required here)
model = LSTMClassifier(input_dim, hidden_dim, layer_dim, output_dim)
model = model.cuda()
# nn.CrossEntropyLoss is a loss function, that applies Softmax automatically
criterion = nn.CrossEntropyLoss()
# RMSprop over all model parameters with the learning rate defined above
opt = torch.optim.RMSprop(model.parameters(), lr=lr)


In [53]:
from torchinfo import summary

# This should help with VISUALIZING MODEL ARCHITECTURE

# Pick the GPU when it is available, otherwise the CPU.
# NOTE(review): `dev` is not used by the visible code.
dev = torch.device(
    "cuda") if torch.cuda.is_available() else torch.device("cpu")

# Summary of a fresh (untrained) model for a batch of 5 sequences
# of 1438 steps (the longest trial found above) with 31 features
summary(LSTMClassifier(input_dim, hidden_dim, layer_dim, output_dim),
        input_size=(5, 1438, 31), col_names=["kernel_size", "input_size", "output_size", "num_params"], 
        verbose=2)


Layer (type:depth-idx)                   Kernel Shape              Input Shape               Output Shape              Param #
LSTMClassifier                           --                        [5, 1438, 31]             [5, 8]                    --
├─LSTM: 1-1                              --                        [5, 1438, 31]             [5, 1438, 256]            2,927,616
│    └─weight_ih_l0                      [1024, 31]                                                                    ├─31,744
│    └─weight_hh_l0                      [1024, 256]                                                                   ├─262,144
│    └─bias_ih_l0                        [1024]                                                                        ├─1,024
│    └─bias_hh_l0                        [1024]                                                                        ├─1,024
│    └─weight_ih_l1                      [1024, 256]                                                           

Layer (type:depth-idx)                   Kernel Shape              Input Shape               Output Shape              Param #
LSTMClassifier                           --                        [5, 1438, 31]             [5, 8]                    --
├─LSTM: 1-1                              --                        [5, 1438, 31]             [5, 1438, 256]            2,927,616
│    └─weight_ih_l0                      [1024, 31]                                                                    ├─31,744
│    └─weight_hh_l0                      [1024, 256]                                                                   ├─262,144
│    └─bias_ih_l0                        [1024]                                                                        ├─1,024
│    └─bias_hh_l0                        [1024]                                                                        ├─1,024
│    └─weight_ih_l1                      [1024, 256]                                                           

Training

In [None]:
from torch.utils.tensorboard import SummaryWriter

log_folder = 'runs/Jupyter_split_2'
writer = SummaryWriter(log_folder)
writer.add_text('TEXT', 'Start model training', 0)

In [None]:
from clearml import Task

# task = Task.init(project_name='Neuroscience_thesys',
#                  task_name='pytorch_training_0')

print('Start model training')

# One TensorBoard point per optimisation step. Logging every batch
# with `epoch` as the step would put all batches of an epoch on the
# same x position.
global_step = 0
# Set once a checkpoint of THIS run exists, so a stale 'best.pth'
# from an earlier run is never restored
best_saved = False

for epoch in range(1, n_epochs + 1):

    # ---- training ----
    model.train()
    for x_batch, y_batch in train_2:
        x_batch = x_batch.cuda()
        y_batch = y_batch.cuda()
        opt.zero_grad()
        # SOFTMAX is not needed, because
        # OUR LOSS Function applies it for us
        out = model(x_batch)

        loss = criterion(out, y_batch)
        writer.add_scalar("Loss/train", loss.item(), global_step)
        global_step += 1
        loss.backward()
        opt.step()

    # ---- validation ----
    model.eval()
    correct, total = 0, 0
    # no gradients are needed here, this saves memory and time
    with torch.no_grad():
        for x_val, y_val in val_2:
            x_val, y_val = [t.cuda() for t in (x_val, y_val)]
            # Forward pass
            out = model(x_val)

            # argmax of the logits equals argmax of the (log-)softmax
            preds = out.argmax(dim=1)

            total += y_val.size(0)
            correct += (preds == y_val).sum().item()

    # ACTUALLY, IT'S A ACCURACY
    # (0.0 when the validation loader is empty, instead of a crash)
    f1 = correct / total if total else 0.0
    writer.add_scalar("Accuracy_score", f1, epoch)

    if epoch % 5 == 0:
        print(f'Epoch: {epoch:3d}. Loss: {loss.item():.4f}. f1.: {f1:2.2%}')

    # ---- checkpointing and early stopping ----
    if f1 > best_f1:
        trials = 0
        best_f1 = f1
        torch.save(model.state_dict(), 'best.pth')
        best_saved = True
        print(f'Epoch {epoch} best model saved with f1: {best_f1:2.2%}')
    else:
        trials += 1
        if trials >= patience:
            print(f'Early stopping on epoch {epoch}')
            break
print('The training is finished! Restoring the best model weights')
# actually restore the weights of the best epoch, as the message says
if best_saved:
    model.load_state_dict(torch.load('best.pth'))


**Useful links**:

1. Accessing the network weights in Pytorch: https://stackoverflow.com/questions/56435961/how-to-access-the-network-weights-while-using-pytorch-nn-sequential

2. How To Print Model Architecture And Extract Model Weights in PyTorch: https://clay-atlas.com/us/blog/2021/07/29/pytorch-en-extract-model-layer-weights/