In [50]:
import numpy as np 
import pandas as pd

years_list = np.arange(2015,2101).tolist()

# checking the list is correct 
# for x in range (len(years_list)):
#     print(years_list[x])
    

for i in range(len(years_list)):
    years_list[i] = "data/AIS-86Year/Raw/vars-" +str(years_list[i]) +".txt"

precipitation = pd.read_csv('data/AIS_data/precipitation-0.txt', delimiter='\t', names=["x-axis", "y-axis", "precipitation"])

def create_df(filepath, delimiter, original_column_names):
    df = pd.read_csv(filepath, delimiter = delimiter, names=original_column_names)
    return df

def remove_outliers(df):
    attributes = df.columns  # List of all column names in the DataFrame
    n_attributes = len(attributes)
    for i, attribute in enumerate(attributes):
        # Removing outliers using IQR
        Q1 = df[attribute].quantile(0.25)
        Q3 = df[attribute].quantile(0.75)
        IQR = Q3 - Q1  
        #I used a factor of 1.5 - just because it seems a common choice 
        filtered_df = df[(df[attribute] >= (Q1 - 1.5 * IQR)) & (df[attribute] <= (Q3 + 1.5 * IQR))]
    return filtered_df

def remove_rows(df):
    #this removes rows which have fill values of 9.96920996839e+36 for the column ocean_temp
    fill_value = 9.969210e+36
    tolerance = 1e-1  # Adjust the tolerance as needed
    # Use numpy isclose to compare with a tolerance
    mask = np.isclose(df['ocean_temp'], fill_value, atol=tolerance)
    # Invert the mask to filter out the rows with the fill value
    df = df[~mask]
    return df


def preprocess_individual_timeslice(df):
    remove_outliers(df)
    remove_rows(df)
    return df
    

def process_individual_files(file_list, original_column_names):
    processed_dfs = []
    for file in file_list:
        df = create_df(file, '\t', original_column_names)
        processed_df = preprocess_individual_timeslice(df)
        processed_dfs.append(processed_df)
    return processed_dfs

original_column_names = ['x-axis', 'y-axis', 'ice_thickness', 'ice_velocity', 
                'ice_mask', 'precipitation', 'air_temp', 'ocean_temp']

processed_dataframes = process_individual_files(years_list, original_column_names)


processed_dataframes[0].head()

Unnamed: 0,x-axis,y-axis,ice_thickness,ice_velocity,ice_mask,precipitation,air_temp,ocean_temp
0,-3040000,3040000,0.0,,4.0,729.484314,292.419525,9.969210000000001e+36
1,-2918400,3040000,0.0,,4.0,694.645996,273.800354,9.969210000000001e+36
2,-2796800,3040000,0.0,,4.0,691.841736,273.366791,9.969210000000001e+36
3,-2675200,3040000,0.0,,4.0,679.350647,272.888672,9.969210000000001e+36
4,-2553600,3040000,0.0,,4.0,659.84552,272.373596,9.969210000000001e+36
