# Data Preprocessing & Sampling - Unhealthy Patients

# Importing Libraries

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
from pandas import ExcelWriter
import os
import numpy as np
import random
# matplotlib 3.6 renamed its bundled seaborn style sheets to "seaborn-v0_8*"
# and later releases dropped the old names, where style.use raises OSError.
# Try the original name first so older installations keep their exact look.
try:
    plt.style.use("seaborn")
except OSError:
    plt.style.use("seaborn-v0_8")

# Defining Supporting functions

In [None]:
def read_clean_4(filename, directory='D:/DATA SCIENCE/INTERNSHIP PROJECT/Unhealthy/'):
    """Read a raw recording and return it as a dataframe of text columns.

    The file is loaded without a header row, so every line ends up as one
    string in column 0 (this assumes the lines contain no commas, the default
    read_csv separator).  That string is then peeled apart on tab characters
    into "Time", "Channel1", "Channel2", "Channel3" and "Channel4"; anything
    after the fourth tab stays inside "Channel4".

    filename  -- name of the file inside ``directory``
    directory -- folder holding the raw files (defaults to the project folder)

    Returns a pandas dataframe with the five string columns listed above.
    """
    data = pd.read_csv(os.path.join(directory, filename), header=None)

    # Split off one field at a time.  ``n`` must be passed by keyword:
    # pandas 2.0 no longer accepts it positionally.
    data[["Time", "Channel1"]] = data[0].str.split('\t', n=1, expand=True)
    channels = ["Channel1", "Channel2", "Channel3", "Channel4"]
    for current, following in zip(channels, channels[1:]):
        data[[current, following]] = data[current].str.split('\t', n=1, expand=True)

    # The unsplit raw line is no longer needed.
    data.drop(0, axis=1, inplace=True)
    return data

In [None]:
def split(data):
    """Cut the dataframe into consecutive segments, one per "Interval=" row.

    Every row whose "Time" value equals "Interval=" opens a new segment that
    runs up to (but not including) the next such row; the last segment runs to
    the end of the dataframe.  The marker index labels are used as positions,
    so the dataframe is expected to carry its default 0..n-1 index.

    Returns a list of dataframes.  Raises IndexError when no marker row exists.
    """
    marker_rows = data.loc[data['Time'] == "Interval="]
    starts = marker_rows.index

    # Pair each marker with the one that follows it.
    segments = [
        data.iloc[begin:finish, ]
        for begin, finish in zip(starts[:-1], starts[1:])
    ]
    # The final marker owns everything up to the end of the recording.
    segments.append(data.iloc[starts[-1]:, ])
    return segments

In [None]:
def stats(data, filename):
    """Evaluate the features (independent variables) of one swallow sample.

    data     -- dataframe with "Sensor1", "Sensor2" and "Sensor3" columns;
                its index values must be convertible to float
    filename -- label stored as the first element of the result

    Returns ``[filename, sensor1_max, sensor2_max, sensor3_min, latency]``
    where latency is the index value of the first Sensor2 maximum minus the
    index value of the first Sensor1 maximum.
    """
    description = data.describe()
    sensor1_max = description.loc["max", "Sensor1"]
    sensor2_max = description.loc["max", "Sensor2"]
    sensor3_min = description.loc["min", "Sensor3"]

    # Index label of the first row at which each sensor reaches its maximum.
    sensor1_maxtime = data[data['Sensor1'] == sensor1_max].index.values[0]
    sensor2_maxtime = data[data['Sensor2'] == sensor2_max].index.values[0]

    latency = float(sensor2_maxtime) - float(sensor1_maxtime)
    return [filename, sensor1_max, sensor2_max, sensor3_min, latency]

In [None]:
def filenames(path):
    """Return the names of all entries found in the directory ``path`` as a list of strings."""
    return [os.fsdecode(entry) for entry in os.listdir(path)]

In [None]:
def generate_sample(filename,data):
    """Extract, plot and describe one randomly chosen sample from a recording.

    filename -- name of the recording; forwarded to ``stats`` and returned
    data     -- dataframe with "Sensor1", "Sensor2", "Sensor3" columns whose
                index holds time values rounded to 3 decimals (neighbouring
                rows are looked up in steps of 0.001)

    A random local Sensor1 maximum above 80 is picked.  The sample start is the
    time of the highest Sensor3 value in the second before that peak, the end
    is the time of the highest Sensor3 value in the second after it, and the
    returned sample covers ``start - 1`` to ``end + 1``.  The sample is plotted
    with the Sensor1/Sensor2 maxima and Sensor3 minimum in red and the
    start/end points in blue.

    Returns ``[filename, sample_dataframe, features]`` where features is the
    list produced by ``stats`` extended with start, end and ``end - start``,
    or ``0`` when the recording has no qualifying Sensor1 peak.

    Raises KeyError if a timestamp inside the one-second search windows is
    missing from ``data`` (for example a peak too close to the recording end).
    """
    candidates = data.loc[data["Sensor1"] > 80]
    max_points = []
    for i in candidates.index:
        i = round(i, 3)
        try:
            is_peak = (
                data.loc[i, "Sensor1"] > data.loc[round(i + 0.001, 3), "Sensor1"]
                and data.loc[i, "Sensor1"] > data.loc[round(i - 0.001, 3), "Sensor1"]
            )
        except (KeyError, ValueError):
            # KeyError: a neighbouring timestamp does not exist (edge or gap).
            # ValueError: duplicated timestamps make the comparison ambiguous.
            # Either way this point cannot be confirmed as a local maximum.
            continue
        if is_peak:
            max_points.append(i)

    if not max_points:
        return 0

    time = round(random.choice(max_points), 3)

    # Look back at most one second, but never before time zero.
    if time - 1 < 0:
        array = np.arange(time, 0, -0.001)
    else:
        array = np.arange(time, time - 1, -0.001)
    list1 = [data.loc[round(element, 3), "Sensor3"] for element in array]
    start = round(array[list1.index(max(list1))], 3)

    # Look ahead one second.
    array = np.arange(time, time + 1, 0.001)
    list2 = [data.loc[round(element, 3), "Sensor3"] for element in array]
    end = round(array[list2.index(max(list2))], 3)

    # Copy the slice so adding the "Time" column never writes into ``data``
    # and does not trigger pandas' chained-assignment warning.
    data2 = data.loc[start-1:end+1,].copy()
    data2["Time"] = data2.index
    stat = stats(data2, filename)

    ax = data2.plot.line(x='Time', y=["Sensor1", "Sensor2", "Sensor3"],figsize = (15, 8 ))
    b = data2[data2["Sensor1"] == stat[1]]
    c = data2[data2["Sensor2"] == stat[2]]
    d = data2[data2["Sensor3"] == stat[3]]
    fro = data2[data2["Sensor3"] == data2.loc[start, "Sensor3"]]
    to = data2[data2["Sensor3"] == data2.loc[end, "Sensor3"]]
    b.plot.scatter(x="Time", y="Sensor1", ax=ax,color="r", marker="o",s=50)
    c.plot.scatter(x="Time", y="Sensor2", ax=ax,color="r", marker="o",s=50)
    d.plot.scatter(x="Time", y="Sensor3", ax=ax,color="r", marker="o",s=50)
    fro.plot.scatter(x="Time", y="Sensor3", ax=ax,color="b", marker="o",s=50)
    to.plot.scatter(x="Time", y="Sensor3", ax=ax,color="b", marker="o",s=50)
    plt.show()

    stat.append(start)
    stat.append(end)
    stat.append(end - start)
    return [filename,data2,stat]

In [None]:
def save_sample(sample_no, sample, Parameters):
    """Ask the user whether to keep a sample and, if so, store it.

    sample_no  -- number used in the output file name and as the row label
    sample     -- ``[filename, sample_dataframe, features]``
    Parameters -- dataframe collecting the features of all saved samples

    When the user answers exactly "Y", the sample dataframe is written to
    ``<Samples>/<name>/<name>_<sample_no>.xlsx`` (name = filename without its
    last four characters) and the features become row ``sample_no`` of
    ``Parameters``.  Any other answer leaves everything untouched.

    Returns ``Parameters`` (modified in place when the sample was saved).
    """
    save = input("Enter Y/N")
    if save == "Y" :
        patient = sample[0][:-4]
        target = ('D:/DATA SCIENCE/INTERNSHIP PROJECT/Unhealthy/Samples/'
                  + patient + "/" + patient + "_" + str(sample_no) + ".xlsx")
        # The context manager saves and closes the workbook; the explicit
        # ``writer.save()`` call no longer exists in pandas 2.0.
        with ExcelWriter(target) as writer:
            sample[1].to_excel(writer, sheet_name='Sheet1')
        Parameters.loc[sample_no] = sample[2]
    return Parameters

## Main

In [None]:
# Running counter: row label in Parameters and suffix of each sample file.
sample_no = 0

# One row of features per saved sample (rows are added by save_sample).
Parameters = pd.DataFrame(
    columns=[
        "Patient",
        'Sensor1_max',
        'Sensor2_max',
        'Sensor3_min',
        "Latency",
        "UES_start",
        "UES_end",
        "UES_Duration",
    ]
)

# Folder holding the raw recordings, and the names of the entries inside it.
directory = "D:/DATA SCIENCE/INTERNSHIP PROJECT/Unhealthy/"
files = filenames(directory)

In [None]:
for file in files:
    # The listing also contains sub-folders, notably the "Samples" output
    # folder created below by an earlier run; only regular files are
    # recordings that read_clean_4 can load.
    if not os.path.isfile(directory + file):
        continue

    # One output folder per recording, named after the file without its last
    # four characters (save_sample builds the same path).
    sample_dir = "D:/DATA SCIENCE/INTERNSHIP PROJECT/Unhealthy/Samples/" + file[:-4]
    os.makedirs(sample_dir, exist_ok=True)

    # Only load the recording when samples are still missing.
    if len(os.listdir(sample_dir)) < 5:
        data = read_clean_4(file)
        data_list = split(data)

    # NOTE: the loop ends only once five files exist in the folder, i.e. it
    # keeps drawing and prompting until enough samples have been accepted.
    while len(os.listdir(sample_dir)) < 5:
        # Skip the first nine rows of the segment; copy so the edits below
        # never write into the frames kept in data_list.
        data = random.choice(data_list)[9:].copy()
        data["S.No"] = data.index
        for column in ("Time", "Channel1", "Channel2", "Channel3"):
            data[column] = pd.to_numeric(data[column], errors='coerce')
        data.index = data["Time"]
        data = data[["S.No", "Channel1", "Channel2", "Channel3"]]
        data.columns = ["S.No", "Sensor1", "Sensor2", "Sensor3"]
        # generate_sample looks rows up by time rounded to 3 decimals.
        data.index = data.index.to_series().apply(lambda x: np.round(x,3))

        sample = generate_sample(file,data)
        if sample != 0:
            # Reject a sample whose start (rounded to a whole number) matches
            # one of the five most recently saved samples.
            my_rounded_list = [round(elem, 0) for elem in list(Parameters["UES_start"])]
            if round(sample[2][5],0) not in my_rounded_list[-5:]:
                Parameters = save_sample(sample_no, sample, Parameters)
                sample_no = sample_no + 1

In [None]:
# Store the features of all saved samples.  The context manager saves and
# closes the workbook; ``writer.save()`` no longer exists in pandas 2.0.
with ExcelWriter('D:/DATA SCIENCE/INTERNSHIP PROJECT/Unhealthy/Samples/Parameters.xlsx') as writer:
    Parameters.to_excel(writer, sheet_name='Sheet1')