<center>
    <h1> CardioRisk: Unraveling Patterns for Early Diagnosis Using Machine Learning </h1>
    <h2> Converting .DATA files to readable .CSV files </h2>
    <h4> Akansha Malviya, Shruti Badrinarayanan, Poojitha Venkat Ram, Vani Kancherlapalli </h4>
</center>

In [1]:
import pandas as pd
import itertools
import csv

## Hungary

In [2]:
# Decode as Latin1: every byte value maps to a character, so unexpected
# bytes in the raw file cannot raise a decoding error
with open('Raw Data/Raw_Data_Hungary.data', encoding='Latin1') as fd:
    # keep every line in memory, without surrounding whitespace
    lines = list(map(str.strip, fd))

# Glue each run of 10 consecutive lines into one string and split it on
# whitespace. Iteration stops at the first chunk that does not yield exactly
# 76 fields, so nothing after that point is loaded.
data = itertools.takewhile(
    lambda fields: len(fields) == 76,
    map(lambda start: ' '.join(lines[start:start + 10]).split(),
        range(0, len(lines), 10)),
)

# one DataFrame row per 76-field chunk
df_hungary = pd.DataFrame.from_records(data)
df_hungary.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,66,67,68,69,70,71,72,73,74,75
0,1254,0,40,1,1,0,0,-9,2,140,...,-9,-9,1,1,1,1,1,-9.0,-9.0,name
1,1255,0,49,0,1,0,0,-9,3,160,...,-9,-9,1,1,1,1,1,-9.0,-9.0,name
2,1256,0,37,1,1,0,0,-9,2,130,...,-9,-9,1,1,1,1,1,-9.0,-9.0,name
3,1257,0,48,0,1,1,1,-9,4,138,...,2,-9,1,1,1,1,1,-9.0,-9.0,name
4,1258,0,54,1,1,0,1,-9,3,150,...,1,-9,1,1,1,1,1,-9.0,-9.0,name


In [3]:
# Write the parsed records to CSV; index=False keeps the DataFrame's row
# index out of the file
df_hungary.to_csv(
    path_or_buf="Processed CSVs/Processed_Data_Hungary.csv",
    index=False,
)

# Read the file back to inspect what was written
df1 = pd.read_csv(filepath_or_buffer="Processed CSVs/Processed_Data_Hungary.csv")
df1.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,66,67,68,69,70,71,72,73,74,75
0,1254,0,40,1,1,0,0,-9,2,140,...,-9,-9,1,1,1,1,1,-9.0,-9.0,name
1,1255,0,49,0,1,0,0,-9,3,160,...,-9,-9,1,1,1,1,1,-9.0,-9.0,name
2,1256,0,37,1,1,0,0,-9,2,130,...,-9,-9,1,1,1,1,1,-9.0,-9.0,name
3,1257,0,48,0,1,1,1,-9,4,138,...,2,-9,1,1,1,1,1,-9.0,-9.0,name
4,1258,0,54,1,1,0,1,-9,3,150,...,1,-9,1,1,1,1,1,-9.0,-9.0,name


#### Adding a Location column (setting the location to "Hungary" for all Hungarian records)

In [4]:
# Value written into the new "Location" column for every record of this file
constant_value = "Hungary"

# Load the whole CSV into memory as a list of rows (lists of strings).
# newline='' lets the csv module handle line endings itself, and utf-8
# matches the default encoding pandas uses when writing/reading the file.
with open('Processed CSVs/Processed_Data_Hungary.csv', 'r',
          newline='', encoding='utf-8') as file:
    reader = csv.reader(file)
    data = list(reader)

header = data[0]

# Only add the column when it is not already the first one: re-running this
# cell would otherwise prepend a duplicate "Location" column every time.
if header[:1] != ["Location"]:
    header.insert(0, "Location")

    # Prepend the constant value to every data row (everything after the header)
    for row in data[1:]:
        row.insert(0, constant_value)

    # Write the modified rows back to the same CSV file
    with open('Processed CSVs/Processed_Data_Hungary.csv', 'w',
              newline='', encoding='utf-8') as file:
        writer = csv.writer(file)
        writer.writerows(data)

# Reload through pandas to inspect the result
dfHungary = pd.read_csv("Processed CSVs/Processed_Data_Hungary.csv")
dfHungary.head()

Unnamed: 0,Location,0,1,2,3,4,5,6,7,8,...,66,67,68,69,70,71,72,73,74,75
0,Hungary,1254,0,40,1,1,0,0,-9,2,...,-9,-9,1,1,1,1,1,-9.0,-9.0,name
1,Hungary,1255,0,49,0,1,0,0,-9,3,...,-9,-9,1,1,1,1,1,-9.0,-9.0,name
2,Hungary,1256,0,37,1,1,0,0,-9,2,...,-9,-9,1,1,1,1,1,-9.0,-9.0,name
3,Hungary,1257,0,48,0,1,1,1,-9,4,...,2,-9,1,1,1,1,1,-9.0,-9.0,name
4,Hungary,1258,0,54,1,1,0,1,-9,3,...,1,-9,1,1,1,1,1,-9.0,-9.0,name


## Cleveland

In [5]:
# Decode as Latin1: every byte value maps to a character, so unexpected
# bytes in the raw file cannot raise a decoding error
with open('Raw Data/Raw_Data_Cleveland.data', encoding='Latin1') as fd:
    # keep every line in memory, without surrounding whitespace
    lines = list(map(str.strip, fd))

# Glue each run of 10 consecutive lines into one string and split it on
# whitespace. Iteration stops at the first chunk that does not yield exactly
# 76 fields, so nothing after that point is loaded.
data = itertools.takewhile(
    lambda fields: len(fields) == 76,
    map(lambda start: ' '.join(lines[start:start + 10]).split(),
        range(0, len(lines), 10)),
)

# one DataFrame row per 76-field chunk
df_cleveland = pd.DataFrame.from_records(data)
df_cleveland.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,66,67,68,69,70,71,72,73,74,75
0,1,0,63,1,-9,-9,-9,-9,1,145,...,1,1,1,1,1,1,1,-9,-9,name
1,2,0,67,1,-9,-9,-9,-9,4,160,...,1,1,1,1,1,1,1,-9,-9,name
2,3,0,67,1,-9,-9,-9,-9,4,120,...,2,2,1,1,1,7,3,-9,-9,name
3,4,0,37,1,-9,-9,-9,-9,3,130,...,1,1,1,1,1,1,1,-9,-9,name
4,6,0,41,0,-9,-9,-9,-9,2,130,...,1,1,1,1,1,1,1,-9,-9,name


In [6]:
# Write the parsed records to CSV; index=False keeps the DataFrame's row
# index out of the file
df_cleveland.to_csv(
    path_or_buf="Processed CSVs/Processed_Data_Cleveland.csv",
    index=False,
)

# Read the file back to inspect what was written
df1 = pd.read_csv(filepath_or_buffer="Processed CSVs/Processed_Data_Cleveland.csv")
df1.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,66,67,68,69,70,71,72,73,74,75
0,1,0,63,1,-9,-9,-9,-9,1,145,...,1,1,1,1,1,1,1,-9,-9,name
1,2,0,67,1,-9,-9,-9,-9,4,160,...,1,1,1,1,1,1,1,-9,-9,name
2,3,0,67,1,-9,-9,-9,-9,4,120,...,2,2,1,1,1,7,3,-9,-9,name
3,4,0,37,1,-9,-9,-9,-9,3,130,...,1,1,1,1,1,1,1,-9,-9,name
4,6,0,41,0,-9,-9,-9,-9,2,130,...,1,1,1,1,1,1,1,-9,-9,name


#### Adding a Location column (setting the location to "Cleveland" for all Cleveland records)

In [7]:
# Value written into the new "Location" column for every record of this file
constant_value = "Cleveland"

# Load the whole CSV into memory as a list of rows (lists of strings).
# newline='' lets the csv module handle line endings itself, and utf-8
# matches the default encoding pandas uses when writing/reading the file.
with open('Processed CSVs/Processed_Data_Cleveland.csv', 'r',
          newline='', encoding='utf-8') as file:
    reader = csv.reader(file)
    data = list(reader)

header = data[0]

# Only add the column when it is not already the first one: re-running this
# cell would otherwise prepend a duplicate "Location" column every time.
if header[:1] != ["Location"]:
    header.insert(0, "Location")

    # Prepend the constant value to every data row (everything after the header)
    for row in data[1:]:
        row.insert(0, constant_value)

    # Write the modified rows back to the same CSV file
    with open('Processed CSVs/Processed_Data_Cleveland.csv', 'w',
              newline='', encoding='utf-8') as file:
        writer = csv.writer(file)
        writer.writerows(data)

# Reload through pandas to inspect the result
dfCleveland = pd.read_csv("Processed CSVs/Processed_Data_Cleveland.csv")
dfCleveland.head()

Unnamed: 0,Location,0,1,2,3,4,5,6,7,8,...,66,67,68,69,70,71,72,73,74,75
0,Cleveland,1,0,63,1,-9,-9,-9,-9,1,...,1,1,1,1,1,1,1,-9,-9,name
1,Cleveland,2,0,67,1,-9,-9,-9,-9,4,...,1,1,1,1,1,1,1,-9,-9,name
2,Cleveland,3,0,67,1,-9,-9,-9,-9,4,...,2,2,1,1,1,7,3,-9,-9,name
3,Cleveland,4,0,37,1,-9,-9,-9,-9,3,...,1,1,1,1,1,1,1,-9,-9,name
4,Cleveland,6,0,41,0,-9,-9,-9,-9,2,...,1,1,1,1,1,1,1,-9,-9,name


## Long Beach VA

In [8]:
# Decode as Latin1: every byte value maps to a character, so unexpected
# bytes in the raw file cannot raise a decoding error
with open('Raw Data/Raw_Data_LongBeachVA.data', encoding='Latin1') as fd:
    # keep every line in memory, without surrounding whitespace
    lines = list(map(str.strip, fd))

# Glue each run of 10 consecutive lines into one string and split it on
# whitespace. Iteration stops at the first chunk that does not yield exactly
# 76 fields, so nothing after that point is loaded.
data = itertools.takewhile(
    lambda fields: len(fields) == 76,
    map(lambda start: ' '.join(lines[start:start + 10]).split(),
        range(0, len(lines), 10)),
)

# one DataFrame row per 76-field chunk
df_longbeachva = pd.DataFrame.from_records(data)
df_longbeachva.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,66,67,68,69,70,71,72,73,74,75
0,1,0,63,1,1,1,1,-9,4,140,...,2,1,1,1,1,1,1,0.7,5.5,name
1,2,0,44,1,1,1,1,-9,4,130,...,1,1,1,1,1,1,1,0.5,-9.0,name
2,3,0,60,1,1,1,1,-9,4,132,...,2,1,1,1,1,7,2,0.52,4.1,name
3,4,0,55,1,1,1,1,-9,4,142,...,1,1,1,1,1,1,1,0.73,6.5,name
4,5,0,66,1,1,0,0,-9,3,110,...,1,1,1,1,1,1,1,0.73,8.0,name


In [9]:
# Write the parsed records to CSV; index=False keeps the DataFrame's row
# index out of the file
df_longbeachva.to_csv(
    path_or_buf="Processed CSVs/Processed_Data_LongBeachVA.csv",
    index=False,
)

# Read the file back to inspect what was written
df1 = pd.read_csv(filepath_or_buffer="Processed CSVs/Processed_Data_LongBeachVA.csv")
df1.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,66,67,68,69,70,71,72,73,74,75
0,1,0,63,1,1,1,1,-9,4,140,...,2,1,1,1,1,1,1,0.7,5.5,name
1,2,0,44,1,1,1,1,-9,4,130,...,1,1,1,1,1,1,1,0.5,-9.0,name
2,3,0,60,1,1,1,1,-9,4,132,...,2,1,1,1,1,7,2,0.52,4.1,name
3,4,0,55,1,1,1,1,-9,4,142,...,1,1,1,1,1,1,1,0.73,6.5,name
4,5,0,66,1,1,0,0,-9,3,110,...,1,1,1,1,1,1,1,0.73,8.0,name


#### Adding a Location column (setting the location to "LongBeachVA" for all Long Beach VA records)

In [10]:
# Value written into the new "Location" column for every record of this file
constant_value = "LongBeachVA"

# Load the whole CSV into memory as a list of rows (lists of strings).
# newline='' lets the csv module handle line endings itself, and utf-8
# matches the default encoding pandas uses when writing/reading the file.
with open('Processed CSVs/Processed_Data_LongBeachVA.csv', 'r',
          newline='', encoding='utf-8') as file:
    reader = csv.reader(file)
    data = list(reader)

header = data[0]

# Only add the column when it is not already the first one: re-running this
# cell would otherwise prepend a duplicate "Location" column every time.
if header[:1] != ["Location"]:
    header.insert(0, "Location")

    # Prepend the constant value to every data row (everything after the header)
    for row in data[1:]:
        row.insert(0, constant_value)

    # Write the modified rows back to the same CSV file
    with open('Processed CSVs/Processed_Data_LongBeachVA.csv', 'w',
              newline='', encoding='utf-8') as file:
        writer = csv.writer(file)
        writer.writerows(data)

# Reload through pandas to inspect the result
dfLongBeachVA = pd.read_csv("Processed CSVs/Processed_Data_LongBeachVA.csv")
dfLongBeachVA.head()

Unnamed: 0,Location,0,1,2,3,4,5,6,7,8,...,66,67,68,69,70,71,72,73,74,75
0,LongBeachVA,1,0,63,1,1,1,1,-9,4,...,2,1,1,1,1,1,1,0.7,5.5,name
1,LongBeachVA,2,0,44,1,1,1,1,-9,4,...,1,1,1,1,1,1,1,0.5,-9.0,name
2,LongBeachVA,3,0,60,1,1,1,1,-9,4,...,2,1,1,1,1,7,2,0.52,4.1,name
3,LongBeachVA,4,0,55,1,1,1,1,-9,4,...,1,1,1,1,1,1,1,0.73,6.5,name
4,LongBeachVA,5,0,66,1,1,0,0,-9,3,...,1,1,1,1,1,1,1,0.73,8.0,name


## Switzerland

In [11]:
# Decode as Latin1: every byte value maps to a character, so unexpected
# bytes in the raw file cannot raise a decoding error
with open('Raw Data/Raw_Data_Switzerland.data', encoding='Latin1') as fd:
    # keep every line in memory, without surrounding whitespace
    lines = list(map(str.strip, fd))

# Glue each run of 10 consecutive lines into one string and split it on
# whitespace. Iteration stops at the first chunk that does not yield exactly
# 76 fields, so nothing after that point is loaded.
data = itertools.takewhile(
    lambda fields: len(fields) == 76,
    map(lambda start: ' '.join(lines[start:start + 10]).split(),
        range(0, len(lines), 10)),
)

# one DataFrame row per 76-field chunk
df_switzerland = pd.DataFrame.from_records(data)
df_switzerland.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,66,67,68,69,70,71,72,73,74,75
0,3001,0,65,1,1,1,1,-9,4,115,...,1,1,1,1,1,1,1,75,-9.0,name
1,3002,0,32,1,0,0,0,-9,1,95,...,1,1,1,1,1,5,1,63,-9.0,name
2,3003,0,61,1,1,1,1,-9,4,105,...,2,1,1,1,1,1,1,67,-9.0,name
3,3004,0,50,1,1,1,1,-9,4,145,...,1,1,1,1,1,5,4,36,-9.0,name
4,3005,0,57,1,1,1,1,-9,4,110,...,2,1,1,1,1,1,1,60,-9.0,name


In [12]:
# Write the parsed records to CSV; index=False keeps the DataFrame's row
# index out of the file
df_switzerland.to_csv(
    path_or_buf="Processed CSVs/Processed_Data_Switzerland.csv",
    index=False,
)

# Read the file back to inspect what was written
df1 = pd.read_csv(filepath_or_buffer="Processed CSVs/Processed_Data_Switzerland.csv")
df1.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,66,67,68,69,70,71,72,73,74,75
0,3001,0,65,1,1,1,1,-9,4,115,...,1,1,1,1,1,1,1,75.0,-9.0,name
1,3002,0,32,1,0,0,0,-9,1,95,...,1,1,1,1,1,5,1,63.0,-9.0,name
2,3003,0,61,1,1,1,1,-9,4,105,...,2,1,1,1,1,1,1,67.0,-9.0,name
3,3004,0,50,1,1,1,1,-9,4,145,...,1,1,1,1,1,5,4,36.0,-9.0,name
4,3005,0,57,1,1,1,1,-9,4,110,...,2,1,1,1,1,1,1,60.0,-9.0,name


#### Adding a Location column (setting the location to "Switzerland" for all Swiss records)

In [13]:
# Value written into the new "Location" column for every record of this file
constant_value = "Switzerland"

# Load the whole CSV into memory as a list of rows (lists of strings).
# newline='' lets the csv module handle line endings itself, and utf-8
# matches the default encoding pandas uses when writing/reading the file.
with open('Processed CSVs/Processed_Data_Switzerland.csv', 'r',
          newline='', encoding='utf-8') as file:
    reader = csv.reader(file)
    data = list(reader)

header = data[0]

# Only add the column when it is not already the first one: re-running this
# cell would otherwise prepend a duplicate "Location" column every time.
if header[:1] != ["Location"]:
    header.insert(0, "Location")

    # Prepend the constant value to every data row (everything after the header)
    for row in data[1:]:
        row.insert(0, constant_value)

    # Write the modified rows back to the same CSV file
    with open('Processed CSVs/Processed_Data_Switzerland.csv', 'w',
              newline='', encoding='utf-8') as file:
        writer = csv.writer(file)
        writer.writerows(data)

# Reload through pandas to inspect the result
dfSwitzerland = pd.read_csv("Processed CSVs/Processed_Data_Switzerland.csv")
dfSwitzerland.head()

Unnamed: 0,Location,0,1,2,3,4,5,6,7,8,...,66,67,68,69,70,71,72,73,74,75
0,Switzerland,3001,0,65,1,1,1,1,-9,4,...,1,1,1,1,1,1,1,75.0,-9.0,name
1,Switzerland,3002,0,32,1,0,0,0,-9,1,...,1,1,1,1,1,5,1,63.0,-9.0,name
2,Switzerland,3003,0,61,1,1,1,1,-9,4,...,2,1,1,1,1,1,1,67.0,-9.0,name
3,Switzerland,3004,0,50,1,1,1,1,-9,4,...,1,1,1,1,1,5,4,36.0,-9.0,name
4,Switzerland,3005,0,57,1,1,1,1,-9,4,...,2,1,1,1,1,1,1,60.0,-9.0,name
