# Data Ingestion: Hard Drive

##### Initial Setup

In [6]:
#Import python modules
import boto3
import configparser
import json
import pandas as pd
import random
import time

In [7]:
#Load configurations
#Ask Riu for access key information and update config file before running
# %run executes load_config.py and makes the names it defines available to the
# cells below.
# NOTE(review): DeliveryStreamName (used by the streaming cell) is not defined in
# any visible cell, so it presumably comes from this script -- confirm.
%run load_config.py

In [10]:
# Build the Firehose client that the streaming cell uses to push records.
# NOTE(review): `askRiu` looks like a redaction placeholder for the real access
# key / secret (see the "Ask Riu" note in the config cell) -- confirm where the
# actual values are expected to come from before running.
firehose = boto3.client(
    "firehose",
    aws_access_key_id=askRiu,
    aws_secret_access_key=askRiu,
    region_name="us-east-2",
)

##### Prepare Data for Streaming

In [24]:
# Load the analysis CSV from the HardDrives folder two levels up.
r_path = "../../HardDrives/"
csv_name = "df_analysis.csv"

# Read the file in one pass (low_memory=False) and discard the leftover
# "Unnamed: 0" column in the same expression.
df = (
    pd.read_csv(f"{r_path}{csv_name}", low_memory=False)
    .drop(columns="Unnamed: 0")
)
df

Unnamed: 0,date,serial_number,model,capacity_bytes,failure,smart_1_normalized,smart_1_raw,smart_3_normalized,smart_3_raw,smart_4_normalized,...,smart_197_raw,smart_198_normalized,smart_198_raw,smart_199_normalized,smart_199_raw,smart_240_raw,smart_241_raw,smart_242_raw,end_date,useful_life
0,2016-01-01,Z300ZST1,ST4000DM000,1.976651e-311,0,120,240872800,91,0,100,...,0,100,0,200,0,8.206430e-320,1.146406e-313,9.784107e-313,2016-01-17,16 days
1,2016-01-01,9VY8TCAV,ST3160318AS,7.907120e-313,0,111,38605541,97,0,100,...,0,100,0,200,0,1.350099e-309,2.004170e-314,2.076417e-314,2016-04-09,99 days
2,2016-01-01,W300BG0H,ST4000DM000,1.976651e-311,0,118,190147064,91,0,100,...,0,100,0,200,0,7.857538e-310,1.249659e-313,6.394443e-313,2016-04-18,108 days
3,2016-01-01,W300J6FX,ST4000DM000,1.976651e-311,0,115,85258376,92,0,100,...,0,100,0,200,0,1.000186e-319,1.324438e-313,6.040397e-313,2016-04-09,99 days
4,2016-01-01,Z3014A4S,ST4000DM000,1.976651e-311,0,116,101976792,91,0,100,...,8,100,8,200,0,8.351192e-320,1.063035e-313,1.019657e-312,2016-01-10,9 days
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5485,2016-04-28,W0Q6KWV8,ST320LT007,1.581370e-312,1,117,147958432,98,0,100,...,0,100,0,200,0,1.938655e-310,4.047112e-315,3.733211e-315,2016-04-28,0 days
5486,2016-04-28,S300XP36,ST4000DM000,1.976651e-311,1,117,146527232,98,0,100,...,0,100,0,200,0,4.058255e-320,1.360172e-313,7.902839e-314,2016-04-28,0 days
5487,2016-04-28,W300T2RC,ST4000DM000,1.976651e-311,1,117,144495808,91,0,100,...,24,100,24,200,0,9.588826e-320,8.115515e-314,5.084032e-313,2016-04-28,0 days
5488,2016-04-29,Z302A13D,ST4000DM000,1.976651e-311,1,119,223384952,97,0,100,...,8,100,8,200,0,5.221780e-320,1.574255e-313,1.112712e-313,2016-04-29,0 days


In [42]:
#Convert specified columns from int64 to string
# NOTE(review): presumably done so json.dumps in the streaming cell can serialize
# these values (it rejects numpy int64) -- confirm. As a consequence these fields
# are emitted as JSON strings, not numbers.
# NOTE(review): smart_7_raw is not in this list although smart_7_normalized is;
# kept exactly as in the original set of conversions -- confirm this is intended.
string_columns = [
    "failure",
    "smart_1_normalized", "smart_1_raw",
    "smart_3_normalized", "smart_3_raw",
    "smart_4_normalized", "smart_4_raw",
    "smart_5_normalized", "smart_5_raw",
    "smart_7_normalized",
    "smart_9_normalized", "smart_9_raw",
    "smart_10_normalized", "smart_10_raw",
    "smart_12_normalized", "smart_12_raw",
    "smart_197_normalized", "smart_197_raw",
    "smart_198_normalized", "smart_198_raw",
    "smart_199_normalized", "smart_199_raw",
]

# One loop instead of one hand-written assignment per column: the set of
# converted columns is now a single list that is easy to audit and extend.
for column in string_columns:
    df[column] = df[column].astype(str)

In [43]:
# Row count of the loaded frame; bounds the streaming loop below.
nrow = len(df.index)

In [44]:
# Fields included in every streamed record, in the order they are written.
column_names = [
    # Drive identity and failure label
    "date", "serial_number", "model", "capacity_bytes", "failure",
    # SMART attributes, listed as normalized/raw pairs where both are present
    "smart_1_normalized", "smart_1_raw",
    "smart_3_normalized", "smart_3_raw",
    "smart_4_normalized", "smart_4_raw",
    "smart_5_normalized", "smart_5_raw",
    "smart_7_normalized", "smart_7_raw",
    "smart_9_normalized", "smart_9_raw",
    "smart_10_normalized", "smart_10_raw",
    "smart_12_normalized", "smart_12_raw",
    "smart_188_raw",
    "smart_194_normalized", "smart_194_raw",
    "smart_197_normalized", "smart_197_raw",
    "smart_198_normalized", "smart_198_raw",
    "smart_199_normalized", "smart_199_raw",
    "smart_240_raw",
    "smart_241_raw",
    "smart_242_raw",
    # Trailing columns of the analysis file
    "end_date", "useful_life",
]

In [45]:
#Simulate data streaming
# Sends one Firehose record per DataFrame row, in row order. Each record is a
# JSON object keyed by column_names, followed by a short random pause.
# NOTE(review): no delimiter (e.g. "\n") is appended to the JSON text; Firehose
# does not add one itself, so confirm the destination can split back-to-back
# JSON objects.
for i in range(nrow):
    # Resolve the row label once instead of once per column.
    row_label = df.index[i]

    # Build the payload from column_names so the record layout is driven by
    # that single list rather than by 35 hand-indexed entries.
    payload = {name: df.at[row_label, name] for name in column_names}

    response = firehose.put_record(
        DeliveryStreamName = DeliveryStreamName,
        Record = {"Data": json.dumps(payload)}
    )

    # Random 10-150 ms gap between records (rounded to whole milliseconds).
    time.sleep(round(random.uniform(0.01, 0.15), 3))