### boto3 呼叫 aws bucket

參考資料：

https://dev.to/gbhorwood/managing-your-aws-resources-with-python-and-boto3-1ceb

https://dev.to/aws-builders/how-to-list-contents-of-s3-bucket-using-boto3-python-47mm

https://boto3.amazonaws.com/v1/documentation/api/latest/guide/session.html

https://www.learnaws.org/2022/09/27/pandas-read-s3/

https://aws-sdk-pandas.readthedocs.io/en/stable/tutorials/002%20-%20Sessions.html

In [27]:


import sys
import boto3

# configuration

# --> NEW
# the name of the aws profile
aws_profile = "house-good-job_john"

# --> NEW
# aws credentials read in from ~/.aws/credentials
aws_access_key_id = None
aws_secret_access_key = None
aws_region = "ap-northeast-1"


# --> NEW
def get_aws_credentials():
    """
    Gets the aws credentials from ~/.aws/credentials
    for the aws profile name
    """
    import configparser
    import os

    # the aws profile we configured
    global aws_profile

    # the global variables where we store the aws credentials
    global aws_access_key_id
    global aws_secret_access_key

    # parse the aws credentials file
    path = os.environ['HOME'] + '/.aws/credentials'
    config = configparser.ConfigParser()
    config.read(path)

    # read in the aws_access_key_id and the aws_secret_access_key
    # if the profile does not exist, error and exit
    if aws_profile in config.sections():
        aws_access_key_id = config[aws_profile]['aws_access_key_id']
        aws_secret_access_key = config[aws_profile]['aws_secret_access_key']
    else:
        print("Cannot find profile '{}' in {}".format(aws_profile, path), True)
        sys.exit()

    # if we don't have both the access and secret key, error and exit
    if aws_access_key_id is None or aws_secret_access_key is None:
        print("AWS config values not set in '{}' in {}".format(aws_profile, path), True)
        sys.exit()



In [2]:
# aws_crentials confirmation

# get_aws_credentials()
# print(aws_profile)
# print(aws_access_key_id)
# print(aws_secret_access_key)

In [33]:
import boto3

aws_profile = "house-good-job_john"
aws_access_key_id = None
aws_secret_access_key = None

session = boto3.Session(profile_name=aws_profile)
s3_client = session.client('s3')

objects = s3_client.list_objects_v2(Bucket='house-good-job')

for obj in objects['Contents']:
    print(obj['Key'])

clean_data/
clean_data/KEL_model_features_clean.csv
clean_data/NTPC_model_features_clean.csv
clean_data/TPE_model_features_clean.csv


In [37]:
import boto3


aws_profile = "stock-tw_john"
aws_access_key_id = None
aws_secret_access_key = None

session1 = boto3.Session(profile_name=aws_profile)
s3_client1 = session1.client('s3')

objects1 = s3_client1.list_objects_v2(Bucket='stock-tw')

for obj in objects1['Contents']:
    print(obj['Key'])

KeyError: 'Contents'

In [38]:
import pandas as pd
import numpy as np
from sklearn.neural_network import MLPRegressor
from sklearn.metrics import explained_variance_score, mean_absolute_error, mean_squared_error, r2_score, mean_absolute_percentage_error
import joblib, os
import matplotlib.pyplot as plt
import seaborn as sns
import awswrangler as wr
import boto3

class HousePriceModel_aws:

    def __init__(self, c):
        self.cityname = c
        self.modelpath = f'./{self.cityname}/model_mlp_aws.pkl'
        aws_profile = "house-good-job_john"
        session = boto3.Session(profile_name=aws_profile)
        data = wr.s3.read_csv(f's3://house-good-job/clean_data/{self.cityname}_model_features_clean.csv', boto3_session=session)
        # data=pd.read_csv(f'./{self.cityname}_model_features_clean.csv')
        # 單熱編碼
        data_class = pd.get_dummies(data['鄉鎮市區'])
        data_class.columns = ['鄉鎮市區_' + str(x) for x in data_class.columns]
        data = pd.concat([data, data_class], axis = 1)

        # 刪除資料分類用欄位
        data.insert(data.shape[1], 'y', data['單價元平方公尺'])
        data.drop(['單價元平方公尺', 'Unnamed: 0', 'Unnamed: 0.1', 'Unnamed: 0.2', 
        '鄉鎮市區', 'geometry'],axis=1,inplace=True)

        # 低變異過濾
        data = data.loc[:, data.std() > 0]
        data = data.dropna()
        self.feature_count = data.shape[1]

        # 訓練集 & 測試集
        test_data = data.loc[data['交易年份'] >= 111]
        test_data.to_csv(f'./{self.cityname}/test_data.csv')
        train_data =  data.loc[data['交易年份'] < 111]
        
        # 資料標準化
        self.mean = train_data.mean()
        self.std = train_data.std()
        self.train_data = (train_data-self.mean)/self.std

        
    def trainModel(self):
        X_train = np.array(self.train_data.drop('y', axis='columns'))
        y_train = np.array(self.train_data['y'])

        model_mlp = MLPRegressor(random_state=14,max_iter = 400,activation='relu', hidden_layer_sizes=(int(self.feature_count*1/2),int(self.feature_count*1/4)), verbose=True)
        model_mlp.fit(X_train, y_train)
        mlp_score=model_mlp.score(X_train,y_train)

        joblib.dump(model_mlp, f'./{self.cityname}/model_mlp_aws.pkl')
        print(f'model score: {mlp_score}')

    def testModel(self, testfile):
        if os.path.isfile(self.modelpath):
            test_data = pd.read_csv(f'./{self.cityname}/{testfile}.csv')
            test_data.drop(['Unnamed: 0'],axis=1,inplace=True)
            
            test_data = (test_data - self.mean) / self.std
            X_test = np.array(test_data.drop('y', axis='columns'))
            y_test = np.array(test_data['y'])

            
            model_mlp = joblib.load(f'./{self.cityname}/model_mlp_aws.pkl')
            result = model_mlp.predict(X_test)
            fig = plt.figure(figsize=(10,5))
            residuals = (y_test * self.std['y'] + self.mean['y'])- (result * self.std['y'] + self.mean['y'])
            pd.DataFrame(residuals).to_csv(f'./{self.cityname}/residuals.csv')

            data1 = pd.DataFrame({'origin':y_test * self.std['y'] + self.mean['y'],'predict':result* self.std['y'] + self.mean['y'],
                                'residual':(y_test * self.std['y'] + self.mean['y']) - (result* self.std['y'] + self.mean['y'])})
            percentage_error = np.mean(np.abs(data1['origin'] - data1['predict'])) / np.mean(data1['origin']) * 100
            data1['residual_abs'] = data1['residual'].abs()
            data1['y10'] = data1['origin'] / 10 - data1['residual_abs']
            data1['y20'] = data1['origin'] / 5 - data1['residual_abs']
            data1['y30'] = data1['origin'] / 3.333 - data1['residual_abs']
            data1.loc[data1['y10'] >= 0, 'y10'] = 1
            data1.loc[data1['y10'] < 0 , 'y10'] = 0
            data1.loc[data1['y20'] >= 0, 'y20'] = 1
            data1.loc[data1['y20'] < 0 , 'y20'] = 0
            data1.loc[data1['y30'] >= 0, 'y30'] = 1
            data1.loc[data1['y30'] < 0 , 'y30'] = 0
            

            print(f'=========={self.cityname}==========')
            print(f'預測房價落在實際房價+-10%內的機率為:{round(data1["y10"].mean(),4)*100}%')
            print(f'預測房價落在實際房價+-20%內的機率為:{round(data1["y20"].mean(),4)*100}%')
            print(f'預測房價落在實際房價+-30%內的機率為:{round(data1["y30"].mean(),4)*100}%')
            print("Model Percentage Error: {:.2f}%".format(percentage_error))

            
            print(f"mean_absolute_error: {mean_absolute_error(y_test, result)}")
            print(f"mean_squared_error: {mean_squared_error(y_test, result)}")
            print(f"explained_variance_score: {explained_variance_score(y_test, result)}")
            print(f"r2_score: {r2_score(y_test, result)}")

        else:
            print('模型尚未訓練，請先訓練模型')

    
    def predictPrice(self, lst):
        if os.path.isfile(self.modelpath):
            test_data = pd.read_csv(f'./{self.cityname}/test_data.csv')
            test_data.drop(['Unnamed: 0', 'y'],axis=1,inplace=True)
            predict_data = pd.DataFrame(np.array(lst)).T
            predict_data.columns = test_data.columns
            # print(predict_data)
            
            mean = self.mean.drop(['y'])
            std = self.std.drop(['y'])
            predict_data = np.array((predict_data - mean) / std)
            # print(predict_data)

            model_mlp = joblib.load(f'./{self.cityname}/model_mlp_aws.pkl')
            result = model_mlp.predict(predict_data)
            print(result)
            result = result * self.std['y'] + self.mean['y']
            return predict_data, result
        else:
            print('模型尚未訓練，請先訓練模型')


ImportError: DLL load failed while importing lib: 找不到指定的程序。

In [2]:
import awswrangler as wr
import boto3
import pandas as pd

session = boto3.Session(profile_name=aws_profile)
data = wr.s3.read_csv(f's3://house-good-job/clean_data/TPE_model_features_clean.csv', boto3_session=session)
print(type(data))

data1 = pd.read_csv(f"./TPE/TPE_model_features_clean.csv")
print(type(data1))

NameError: name 'aws_profile' is not defined

In [6]:
object=HousePriceModel_aws('TPE')
object.trainModel()

Iteration 1, loss = 0.17386338
Iteration 2, loss = 0.10823654
Iteration 3, loss = 0.09667964
Iteration 4, loss = 0.09035189
Iteration 5, loss = 0.08621533
Iteration 6, loss = 0.08304415
Iteration 7, loss = 0.08062693
Iteration 8, loss = 0.07892437
Iteration 9, loss = 0.07736565
Iteration 10, loss = 0.07608540
Iteration 11, loss = 0.07490757
Iteration 12, loss = 0.07414381
Iteration 13, loss = 0.07311098
Iteration 14, loss = 0.07230589
Iteration 15, loss = 0.07159850
Iteration 16, loss = 0.07103474
Iteration 17, loss = 0.07066855
Iteration 18, loss = 0.07008490
Iteration 19, loss = 0.06947589
Iteration 20, loss = 0.06871828
Iteration 21, loss = 0.06837818
Iteration 22, loss = 0.06818227
Iteration 23, loss = 0.06752031
Iteration 24, loss = 0.06706190
Iteration 25, loss = 0.06712825
Iteration 26, loss = 0.06624169
Iteration 27, loss = 0.06601182
Iteration 28, loss = 0.06575563
Iteration 29, loss = 0.06547660
Iteration 30, loss = 0.06512982
Iteration 31, loss = 0.06460045
Iteration 32, los

In [7]:
object=HousePriceModel_aws('KEL')
object.trainModel()

Iteration 1, loss = 0.24558874
Iteration 2, loss = 0.14658499
Iteration 3, loss = 0.12760973
Iteration 4, loss = 0.11830921
Iteration 5, loss = 0.11195783
Iteration 6, loss = 0.10798348
Iteration 7, loss = 0.10472928
Iteration 8, loss = 0.10142735
Iteration 9, loss = 0.09894034
Iteration 10, loss = 0.09685403
Iteration 11, loss = 0.09558511
Iteration 12, loss = 0.09409064
Iteration 13, loss = 0.09323292
Iteration 14, loss = 0.09182332
Iteration 15, loss = 0.09115374
Iteration 16, loss = 0.08941093
Iteration 17, loss = 0.08769877
Iteration 18, loss = 0.08727735
Iteration 19, loss = 0.08658971
Iteration 20, loss = 0.08635791
Iteration 21, loss = 0.08508478
Iteration 22, loss = 0.08430353
Iteration 23, loss = 0.08357082
Iteration 24, loss = 0.08366279
Iteration 25, loss = 0.08181599
Iteration 26, loss = 0.08135391
Iteration 27, loss = 0.08137055
Iteration 28, loss = 0.08084694
Iteration 29, loss = 0.08022249
Iteration 30, loss = 0.07925328
Iteration 31, loss = 0.07897214
Iteration 32, los

In [8]:
object=HousePriceModel_aws('NTPC')
object.trainModel()

Iteration 1, loss = 0.09884922
Iteration 2, loss = 0.07771801
Iteration 3, loss = 0.09220399
Iteration 4, loss = 0.06206811
Iteration 5, loss = 0.05982684
Iteration 6, loss = 0.05968323
Iteration 7, loss = 0.06188078
Iteration 8, loss = 0.05948468
Iteration 9, loss = 0.05939872
Iteration 10, loss = 0.05611902
Iteration 11, loss = 0.05491857
Iteration 12, loss = 0.05428133
Iteration 13, loss = 0.05405875
Iteration 14, loss = 0.05351697
Iteration 15, loss = 0.05216813
Iteration 16, loss = 0.05086524
Iteration 17, loss = 0.04977911
Iteration 18, loss = 0.04902902
Iteration 19, loss = 0.04806497
Iteration 20, loss = 0.04769516
Iteration 21, loss = 0.04736449
Iteration 22, loss = 0.04705446
Iteration 23, loss = 0.04699376
Iteration 24, loss = 0.04655509
Iteration 25, loss = 0.04618543
Iteration 26, loss = 0.04583142
Iteration 27, loss = 0.04575897
Iteration 28, loss = 0.04558076
Iteration 29, loss = 0.04549026
Iteration 30, loss = 0.04540446
Iteration 31, loss = 0.04511380
Iteration 32, los