### 使用 boto3 存取 AWS S3 bucket

參考資料：

https://dev.to/gbhorwood/managing-your-aws-resources-with-python-and-boto3-1ceb

https://dev.to/aws-builders/how-to-list-contents-of-s3-bucket-using-boto3-python-47mm

https://boto3.amazonaws.com/v1/documentation/api/latest/guide/session.html

https://www.learnaws.org/2022/09/27/pandas-read-s3/

https://aws-sdk-pandas.readthedocs.io/en/stable/tutorials/002%20-%20Sessions.html

In [3]:


import sys
import boto3

# --- configuration ---

# Name of the AWS profile, i.e. the section header looked up in
# ~/.aws/credentials.
aws_profile = "house-good-job_john"

# Placeholders for the key pair; get_aws_credentials() overwrites them
# with the values read from ~/.aws/credentials for the profile above.
aws_access_key_id = aws_secret_access_key = None

# AWS region identifier.
aws_region = "ap-northeast-1"


# --> NEW
def get_aws_credentials():
    """
    Load the key pair for ``aws_profile`` from ~/.aws/credentials.

    On success the module-level globals ``aws_access_key_id`` and
    ``aws_secret_access_key`` are set to the values found in the
    profile's section. Nothing is returned.

    Raises:
        SystemExit: with exit status 1, after printing a message to
            stderr, when the profile section is absent or when either
            key is missing or empty.
    """
    import configparser
    import os

    # the aws profile we configured (only read here)
    global aws_profile

    # the global variables where we store the aws credentials
    global aws_access_key_id
    global aws_secret_access_key

    # expanduser() resolves the home directory even when $HOME is not
    # set (e.g. on Windows), where os.environ['HOME'] raises KeyError
    path = os.path.join(os.path.expanduser('~'), '.aws', 'credentials')
    config = configparser.ConfigParser()
    # a missing file is silently skipped by read(), leaving no sections,
    # so it is reported below as a missing profile
    config.read(path)

    # if the profile does not exist, error and exit
    if not config.has_section(aws_profile):
        print("Cannot find profile '{}' in {}".format(aws_profile, path),
              file=sys.stderr)
        sys.exit(1)

    # fallback=None turns a missing option into None instead of a
    # KeyError, so an incomplete profile reaches the check below
    key_id = config.get(aws_profile, 'aws_access_key_id', fallback=None)
    secret = config.get(aws_profile, 'aws_secret_access_key', fallback=None)

    # if we don't have both the access and secret key, error and exit
    if not key_id or not secret:
        print("AWS config values not set in '{}' in {}".format(aws_profile, path),
              file=sys.stderr)
        sys.exit(1)

    aws_access_key_id = key_id
    aws_secret_access_key = secret



In [4]:
# AWS credentials check (uncomment to verify the loaded values)

# get_aws_credentials()
# print(aws_profile)
# print(aws_access_key_id)
# print(aws_secret_access_key)

In [5]:
import boto3

# Build the session from the named profile so boto3 looks the
# credentials up itself.
session = boto3.Session(profile_name=aws_profile)
s3_client = session.client('s3')

# A single list_objects_v2 call returns at most 1000 keys, and its
# response has no 'Contents' entry when nothing matches. The paginator
# follows the continuation tokens, and .get() keeps an empty bucket from
# raising KeyError.
paginator = s3_client.get_paginator('list_objects_v2')
for page in paginator.paginate(Bucket='house-good-job'):
    for obj in page.get('Contents', []):
        print(obj['Key'])

clean_data/
clean_data/KEL_model_features_clean.csv
clean_data/NTPC_model_features_clean.csv
clean_data/TPE_model_features_clean.csv


In [10]:
import pandas as pd
import numpy as np
from sklearn.neural_network import MLPRegressor
from sklearn.metrics import explained_variance_score, mean_absolute_error, mean_squared_error, r2_score, mean_absolute_percentage_error
import joblib, os
import matplotlib.pyplot as plt
import seaborn as sns
import awswrangler as wr
import boto3

class HousePriceModel_aws:
    """
    MLP house-price regressor for one city, fed from the project S3 bucket.

    Construction downloads ``clean_data/{cityname}_model_features_clean.csv``
    from the ``house-good-job`` bucket, one-hot encodes the district
    column, splits the rows by transaction year and standardizes the
    training rows. The held-out rows, the fitted model and the residuals
    are written under ``./{cityname}/``.

    Attributes set by ``__init__``:
        cityname: city code given by the caller.
        modelpath: file the fitted model is dumped to / loaded from.
        feature_count: number of columns kept after filtering (the
            target column 'y' is included in the count).
        mean, std: per-column statistics of the training rows.
        train_data: standardized training rows, target in column 'y'.
    """

    def __init__(self, c, aws_profile="house-good-job_john"):
        """
        Download, clean and split the data for city ``c``.

        Args:
            c: city code used in the S3 object name and as the local
                output directory name.
            aws_profile: AWS profile name handed to ``boto3.Session``.

        Side effects:
            Creates ``./{c}/`` if needed and writes ``test_data.csv``
            (the held-out rows, unstandardized) into it.
        """
        self.cityname = c
        self.modelpath = f'./{self.cityname}/model_mlp_aws.pkl'

        session = boto3.Session(profile_name=aws_profile)
        data = wr.s3.read_csv(f's3://house-good-job/clean_data/{self.cityname}_model_features_clean.csv', boto3_session=session)

        # one-hot encoding of the district column
        data_class = pd.get_dummies(data['鄉鎮市區']).add_prefix('鄉鎮市區_')
        data = pd.concat([data, data_class], axis=1)

        # copy the target to the last column as 'y', then drop the raw
        # target, the leftover index columns and the columns that were
        # only used for categorising the data
        data.insert(data.shape[1], 'y', data['單價元平方公尺'])
        data.drop(['單價元平方公尺', 'Unnamed: 0', 'Unnamed: 0.1', 'Unnamed: 0.2',
                   '鄉鎮市區', 'geometry'], axis=1, inplace=True)

        # low-variance filter: keep only columns whose std is > 0
        data = data.loc[:, data.std() > 0]
        data = data.dropna()
        self.feature_count = data.shape[1]

        # training set & test set, split on the transaction-year column
        # (presumably ROC calendar years, 111 = 2022 -- TODO confirm)
        test_data = data.loc[data['交易年份'] >= 111]
        train_data = data.loc[data['交易年份'] < 111]

        # the output directory may not exist on a first run
        os.makedirs(f'./{self.cityname}', exist_ok=True)
        test_data.to_csv(f'./{self.cityname}/test_data.csv')

        # standardization, using statistics of the training rows only
        self.mean = train_data.mean()
        std = train_data.std()
        # A column can vary over the whole data set yet be constant inside
        # the training rows; its std of 0 would turn the division below
        # into NaN/inf, so such columns are scaled by 1 instead.
        self.std = std.mask(std == 0, 1.0)
        self.train_data = (train_data - self.mean) / self.std

    @staticmethod
    def _hit_rate(origin, residual_abs, tolerance):
        """Return the fraction of rows with residual_abs <= origin * tolerance."""
        return float((residual_abs <= origin * tolerance).mean())

    def trainModel(self):
        """
        Fit the MLP on the standardized training rows and persist it.

        The two hidden layers have ``feature_count // 2`` and
        ``feature_count // 4`` units. The fitted model is dumped to
        ``self.modelpath`` and its score on the training data is printed.
        """
        X_train = self.train_data.drop('y', axis='columns').to_numpy()
        y_train = self.train_data['y'].to_numpy()

        model_mlp = MLPRegressor(
            random_state=14,
            max_iter=400,
            activation='relu',
            hidden_layer_sizes=(self.feature_count // 2, self.feature_count // 4),
        )
        model_mlp.fit(X_train, y_train)
        mlp_score = model_mlp.score(X_train, y_train)

        os.makedirs(os.path.dirname(self.modelpath), exist_ok=True)
        joblib.dump(model_mlp, self.modelpath)
        print(f'model score: {mlp_score}')

    def testModel(self, testfile):
        """
        Evaluate the persisted model on ``./{cityname}/{testfile}.csv``.

        Args:
            testfile: CSV file name without the ``.csv`` extension; the
                file must have the layout ``__init__`` writes for
                ``test_data.csv``.

        Prints the share of predictions within 10/20/30% of the actual
        price and a percentage error, both on the original price scale,
        followed by sklearn error metrics computed on the standardized
        values. Writes the de-standardized residuals to
        ``./{cityname}/residuals.csv``. If no model file exists, prints a
        notice and returns None.
        """
        if not os.path.isfile(self.modelpath):
            print('模型尚未訓練，請先訓練模型')
            return

        test_data = pd.read_csv(f'./{self.cityname}/{testfile}.csv')
        test_data.drop(['Unnamed: 0'], axis=1, inplace=True)

        test_data = (test_data - self.mean) / self.std
        X_test = test_data.drop('y', axis='columns').to_numpy()
        y_test = test_data['y'].to_numpy()

        model_mlp = joblib.load(self.modelpath)
        result = model_mlp.predict(X_test)

        # undo the standardization so residuals are in price units
        y_mean, y_std = self.mean['y'], self.std['y']
        origin = y_test * y_std + y_mean
        predict = result * y_std + y_mean
        residuals = origin - predict
        pd.DataFrame(residuals).to_csv(f'./{self.cityname}/residuals.csv')

        data1 = pd.DataFrame({'origin': origin, 'predict': predict, 'residual': residuals})
        percentage_error = np.mean(np.abs(data1['origin'] - data1['predict'])) / np.mean(data1['origin']) * 100
        data1['residual_abs'] = data1['residual'].abs()

        print(f'=========={self.cityname}==========')
        for pct in (10, 20, 30):
            rate = self._hit_rate(data1['origin'], data1['residual_abs'], pct / 100)
            # fixed two-decimal formatting avoids artefacts such as
            # 56.989999999999995% from round(x, 4) * 100
            print(f'預測房價落在實際房價+-{pct}%內的機率為:{rate * 100:.2f}%')
        print("Model Percentage Error: {:.2f}%".format(percentage_error))

        # these metrics are computed on the standardized values
        print(f"mean_absolute_error: {mean_absolute_error(y_test, result)}")
        print(f"mean_squared_error: {mean_squared_error(y_test, result)}")
        print(f"explained_variance_score: {explained_variance_score(y_test, result)}")
        print(f"r2_score: {r2_score(y_test, result)}")

    def predictPrice(self, lst):
        """
        Predict the price for one observation.

        Args:
            lst: feature values in the same order as the training
                columns, without the target.

        Returns:
            ``(predict_data, result)``: the standardized feature array
            that was fed to the model, and the prediction converted back
            to price units. Returns None, after printing a notice, when
            no model file exists.
        """
        if not os.path.isfile(self.modelpath):
            print('模型尚未訓練，請先訓練模型')
            return

        mean = self.mean.drop(['y'])
        std = self.std.drop(['y'])

        # Label the values with the feature names held in the fitted
        # statistics; this avoids re-reading test_data.csv just to recover
        # the column names.
        predict_data = pd.DataFrame(np.array(lst)).T
        predict_data.columns = mean.index
        predict_data = np.array((predict_data - mean) / std)

        model_mlp = joblib.load(self.modelpath)
        result = model_mlp.predict(predict_data)
        print(result)
        result = result * self.std['y'] + self.mean['y']
        return predict_data, result


In [8]:
import awswrangler as wr
import boto3
import pandas as pd

# Read the TPE feature file twice -- once from S3 through awswrangler and
# once from a CSV in the working directory -- and print the type each
# reader hands back.
session = boto3.Session(profile_name=aws_profile)

s3_uri = 's3://house-good-job/clean_data/TPE_model_features_clean.csv'
data = wr.s3.read_csv(s3_uri, boto3_session=session)
print(type(data))

local_csv = "./TPE_model_features_clean.csv"
data1 = pd.read_csv(local_csv)
print(type(data1))

<class 'pandas.core.frame.DataFrame'>
<class 'pandas.core.frame.DataFrame'>


In [12]:
# Build the model for city code 'TPE' (the constructor downloads and
# prepares the data), then fit it and dump it to disk.
# NOTE(review): the name `object` shadows the Python builtin of the same
# name for the rest of the session; consider renaming it (e.g. `tpe_model`)
# once it is confirmed that no other cell relies on it.
object=HousePriceModel_aws('TPE')
object.trainModel()

model score: 0.8768887163456685


