<a href="https://colab.research.google.com/github/rbdus0715/Machine-Learning/blob/main/competitions/Predict-CO2-Emissions-in-Rwanda/modeling2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd

seed = 2023
pd.set_option('display.max_columns', None)

In [2]:
path = 'drive/MyDrive/'
train = pd.read_csv(path + 'train.csv')
test = pd.read_csv(path + 'test.csv')

## (1) 피처 엔지니어링 - 군집화

In [3]:
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans

def kmeans_clustering(df, cluster_num, max_iter=1000):
    scaler = StandardScaler()
    df_std = pd.DataFrame(scaler.fit_transform(df), columns=df.columns, index=df.index)

    km_model = KMeans(n_clusters=cluster_num, max_iter=max_iter, random_state=666)
    km_model.fit(df_std)

    cluster_df = pd.DataFrame(data=km_model.labels_, columns=['ClusterNo'], index=df.index)

    return cluster_df

In [4]:
emission_series = train.pivot_table(
    index=['year', 'week_no'],
    columns=['longitude', 'latitude'],
    values='emission'
)

emission_series = (emission_series - emission_series.min()) / (emission_series.max() - emission_series.min())
emission_series = emission_series.dropna(axis=1)

In [5]:
df_cluster = emission_series.copy()
df_cluster = df_cluster.T
cluster_num = 8

df_cluster['ClusterNo'] = kmeans_clustering(df=df_cluster, cluster_num=cluster_num, max_iter=10000)



In [6]:
train = train.merge(df_cluster['ClusterNo'].reset_index(), on=['longitude', 'latitude'], how='left').fillna(-1)
test = test.merge(df_cluster['ClusterNo'].reset_index(),on=['longitude','latitude'],how='left').fillna(-1)

## (2) 모델링

In [7]:
# 위도 경도를 하나의 id로 만드는 함수
def get_id(row):
    return int(''.join(filter(str.isdigit, str(row['latitude']))) + ''.join(filter(str.isdigit, str(row['longitude']))))

# 람다함수로 적용하여 모든 위치에 대해 id로 대응
train['id'] = train[['latitude', 'longitude']].apply(lambda row: get_id(row), axis=1)
test['id'] = test[['latitude', 'longitude']].apply(lambda row: get_id(row), axis=1)

# 새로운 유니크한 아이디로 매핑
new_ids = {id_: new_id for new_id, id_ in enumerate((train['id']).unique())}
train['id'] = train['id'].map(new_ids)
test['id'] = test['id'].map(new_ids)

In [8]:
pip install haversine



In [9]:
rwanda_center = (-1.9607, 29.9707)
park_biega = (-1.8866, 28.4518)
kirumba = (-0.5658, 29.1714)
massif = (-2.9677, 28.6469)
lake = (-1.9277, 31.4346)
mbarara = (-0.692, 30.602)
muy = (-2.8374, 30.3346)

In [10]:
from haversine import haversine
import datetime as dt
import numpy as np

def cluster_features(df, cluster_centers):
    # 군집의 중앙 좌표 정보 순회
    for i, cc in enumerate(cluster_centers.values()):
        # 군집의 중앙과 해당 군집 원소들 사이의 거리를 구함
        df[f'cluster_{i}'] = df.apply(lambda x: haversine((x['latitude'], x['longitude']), cc, unit='ft'), axis=1)
    return df

def get_month(row):
    # lambda 함수로 사용됨
    # dataframe의 한 row에 대해 year, week_no 정보를 이용하여 datetime 객체를 만듦
    date = dt.datetime.strptime(f'{row["year"]}-{row["week_no"]+1}-1', "%Y-%W-%w")
    return date.month

def coor_rotation(df):
    df['rot_15_x'] = (np.cos(np.radians(15)) * df['longitude']) + \
                     (np.sin(np.radians(15)) * df['latitude'])

    df['rot_15_y'] = (np.cos(np.radians(15)) * df['latitude']) + \
                     (np.sin(np.radians(15)) * df['longitude'])

    df['rot_30_x'] = (np.cos(np.radians(30)) * df['longitude']) + \
                     (np.sin(np.radians(30)) * df['latitude'])

    df['rot_30_y'] = (np.cos(np.radians(30)) * df['latitude']) + \
                     (np.sin(np.radians(30)) * df['longitude'])
    return df

In [11]:
y = train['emission']

# 피처 엔지니어링
def preprocessing(df):
    cols_save = ['id', 'latitude', 'longitude', 'year', 'week_no', 'Ozone_solar_azimuth_angle', 'ClusterNo']
    df = df[cols_save]

    # 지역, 연도로 그룹화한 후, 특정 칼럼 칼럼의 결측지 채워넣기
    good_col = 'Ozone_solar_azimuth_angle'
    df[good_col] = df.groupby(['id', 'year'])[good_col].ffill().bfill()

    # 위의 값을 아래로 한칸씩 이동시킨 후 널값 제거
    df[f'{good_col}_lag_1'] = df.groupby(['id', 'year'])[good_col].shift(1).fillna(0)

    # 15도, 30도 회전시킨 결과 구하기
    df = coor_rotation(df)

    for col, coors in zip(
        ['dist_rwanda', 'dist_park', 'dist_kirumba', 'dist_massif', 'dist_lake', 'dist_mbarara', 'dist_muy'],
        [rwanda_center, park_biega, kirumba, massif, lake, mbarara, muy]
    ):
        # 리스트 안을 순회하며 거리 계산
        df[col] = df.apply(lambda x: haversine((x['latitude'], x['longitude']), coors, unit='ft'), axis=1)


    # 월 계산
    df['month'] = df[['year', 'week_no']].apply(lambda row: get_month(row), axis=1)
    # 해당 년도가 코로나 유행 기간이었는지
    df['is_covid'] = (df['year'] == 2020) & (df['month'] > 2) | (df['year'] == 2021) & (df['month'] == 1)
    # 봉쇄령 기간이었는지
    df['is_lockdown'] = (df['year'] == 2020) & ((df['month'].isin([3,4])))
    # 코로나 최고점
    df['is_covid_peak'] = (df['year'] == 2020) & ((df['month'].isin([4,5,6])))
    # 코로나 감소세
    df['is_covid_dis_peak'] = (df['year'] == 2021) & ((df['month'].isin([7,8,9])))
    # 공휴일
    df['public_holidays'] = (df['week_no'].isin([0, 51, 12, 30]))

    df.fillna(0, inplace=True)
    return df

In [None]:
train = preprocessing(train)
test = preprocessing(test)

df = pd.concat([train, test], axis=0, ignore_index=True)

# 위도 경도 군집화 (좌표에 따른)
coordinates = df[['latitude', 'longitude']].values
clustering = KMeans(n_clusters=12, max_iter=1000, random_state=seed).fit(coordinates)
cluster_centers = {i: tuple(centroid) for i, centroid in enumerate(clustering.cluster_centers_)}
df = cluster_features(df, cluster_centers)

train = df.iloc[:-len(test), :]
test = df.iloc[:len(test), :]
del df

X = train.drop('id', axis=1)
test = test.drop('id', axis=1)

In [13]:
from sklearn.ensemble import RandomForestRegressor

final_preds = np.zeros(len(test))
train['emission'] = y

rf = RandomForestRegressor(n_estimators=1000, random_state=seed, n_jobs=-1, verbose=1)
rf.fit(X, y)
final_preds = rf.predict(test)

final_preds

array([  3.74822333,   4.06070595,   4.23404496, ..., 124.06861931,
       123.9804968 , 124.00352398])

In [15]:
ss = pd.read_csv(path + 'sample_submission.csv')

In [16]:
ss['emission'] = final_preds

## (3) 후처리

In [18]:
ss['id'] = np.array(train[(train['week_no']<49) & (train['year']==2021)]['id'])
ss['week_no'] = ss.groupby('id').cumcount()

In [19]:
coeffs_pred_cluster = [1.10, #ClusterNo 0
                       1.02, #ClusterNo 1
                       1.10, #ClusterNo 2
                       1.07, #ClusterNo 3
                       1.10, #ClusterNo 4
                       1.05, #ClusterNo 5
                       1.07, #ClusterNo 6
                       1.07, #7
                      ]

test = test.reset_index(drop=True)
for ClusterNo in range(8):
    ss.loc[test['ClusterNo']==ClusterNo, 'emission'] = ss.loc[test['ClusterNo']==ClusterNo, 'emission']*coeffs_pred_cluster[ClusterNo]

In [21]:
ss.loc[test['longitude']==29.321, 'emission'] = train.loc[(train['year']==2021)&(train['week_no']<=48)&(train['longitude']==29.321),'emission'].values

coeff_29222 = 1.10
ss.loc[test['longitude']==29.222, 'emission'] = pd.Series(final_preds).loc[test['longitude']==29.222].values * coeff_29222

ValueError: ignored

In [22]:
coeff_low_values = 0.962

coeff_hi_values = 0.995

ss.loc[(test['ClusterNo'].isin([0,4])), 'emission'] = coeff_hi_values * ss.loc[(test['ClusterNo'].isin([0,4])), 'emission']

ss.loc[(test['ClusterNo'].isin([0,4]))&(test['week_no']<=13), 'emission'] = coeff_low_values * ss.loc[(test['ClusterNo'].isin([0,4]))&(test['week_no']<=13), 'emission']
ss.loc[(test['ClusterNo'].isin([0,4]))&(test['week_no']>=17)&(test['week_no']<=39), 'emission'] = coeff_low_values * ss.loc[(test['ClusterNo'].isin([0,4]))&(test['week_no']>=17)&(test['week_no']<=39), 'emission']
ss.loc[(test['ClusterNo'].isin([0,4]))&(test['week_no']>=44), 'emission'] = coeff_low_values * ss.loc[(test['ClusterNo'].isin([0,4]))&(test['week_no']>=44), 'emission']

In [23]:
test['id'] = test[['latitude', 'longitude']].apply(lambda row: get_id(row), axis=1)
test['id'] = test['id'].map(new_ids)

ss.loc[(test['ClusterNo'].isin([0,4]))&(test['week_no']==13), 'emission'] = np.nan
ss = ss.fillna(method='bfill')

ss.loc[(test['ClusterNo'].isin([0,4]))&(test['week_no']==17), 'emission'] = np.nan
ss = ss.fillna(method='ffill')

ss.loc[(test['ClusterNo'].isin([0,4]))&(test['week_no']==39), 'emission'] = np.nan
ss = ss.fillna(method='ffill')

ss.loc[(test['ClusterNo'].isin([0,4]))&(test['week_no']==44), 'emission'] = np.nan
ss = ss.fillna(method='bfill')

In [24]:
ss.drop(['id', 'week_no'], axis=1, inplace=True)
ss.to_csv('submission.csv', index=False)