In [13]:
import pandas as pd
import numpy as np
from hmmlearn.hmm import CategoricalHMM

from sklearn.model_selection import train_test_split

import pymongo

In [14]:
# Base path for the data files.
# NOTE(review): machine-specific absolute Windows path; consider deriving it
# from a configurable, relative data directory so the notebook runs elsewhere.
ABSOLUTE_PATH = "C:\\Users\\rudnf\\vscode\\Graduation\\Data_Preprocessing\\data\\"
SEASONS = ['spring', 'summer', 'fall', 'winter']
# Zero-padded month codes "01".."12" (the original list repeated "12" and
# omitted "11").
MONTHS = ["01", "02", "03", "04", "05", "06", "07", "08", "09", "10", "11", "12"]

# Cluster counts; the driver loop builds and stores one model per value.
KS = [25, 50, 75, 100]

# Connection string passed to pymongo.MongoClient by the DB helpers.
MONGO_URI = "mongodb://localhost:27017/"


In [15]:
def season_match(season):
    """Return the zero-padded month codes belonging to ``season``.

    Args:
        season: One of "spring", "summer", "fall" or "winter".

    Returns:
        A new list with the three month codes of the season, or an empty
        list when ``season`` is not one of the four known names.
    """
    season_table = (
        ("spring", ("03", "04", "05")),
        ("summer", ("06", "07", "08")),
        ("fall", ("09", "10", "11")),
        ("winter", ("12", "01", "02")),
    )

    matched = []
    # Scan every entry (no early exit) so the result mirrors a sequence of
    # independent equality checks.
    for name, codes in season_table:
        if season == name:
            matched = list(codes)
    return matched

In [16]:
def label_sequence(POI_sequence, time_period_sequence):
    """Encode each (POI, time period) pair of a trajectory as one integer.

    The label is ``cluster_id * 4 + period_code`` where ``cluster_id`` is the
    number left after stripping "POI" from the POI name and ``period_code``
    is 0 for 'dawn', 1 for 'morning', 2 for 'afternoon' and 3 for 'night'.
    Any other time-period value is encoded as 0.

    Args:
        POI_sequence: Sequence of POI names such as "POI12".
        time_period_sequence: Sequence of time-period names, indexed in
            parallel with ``POI_sequence``.

    Returns:
        A list of integer labels, one per element of ``POI_sequence``.
    """
    period_codes = {'dawn': 0, 'morning': 1, 'afternoon': 2, 'night': 3}

    labels = []
    for position, poi_name in enumerate(POI_sequence):
        cluster_id = int(poi_name.replace('POI', ''))
        # Unrecognised periods fall back to code 0.
        period_code = period_codes.get(time_period_sequence[position], 0)
        labels.append(cluster_id * 4 + period_code)
    return labels

In [17]:
import ast
def df_str_to_list(df):
    """Decode string-encoded Python literals in the trajectory columns.

    The columns 'trajectory_id', 'POI_sequence' and 'time_period_sequence'
    are converted in place with ``ast.literal_eval``. Values that are not
    strings (for example lists that are already decoded) are left untouched.

    All three columns are parsed before any of them is written back, so a
    parse failure leaves ``df`` unmodified instead of half-converted.

    Args:
        df: DataFrame containing the three columns listed above.

    Returns:
        The same ``df`` object with the columns decoded, or ``None`` (after
        printing a message) when a string value is not a valid literal.

    Raises:
        KeyError: If one of the expected columns is missing.
    """
    columns = ['trajectory_id', 'POI_sequence', 'time_period_sequence']

    def _parse(value):
        # literal_eval only accepts strings/AST nodes; pass decoded values through.
        return ast.literal_eval(value) if isinstance(value, str) else value

    try:
        parsed = {column: df[column].apply(_parse) for column in columns}
    except (ValueError, SyntaxError):
        print("Invalid input string format.")
        return None

    for column, values in parsed.items():
        df[column] = values
    return df
        

In [18]:
def calculate_initial_probability(X, n_components, smoothing_factor=1.0):
    """Estimate the start-state distribution with additive smoothing.

    Args:
        X: 2-D array whose column 1 holds the start POI name of each
            trajectory (e.g. "POI3"); the number after "POI" is used as the
            state index and must be smaller than ``n_components``.
        n_components: Number of hidden states.
        smoothing_factor: Pseudo-count added to every state (1.0 gives
            Laplace smoothing).

    Returns:
        1-D array of length ``n_components`` that sums to 1 for any positive
        ``smoothing_factor``.
    """
    start_count = np.zeros(n_components)

    # X[:, 1] : 'start_point'
    for value in X[:, 1]:
        start_count[int(value.replace("POI", ""))] += 1

    total = np.sum(start_count)

    # Additive smoothing: the denominator must grow by the same pseudo-count
    # that is added to each of the n_components numerators, otherwise the
    # result only normalises when smoothing_factor == 1.
    return (start_count + smoothing_factor) / (total + smoothing_factor * n_components)


In [19]:
def calculate_transition_probability(X, n_components, smoothing_factor=1.0):
    """Estimate the state transition matrix with additive smoothing.

    Args:
        X: 2-D array whose column 3 holds, per trajectory, a sequence of POI
            names (e.g. ["POI0", "POI4"]); the number after "POI" is used as
            the state index and must be smaller than ``n_components``.
        n_components: Number of hidden states.
        smoothing_factor: Pseudo-count added to every transition (1.0 gives
            Laplace smoothing).

    Returns:
        Array of shape ``(n_components, n_components)`` in which every row
        sums to 1 for any positive ``smoothing_factor``.
    """
    transition_count = np.zeros((n_components, n_components))

    # Count consecutive POI pairs.
    # X[:, 3] : 'POI_sequence'
    for trajectory in X[:, 3]:
        poi_ids = [int(poi.replace('POI', '')) for poi in trajectory]
        for cur_POI_num, next_POI_num in zip(poi_ids, poi_ids[1:]):
            transition_count[cur_POI_num, next_POI_num] += 1

    row_totals = transition_count.sum(axis=1, keepdims=True)

    # Additive smoothing: scale the denominator by smoothing_factor so each
    # row stays normalised for factors other than 1.
    return (transition_count + smoothing_factor) / (row_totals + smoothing_factor * n_components)


In [20]:
def calculate_emission_probability(X, n_components, n_features, smoothing_factor=1.0):
    """Estimate the emission matrix with additive smoothing.

    Args:
        X: 2-D array whose column 3 holds a sequence of POI names per
            trajectory and whose column 5 holds the parallel sequence of
            integer observation labels. The number after "POI" is the state
            index (< ``n_components``); each label must be < ``n_features``.
        n_components: Number of hidden states.
        n_features: Number of distinct observation symbols.
        smoothing_factor: Pseudo-count added to every (state, symbol) cell
            (1.0 gives Laplace smoothing).

    Returns:
        Array of shape ``(n_components, n_features)`` in which every row sums
        to 1 for any positive ``smoothing_factor``.
    """
    emission_count = np.zeros(shape=(n_components, n_features))

    # X[:, 3] : 'POI_sequence', X[:, 5] : 'l_sequence'
    for poi_seq, label_seq in zip(X[:, 3], X[:, 5]):
        for poi_name, label in zip(poi_seq, label_seq):
            emission_count[int(poi_name.replace('POI', ''))][label] += 1

    row_totals = emission_count.sum(axis=1, keepdims=True)

    # Additive smoothing: scale the denominator by smoothing_factor so each
    # row stays normalised for factors other than 1.
    return (emission_count + smoothing_factor) / (row_totals + smoothing_factor * n_features)


In [21]:
def _get_collection_data(mongo_uri, db_name, collection_name):
    """Fetch every document of a MongoDB collection as a list of dicts.

    Args:
        mongo_uri: Connection string passed to ``pymongo.MongoClient``.
        db_name: Name of the database.
        collection_name: Name of the collection inside ``db_name``.

    Returns:
        A list with all documents of the collection, with the ``_id`` field
        excluded by the projection.
    """
    client = pymongo.MongoClient(mongo_uri)
    try:
        collection = client[db_name][collection_name]
        # Materialise the cursor before the client is closed.
        return list(collection.find({}, {"_id": 0}))
    finally:
        # Release the connection even when the query raises.
        client.close()


In [22]:
def get_trajectory(k):
    """Load the trajectories stored for cluster count ``k``.

    Reads the collection ``trajectory_total_cluster_<k>`` of the "total"
    database through ``_get_collection_data``.

    Args:
        k: Cluster count used in the collection name.

    Returns:
        A DataFrame built from the fetched documents.
    """
    records = _get_collection_data(
        MONGO_URI,
        "total",
        f"trajectory_total_cluster_{k}",
    )
    frame = pd.DataFrame(records)
    return frame
    

In [23]:
def model_to_DB(hmm_model, k):
    """Store the HMM parameters for cluster count ``k`` in MongoDB.

    Inserts one document into the "hmm_model" collection of the
    "total_model" database, with the model name ``total_cluster_<k>_model``
    and the start, transition and emission probabilities as nested lists.

    Every call inserts a new document, so re-running for the same ``k`` adds
    another document with the same ``model_name``.

    Args:
        hmm_model: Object exposing ``startprob_``, ``transmat_`` and
            ``emissionprob_`` arrays.
        k: Cluster count used in the stored model name.
    """
    db_name = "total_model"
    collection_name = "hmm_model"

    # Serialise first so a malformed model fails before a client is created.
    model_params = {
        "startprob": hmm_model.startprob_.tolist(),
        "transmat": hmm_model.transmat_.tolist(),
        "emissionprob": hmm_model.emissionprob_.tolist(),
    }

    # Connect to MongoDB
    client = pymongo.MongoClient(MONGO_URI)
    try:
        collection = client[db_name][collection_name]

        # Save the model to MongoDB
        collection.insert_one({"model_name": f"total_cluster_{k}_model", "model_params": model_params})
    finally:
        # Close the connection even when the insert raises.
        client.close()

In [26]:
# Build one count-based HMM per cluster count in KS and store it in MongoDB.
# No fit() is called: the start, transition and emission matrices are
# computed from trajectory counts and assigned to the model directly.
for k in KS:
    print(f"K : {k}")
    # Number of time periods
    TIME_PERIOD = 4
    # Number of clusters, i.e. the number of hidden states
    n_components = k

    # One observation symbol per (cluster, time period) pair.
    n_features = n_components * TIME_PERIOD

    trajectories = get_trajectory(k)
    # NOTE(review): df_str_to_list returns None on a parse failure, in which
    # case the indexing on the next statement raises a TypeError.
    trajectories = df_str_to_list(trajectories)

    # Encode each trajectory's (POI, time period) pairs as integer labels.
    trajectories['l_sequence'] = [np.array(label_sequence(path, time_period)) for path, time_period in trajectories[['POI_sequence', 'time_period_sequence']].values]

    # The probability helpers index X by position, so this column order matters:
    # 1 = 'start_point', 3 = 'POI_sequence', 5 = 'l_sequence'.
    select_columns = ['trajectory_id', 'start_point', 'end_point', 'POI_sequence', 'time_period_sequence', 'l_sequence']
    X = np.array(trajectories[select_columns])

    hmm_model = CategoricalHMM(n_components=n_components, n_features=n_features)

    hmm_model.startprob_ = calculate_initial_probability(X, n_components)
    hmm_model.transmat_ = calculate_transition_probability(X, n_components)
    hmm_model.emissionprob_ = calculate_emission_probability(X, n_components, n_features)
    
    # Inserts a new document on every run.
    model_to_DB(hmm_model, k)


K : 25
K : 50
K : 75
K : 100
