# 함수화한 모델

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.cluster import KMeans
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics.pairwise import cosine_similarity
import warnings

warnings.filterwarnings(action = 'ignore')

## 1. 데이터 로드 및 전처리

### 1.1. 데이터 로드

In [2]:
tmp_df = pd.read_csv('data.csv') # 1차 카테고리 : 5, 2차 카테고리 : 9
idx = tmp_df['동이름']
col = tmp_df.columns.tolist()[1:]
data = tmp_df[col].values
tmp_df.set_index('동이름',inplace=True)
tmp_df

Unnamed: 0_level_0,A,B,C,D,E,a,b,c,d,e,f,g,h,i
동이름,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
사직동,0.6,0.1,0.8,0.4,0.8,0.5,0.4,0.7,0.2,0.5,0.7,0.1,0.7,0.7
삼청동,0.8,0.1,0.6,0.9,0.6,1.0,0.3,0.5,0.1,0.1,0.2,0.6,0.2,0.5
부암동,0.7,0.7,0.4,0.4,0.5,0.5,0.2,0.5,0.4,0.6,0.3,0.6,1.0,0.9
평창동,0.2,0.3,0.6,0.8,0.5,0.8,0.1,0.6,1.0,0.5,0.5,0.4,0.1,0.6
무악동,0.9,0.8,0.2,1.0,0.4,0.7,0.5,0.8,0.7,0.7,0.8,0.8,0.6,0.9
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
성내3동,0.1,0.7,0.3,0.7,0.4,0.7,0.2,0.7,0.2,0.5,0.1,0.1,0.9,0.1
둔촌1동,0.1,0.9,0.7,0.8,0.6,0.4,0.3,1.0,0.8,0.4,0.5,0.1,0.6,0.7
둔촌2동,0.2,0.1,0.3,0.7,0.6,0.9,0.8,0.8,0.7,0.4,0.9,0.9,0.4,1.0
상일1동,0.7,0.8,0.4,0.9,0.9,0.8,0.3,0.1,0.5,0.1,0.5,0.1,0.1,0.5


### 1.2 유저 데이터 입력 및 스케일링

In [3]:
user = [6,1,8,4,8,1,0,1,0,1,1,0,1,1]
# user1 = [8,1,6,9,6,0,0,1,0,0,0,1,0,1]

In [4]:
def user_transform(user): # min_Max Scaling 함수
    user_min = min(user)
    user_max = max(user)
    
    user_scaled = []
    for u in user:
        x = (u - user_min) / (user_max - user_min)
        user_scaled.append(x)
    return user_scaled

In [5]:
user_scaled = user_transform(user)

### 1.3 가중치 matrix

In [6]:
# 임시 가중치 matrix 생성
weight = [[1,0,0,0,0,0,0.2,0.2,0,0,0,0.5,0,0.3],
          [0,1,0.3,0,0,0,0.7,0,0,0,0,0,0.1,0],
          [0,0,1,0,0,0,0.3,0,0,0,0,0,0,0],
          [0,0,0,1,0,0,0,0,0.2,0,0,0.1,0,0],
          [0,0,0,0,1,0,0,0,0,0,0.6,0,0,0],
          [0,0,0,0,0,1,0,0,0,0,0,0,0,0],
          [0,0,0.6,0,0,0,1,0,0.3,0,0,0,0,0],
          [0,0.2,0,0,0,0,0,1,0,0,0,0.7,0,0],
          [0,0,0,0,0,0,0,0,1,0,0,0.4,0,0],
          [0,0,0,0.3,0,0,0,0,0,1,0,0.2,0,0],
          [0,0,0,0,0.5,0,0,0,0,0,1,0,0,0],
          [0,0,0,0.8,0,0.3,0,0,0,0,0,1,0,0],
          [0,0,0,0,0.6,0,0,0,0.2,0,0,0,1,0],
          [0,0,0.3,0,0,0,0,0.6,0,0,0,0,0,1]]

## 2. 군집화

### 2.1 군집화 및 분류 결과 확인

In [7]:
kmeans = KMeans(n_clusters=10, init='k-means++',max_iter=300,random_state=0)
kmeans.fit(data)

In [8]:
kmeans.labels_
df = tmp_df.copy()
df['km_cluster'] = kmeans.labels_
df.groupby('km_cluster').count()

Unnamed: 0_level_0,A,B,C,D,E,a,b,c,d,e,f,g,h,i
km_cluster,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
0,29,29,29,29,29,29,29,29,29,29,29,29,29,29
1,62,62,62,62,62,62,62,62,62,62,62,62,62,62
2,42,42,42,42,42,42,42,42,42,42,42,42,42,42
3,45,45,45,45,45,45,45,45,45,45,45,45,45,45
4,48,48,48,48,48,48,48,48,48,48,48,48,48,48
5,33,33,33,33,33,33,33,33,33,33,33,33,33,33
6,41,41,41,41,41,41,41,41,41,41,41,41,41,41
7,43,43,43,43,43,43,43,43,43,43,43,43,43,43
8,36,36,36,36,36,36,36,36,36,36,36,36,36,36
9,47,47,47,47,47,47,47,47,47,47,47,47,47,47


### 2.2 사용자 분류

In [9]:
def user(user_scaled,user_name):
    user_df = pd.DataFrame(columns=col,index=[user_name])
    user_df.loc[user_name] = user_scaled
    cluster_user = kmeans.predict(user_df)
    user_df['km_cluster'] = cluster_user
    return user_df

In [10]:
user_df = user(user_scaled,'user')
user_df

Unnamed: 0,A,B,C,D,E,a,b,c,d,e,f,g,h,i,km_cluster
user,0.75,0.125,1.0,0.5,1.0,0.125,0.0,0.125,0.0,0.125,0.125,0.0,0.125,0.125,1


In [11]:
# 유저의 군집에 해당하는 지역 추출 함수
def recommand_area(df, user_df, user_name): # 지역 데이터, 유저 데이터, 유저 이름
    rc_area = user_df['km_cluster'].loc[user_name] # 해당 유저의 군집
    rc_seoul = df[df['km_cluster'] == rc_area] # 서울시 내 군집 지역 추출
    rc_list = rc_seoul.index.tolist() # 추천 리스트
    return rc_seoul, rc_list

In [12]:
rc_seoul = recommand_area(df,user_df,'user')[0]
rc_list = recommand_area(df,user_df,'user')[1] 

In [13]:
rc_list # 'user'가 해당하는 군집에 속한 서울 동 리스트

['삼청동',
 '혜화동',
 '명동',
 '필동',
 '약수동',
 '동화동',
 '이태원2동',
 '원효로1동',
 '성수1가2동',
 '능동',
 '청량리동',
 '중화2동',
 '면목3.8동',
 '인수동',
 '창3동',
 '도봉2동',
 '월계2동',
 '중계1동',
 '녹번동',
 '대조동',
 '응암2동',
 '천연동',
 '홍제3동',
 '남가좌1동',
 '용강동',
 '대흥동',
 '아현동',
 '신월3동',
 '신월6동',
 '등촌1동',
 '공항동',
 '방화1동',
 '구로2동',
 '개봉2동',
 '가리봉동',
 '당산2동',
 '신길3동',
 '신길5동',
 '신길6동',
 '대림3동',
 '영등포본동',
 '상도4동',
 '신대방2동',
 '사당2동',
 '청림동',
 '낙성대동',
 '중앙동',
 '조원동',
 '내곡동',
 '대치4동',
 '도곡2동',
 '일원본동',
 '압구정동',
 '청담동',
 '오륜동',
 '장지동',
 '잠실3동',
 '잠실4동',
 '길동',
 '강일동',
 '암사2동',
 '성내2동']

### 2.3 유사도 측정

In [14]:
def similarity(user_df, df, user_name, num): # 유저 데이터, 유사도 측정을 위한 데이터, 유저 이름, 원하는 순위
    con_data = pd.concat([user_df.loc[[user_name]],df])
    rc_sim = cosine_similarity(con_data,con_data)
    sim_matrix = pd.DataFrame(rc_sim,columns=con_data.index).loc[[0]].T
    rank = sim_matrix[0].sort_values(ascending=False) # 유사도 순서로 정렬
    ranking = rank[1:num+1].index.tolist() # 1~n 위 리스트
    return ranking

In [15]:
ranking = similarity(user_df,df,'user',4)
cluster_rank = similarity(user_df,rc_seoul,'user',4)
cluster_rank

['청림동', '개봉2동', '대치4동', '월계2동']

**겹치는 지역 제거**

In [16]:
def concat_df(rank_1,rank_2): # 두 집단 병합 및 겹침 제거 함수
    tmp_1 = df.loc[rank_1]
    tmp_2 = df.loc[rank_2]
    x = pd.concat([tmp_1,tmp_2])
    x.reset_index(inplace=True)
    x = x.drop_duplicates(['동이름'])
    x.loc[:4]
    x.set_index('동이름',inplace=True)
    del(x['km_cluster'])
    return x

In [17]:
con_df = concat_df(ranking,cluster_rank)
con_df

Unnamed: 0_level_0,A,B,C,D,E,a,b,c,d,e,f,g,h,i
동이름,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
청림동,0.9,0.2,0.9,0.7,0.9,0.5,0.4,0.4,0.4,0.6,0.4,0.3,0.1,0.8
개봉2동,1.0,0.3,1.0,0.7,0.8,0.7,0.1,0.3,0.8,0.3,0.3,0.4,0.3,0.4
대치4동,0.6,0.5,0.5,0.2,1.0,0.2,0.5,0.2,0.1,0.6,0.1,0.2,0.2,0.1
월계2동,0.9,0.1,1.0,0.1,0.8,0.5,0.1,0.4,0.3,0.6,0.9,0.5,0.3,0.2


In [18]:
def weighting(df, weight): # 가중치 부여 된 데이터 프레임 생성 함수
    data = df.values
    w = [0] * 14
    for i in range(len(weight)):
        if(user_df.loc['user'][i] != 0):
            for k in range(len(weight)):
                w[i] += weight[i][k]
    modified = []
    for k in range(len(data)):
        modi = []
        for i in range(len(w)):
            mod = data[k][i] * (1 + w[i])
            modi.append(mod)
        modified.append(modi)
    weighted_df = pd.DataFrame(modified,index=df.index,columns=col)
    return weighted_df

In [19]:
weighted_df = weighting(con_df,weight)
weighted_df

Unnamed: 0_level_0,A,B,C,D,E,a,b,c,d,e,f,g,h,i
동이름,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
청림동,2.88,0.62,2.07,1.61,2.34,1.0,0.4,1.16,0.4,1.5,1.0,0.3,0.28,2.32
개봉2동,3.2,0.93,2.3,1.61,2.08,1.4,0.1,0.87,0.8,0.75,0.75,0.4,0.84,1.16
대치4동,1.92,1.55,1.15,0.46,2.6,0.4,0.5,0.58,0.1,1.5,0.25,0.2,0.56,0.29
월계2동,2.88,0.31,2.3,0.23,2.08,1.0,0.1,1.16,0.3,1.5,2.25,0.5,0.84,0.58


In [20]:
def ranking(weighted_df): # 가중치 데이터 프레임으로 3위까지
    sums = []
    for i in range(len(weighted_df.index)):
        sums.append(weighted_df.iloc[i].sum())
    weighted_df['sum'] = sums
    oh = weighted_df.T.loc['sum'].sort_values(ascending=False)
    x = pd.DataFrame(oh).index.tolist()[0:3]
    return x

In [21]:
x = ranking(weighted_df)
x

['청림동', '개봉2동', '월계2동']