In [2]:
from sklearn import feature_extraction
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
import numpy as np
import os
from pprint import pprint

%matplotlib tk
import matplotlib.pylab as plt

import nltk
nltk.download("stopwords")
nltk.download('wordnet')

from nltk.corpus import stopwords

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\SEHWA\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\SEHWA\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [3]:
#트룹 읽어오기
path_dir = 'E:/공모전/TropeFlim'
file_list = os.listdir(path_dir)

mydoclist = []

for item in file_list:
    mydoclist.append(item)
    
trope_list = []

for item in mydoclist:
    f = open("E:/공모전/Trope/" + item , 'r')
    temp = ""
    while True:
        line = f.readline()
        line = line.replace('\n', '')
        if not line : break
        else : 
            temp += line + " "
    trope_list.append(temp)

In [4]:
#각 영화의 트룹들 토큰화 & 불용어 제거를 위해 소문자화
movie_tropes = []
for i in range(0, len(trope_list)):
    trope_word = trope_list[i].lower().split(' ')
    movie_tropes.append(trope_word)

In [5]:
#텍스트(영어) 전처리(Lemmatization 음소표기법)
from nltk.stem import WordNetLemmatizer

wordnet_lemmatizer = WordNetLemmatizer()

words = []
for i in range(0, len(movie_tropes)):
    words.append([wordnet_lemmatizer.lemmatize(w) for w in movie_tropes[i]])

In [6]:
#불용어 제거
for i in range(0, len(words)):
    words[i] = [w for w in words[i] if not w in stopwords.words('english')]

In [7]:
#토큰화된 단어들 붙이기
movie_list = []
for i in range(0, len(words)):
    trope = ""
    for j in range(0, len(words[i])):
        trope += words[i][j] + " "
    movie_list.append(trope)

In [8]:
#TF-IDF Matrix 생성
tfidf_vectorizer = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf_vectorizer.fit_transform(movie_list)

#numpy array로 변환 (sparse matrix 이므로 todense() 함수 호출)
X = np.array(tfidf_matrix.todense())

In [9]:
#PCA Scree Plot으로 주성분 수 탐색(임의로 주성분 6개로 탐색)
from sklearn.decomposition import PCA

pca = PCA(n_components=6)
pca.fit(X)

Z = pca.transform(X)

plt.plot(pca.explained_variance_ratio_)
plt.xlabel('number of components')
plt.ylabel('cumulative explained variance')
plt.show()

In [10]:
#주성분 수 2개로 PCA
pca = PCA(n_components=2)
pca.fit(X)

Z = pca.transform(X)

In [12]:
#K-means Clustering Scree Plot으로 클러스터 수 탐색
from scipy.spatial.distance import cdist
from sklearn.cluster import KMeans

distortions = []
K = range(1,10)
for k in K:
    kmeanModel = KMeans(n_clusters=k).fit(Z)
    kmeanModel.fit(Z)
    distortions.append(sum(np.min(cdist(Z, kmeanModel.cluster_centers_, 'euclidean'), axis=1)) / Z.shape[0])

# Plot the elbow
plt.plot(K, distortions, 'bx-')
plt.xlabel('k')
plt.ylabel('Distortion')
plt.title('The Elbow Method showing the optimal k')
plt.show()

In [13]:
#K-means Clustering
num_clusters = 4

km = KMeans(n_clusters=num_clusters)
km.fit(Z)

KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=300,
    n_clusters=4, n_init=10, n_jobs=1, precompute_distances='auto',
    random_state=None, tol=0.0001, verbose=0)

In [14]:
#시각화
import pandas as pd

results = pd.DataFrame(Z)
results['category'] = km.labels_
results

colormap = { 0: 'red', 1: 'green', 2: 'blue', 3: 'purple'}
colors = results.apply(lambda row: colormap[row.category], axis=1)
ax = results.plot(kind='scatter', x=0, y=1, alpha=0.1, s=300, c=colors)

In [15]:
#불필요한 트룹 뽑아내기
read_tfidf_matrix = pd.read_csv('tfidf_matrix.csv', engine='python')

pca = PCA(n_components=2, svd_solver='full')
pca.fit(read_tfidf_matrix)

xvector = pca.components_[0] # see 'prcomp(my_data)$rotation' in R
yvector = pca.components_[1]

xs = pca.transform(read_tfidf_matrix)[:,0] # see 'prcomp(my_data)$x' in R
ys = pca.transform(read_tfidf_matrix)[:,1]



#biplot 그리기
for i in range(len(xvector)):
# arrows project features (ie columns from csv) as vectors onto PC axes
    plt.arrow(0, 0, xvector[i]*max(xs), yvector[i]*max(ys),
              color='r', width=0.0005, head_width=0.0025)
    plt.text(xvector[i]*max(xs)*1.2, yvector[i]*max(ys)*1.2,
             list(read_tfidf_matrix.columns.values)[i], color='r')

for i in range(len(xs)):
# circles project documents (ie rows from csv) as points onto PC axes
    plt.plot(xs[i], ys[i], 'bo')
    plt.text(xs[i]*1.2, ys[i]*1.2, list(read_tfidf_matrix.index)[i], color='b')

plt.show()

In [18]:
components = pd.DataFrame(pca.components_, columns = read_tfidf_matrix.columns, index=[0, 1])
components

components.to_excel('E:/공모전/PCA Components.xlsx', sheet_name='sheet1')