- Spam Mail 구분
- Overview + 감독 + 주연배우 가지고 cosine_similarity

1. 데이터 전처리
2. 텍스트 전처리
3. Feature 벡터화
4. 분류 모델 만들기
5. 예측 및 평가

In [3]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

import re

# spam 데이터

In [42]:
df = pd.read_csv('spam.csv', encoding='latin-1', sep=',')
df = df[['v1','v2']]
df.columns = ['label', 'content']

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   label    5572 non-null   object
 1   content  5572 non-null   object
dtypes: object(2)
memory usage: 87.2+ KB


###1.데이터 전처리, 텍스트 전처리

In [43]:
df.label.value_counts()

ham     4825
spam     747
Name: label, dtype: int64

In [44]:
df.content[0]

'Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...'

In [45]:
df.content.nunique()

5169

In [46]:
df.drop_duplicates(subset=['content'], inplace=True)
df.nunique()

label         2
content    5169
dtype: int64

In [47]:
df.isnull().sum()

label      0
content    0
dtype: int64

In [48]:
df['content'] = df.content.str.replace('[^A-Za-z ]', '').str.lower()

In [49]:
df.content[0]

'go until jurong point crazy available only in bugis n great world la e buffet cine there got amore wat'

###2.Feature 벡터화

In [50]:
tvect = TfidfVectorizer()
X = tvect.fit_transform(df.content)
y = df.label

X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.3, stratify=y, random_state=2021)
X_train.shape, y_train.shape, X_test.shape, y_test.shape

((3618, 8513), (3618,), (1551, 8513), (1551,))

In [51]:
lr = LogisticRegression()
lr.fit(X_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [52]:
preds = lr.predict(X_test)
score = accuracy_score(y_test, preds)
score

0.9509993552546744

# Credit 데이터 + Meta 데이터

### Cosine 유사도를 이용한 영화 추천 시스템

- credit dataset

In [4]:
credit_df = pd.read_csv('credits.csv')
credit_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45476 entries, 0 to 45475
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   cast    45476 non-null  object
 1   crew    45476 non-null  object
 2   id      45476 non-null  int64 
dtypes: int64(1), object(2)
memory usage: 1.0+ MB


In [5]:
credit_df.head(2)

Unnamed: 0,cast,crew,id
0,"[{'cast_id': 14, 'character': 'Woody (voice)',...","[{'credit_id': '52fe4284c3a36847f8024f49', 'de...",862
1,"[{'cast_id': 1, 'character': 'Alan Parrish', '...","[{'credit_id': '52fe44bfc3a36847f80a7cd1', 'de...",8844


1.데이터 전처리

In [6]:
credit_df.crew[0]

'[{\'credit_id\': \'52fe4284c3a36847f8024f49\', \'department\': \'Directing\', \'gender\': 2, \'id\': 7879, \'job\': \'Director\', \'name\': \'John Lasseter\', \'profile_path\': \'/7EdqiNbr4FRjIhKHyPPdFfEEEFG.jpg\'}, {\'credit_id\': \'52fe4284c3a36847f8024f4f\', \'department\': \'Writing\', \'gender\': 2, \'id\': 12891, \'job\': \'Screenplay\', \'name\': \'Joss Whedon\', \'profile_path\': \'/dTiVsuaTVTeGmvkhcyJvKp2A5kr.jpg\'}, {\'credit_id\': \'52fe4284c3a36847f8024f55\', \'department\': \'Writing\', \'gender\': 2, \'id\': 7, \'job\': \'Screenplay\', \'name\': \'Andrew Stanton\', \'profile_path\': \'/pvQWsu0qc8JFQhMVJkTHuexUAa1.jpg\'}, {\'credit_id\': \'52fe4284c3a36847f8024f5b\', \'department\': \'Writing\', \'gender\': 2, \'id\': 12892, \'job\': \'Screenplay\', \'name\': \'Joel Cohen\', \'profile_path\': \'/dAubAiZcvKFbboWlj7oXOkZnTSu.jpg\'}, {\'credit_id\': \'52fe4284c3a36847f8024f61\', \'department\': \'Writing\', \'gender\': 0, \'id\': 12893, \'job\': \'Screenplay\', \'name\': \'A

In [8]:
credit_df['cast'] = credit_df.cast.astype(str).apply(lambda x : eval(x))
credit_df['crew'] = credit_df.crew.astype(str).apply(lambda x : eval(x))

In [10]:
drop_main = []

for i in credit_df.index:
    try:   
        actor = credit_df.loc[i, 'cast'][0]['name']
        director = credit_df.loc[i, 'crew'][0]['name']
        credit_df.loc[i, 'actor'] = actor
        credit_df.loc[i, 'director'] = director
    except:
        drop_main.append(i)

In [11]:
credit_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45476 entries, 0 to 45475
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   cast      45476 non-null  object
 1   crew      45476 non-null  object
 2   id        45476 non-null  int64 
 3   actor     42708 non-null  object
 4   director  42708 non-null  object
dtypes: int64(1), object(4)
memory usage: 1.7+ MB


In [12]:
credit_df = credit_df[['id','director','actor']]
credit_df.dropna(inplace=True)
credit_df.reset_index(drop=True, inplace=True)
credit_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 42708 entries, 0 to 42707
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   id        42708 non-null  int64 
 1   director  42708 non-null  object
 2   actor     42708 non-null  object
dtypes: int64(1), object(2)
memory usage: 1001.1+ KB


In [13]:
credit_df.head()

Unnamed: 0,id,director,actor
0,862,John Lasseter,Tom Hanks
1,8844,Larry J. Franco,Robin Williams
2,15602,Howard Deutch,Walter Matthau
3,31357,Forest Whitaker,Whitney Houston
4,11862,Alan Silvestri,Steve Martin


- meta dataset

In [14]:
meta_df = pd.read_csv('movies_metadata.csv', low_memory=False)
meta_df = meta_df[['id', 'overview','title']]
meta_df['id'] = meta_df.id.astype(str).apply(lambda x : re.sub('[^0-9]+', '', x))
meta_df['id'] = meta_df.id.astype(int)
meta_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45466 entries, 0 to 45465
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   id        45466 non-null  int64 
 1   overview  44512 non-null  object
 2   title     45460 non-null  object
dtypes: int64(1), object(2)
memory usage: 1.0+ MB


In [15]:
meta_df.isnull().sum()

id            0
overview    954
title         6
dtype: int64

In [16]:
meta_df.head(2)

Unnamed: 0,id,overview,title
0,862,"Led by Woody, Andy's toys live happily in his ...",Toy Story
1,8844,When siblings Judy and Peter discover an encha...,Jumanji


- credit dataset과 meta dataset의 병합

In [69]:
df = pd.merge(left=credit_df, right=meta_df, on='id')
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 42768 entries, 0 to 42767
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   id        42768 non-null  int64 
 1   director  42768 non-null  object
 2   actor     42768 non-null  object
 3   overview  41928 non-null  object
 4   title     42765 non-null  object
dtypes: int64(1), object(4)
memory usage: 2.0+ MB


In [70]:
df.dropna(inplace=True)     # how='any' 가 디폴트
df.reset_index(drop=True, inplace=True)
df = df.head(20000)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20000 entries, 0 to 19999
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   id        20000 non-null  int64 
 1   director  20000 non-null  object
 2   actor     20000 non-null  object
 3   overview  20000 non-null  object
 4   title     20000 non-null  object
dtypes: int64(1), object(4)
memory usage: 781.4+ KB


In [71]:
# overview 컬럼에 director와 주연배우 이름을 add
df['overview'] = df.apply(lambda x : x['director'] + ' ' + x['actor'] + ' ' + x['overview'], axis=1)

In [72]:
df[['director','actor','overview']].tail()

Unnamed: 0,director,actor,overview
19995,Joan Micklin Silver,Shelley Duvall,"Joan Micklin Silver Shelley Duvall Bernice, a ..."
19996,Stacy Peralta,Tony Alva,Stacy Peralta Tony Alva When six teenage boys ...
19997,Emmanuel Lubezki,Ben Affleck,Emmanuel Lubezki Ben Affleck After falling in ...
19998,Brian Helgeland,Chadwick Boseman,Brian Helgeland Chadwick Boseman The powerful ...
19999,Veikko Aaltonen,Esko Salminen,Veikko Aaltonen Esko Salminen Film by Veikko A...


### TDM 변환

In [73]:
from sklearn.feature_extraction.text import TfidfVectorizer

tvect = TfidfVectorizer(stop_words='english')
tfidf_matrix = tvect.fit_transform(df.overview)
tfidf_matrix.shape

(20000, 56077)

- 영화의 타이틀과 인덱스를 가진 시리즈

In [74]:
indices = pd.Series(df.index, index=df.title).drop_duplicates()
indices.head()

title
Toy Story                      0
Jumanji                        1
Grumpier Old Men               2
Waiting to Exhale              3
Father of the Bride Part II    4
dtype: int64

In [75]:
indices['Jumanji']

1

### 코사인 유사도를 이용해서 유사한 영화를 찾는 함수

In [76]:
from sklearn.metrics.pairwise import linear_kernel
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)

In [77]:
cosine_sim.shape

(20000, 20000)

In [78]:
cosine_sim[1, :5]

array([0.0159842 , 1.        , 0.04614783, 0.022001  , 0.04205539])

In [79]:
def get_recommendations(title, cosine_sim=cosine_sim):
    # 선택한 영화의 타이틀로부터 해당되는 인덱스를 받아옵니다. 이제 선택한 영화를 가지고 연산할 수 있습니다.
    idx = indices[title]

    # 모든 영화에 대해서 해당 영화와의 유사도를 구합니다.
    sim_scores = list(enumerate(cosine_sim[idx]))

    # 유사도에 따라 영화들을 정렬합니다.
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)

    # 가장 유사한 10개의 영화를 받아옵니다.
    sim_scores = sim_scores[1:11]

    # 가장 유사한 10개의 영화의 인덱스를 받아옵니다.
    movie_indices = [i[0] for i in sim_scores]

    # 가장 유사한 10개의 영화의 제목을 리턴합니다.
    return df.title.iloc[movie_indices]

In [80]:
get_recommendations('The Dark Knight Rises')

12273                            The Dark Knight
148                               Batman Forever
1298                              Batman Returns
15172                 Batman: Under the Red Hood
578                                       Batman
9954                               Batman Begins
9083          Batman Beyond: Return of the Joker
17526                           Batman: Year One
19130    Batman: The Dark Knight Returns, Part 1
3038                Batman: Mask of the Phantasm
Name: title, dtype: object

In [81]:
get_recommendations('Casino')

16355                                      Italianamerican
17190                                        Street Scenes
1141                                            GoodFellas
7899                                    My Voyage to Italy
12526                                       Finding Amanda
5770                                    The King of Comedy
2931                                          Midnight Run
10679    A Personal Journey with Martin Scorsese Throug...
17183            American Boy: A Profile of: Steven Prince
18823                                     Dolan's Cadillac
Name: title, dtype: object

In [87]:
df.title[3000:3020]

3000                                     Any Given Sunday
3001                                      Man on the Moon
3002                                         Galaxy Quest
3003                              The Talented Mr. Ripley
3004                                          Next Friday
3005                                        The Hurricane
3006                                       Angela's Ashes
3007                                  Play It to the Bone
3008                                                Titus
3009    Mr. Death: The Rise and Fall of Fred A. Leucht...
3010                                    The Third Miracle
3011                                              Montana
3012                               Snow Falling on Cedars
3013                                    Girl, Interrupted
3014                                                Trans
3015                                          My Dog Skip
3016                                            Supernova
3017          

In [86]:
get_recommendations('The NeverEnding Story')

123      The Neverending Story III: Escape from Fantasia
2005          The Neverending Story II: The Next Chapter
6486                                           I, Madman
18288                                      Devil's Diary
7033                                  Allegro non troppo
14496                                         Me and Him
14424                                    Garbage Warrior
7032                             The Diary of Anne Frank
8996                                   A Murder of Crows
5622                                   Saturday the 14th
Name: title, dtype: object