# 1. Spam Mail 구분하기

## 파일 불러오기 및 확인

In [1]:
import numpy as np
import pandas as pd

df = pd.read_csv('spam.csv', encoding='latin1')
df.head(3)

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,


In [2]:
df.v1.value_counts()

ham     4825
spam     747
Name: v1, dtype: int64

In [3]:
df.v2[0][:120], df.v2[2][:120]

('Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...',
 'Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt r')

## 전처리

In [4]:
import re
df['v2'] = df.v2.apply(lambda x: re.sub('[^A-Za-z]', ' ', x))
df.v2[0][:120], df.v2[2][:120]

('Go until jurong point  crazy   Available only in bugis n great world la e buffet    Cine there got amore wat   ',
 'Free entry in   a wkly comp to win FA Cup final tkts   st May       Text FA to       to receive entry question std txt r')

## 학습용, 테스트용 분리

In [5]:
from sklearn.model_selection import  train_test_split

a_tr, a_te, b_tr, b_te = train_test_split(
    df.v2, df.v1, test_size = 0.2,
    stratify = df.v1, random_state = 2021
)
a_tr.shape, a_te.shape, b_tr.shape, b_te.shape

((4457,), (1115,), (4457,), (1115,))

## 텍스트 변환

In [6]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

In [7]:
cvect1 = CountVectorizer(stop_words='english')
cvect1.fit(a_tr)
a_tr_cv1 = cvect1.transform(a_tr)
a_te_cv1 = cvect1.transform(a_te)
a_tr_cv1.shape, a_te_cv1.shape

((4457, 6589), (1115, 6589))

In [8]:
cvect2 = CountVectorizer(stop_words='english', ngram_range=(1,3))
cvect2.fit(a_tr)
a_tr_cv2 = cvect1.transform(a_tr)
a_te_cv2 = cvect1.transform(a_te)
a_tr_cv2.shape, a_te_cv2.shape

((4457, 6589), (1115, 6589))

In [9]:
tvect1 = TfidfVectorizer(stop_words='english')
tvect1.fit(a_tr)
a_tr_tv1 = tvect1.transform(a_tr)
a_te_tv1 = tvect1.transform(a_te)
a_tr_tv1.shape, a_te_tv1.shape

((4457, 6589), (1115, 6589))

In [10]:
tvect2 = TfidfVectorizer(stop_words='english', ngram_range=(1,3))
tvect2.fit(a_tr)
a_tr_tv2 = tvect2.transform(a_tr)
a_te_tv2 = tvect2.transform(a_te)
a_tr_tv2.shape, a_te_tv2.shape

((4457, 50806), (1115, 50806))

## 로지스틱 회귀 모델

In [11]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [12]:
lr1 = LogisticRegression(max_iter=500)
%time lr1.fit(a_tr_cv1, b_tr)

CPU times: user 69.5 ms, sys: 246 µs, total: 69.7 ms
Wall time: 71.5 ms


LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=500,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [13]:
pred1 = lr1.predict(a_te_cv1)
score1 = accuracy_score(b_te, pred1)
score1

0.9757847533632287

In [14]:
lr2 = LogisticRegression(max_iter=500)
%time lr2.fit(a_tr_cv2, b_tr)
pred2 = lr2.predict(a_te_cv2)
score2 = accuracy_score(b_te, pred2)
score2

CPU times: user 74.7 ms, sys: 989 µs, total: 75.7 ms
Wall time: 81.9 ms


0.9757847533632287

In [15]:
lr3 = LogisticRegression(max_iter=500)
%time lr3.fit(a_tr_tv1, b_tr)
pred3 = lr3.predict(a_te_tv1)
score3 = accuracy_score(b_te, pred3)
score3

CPU times: user 94.5 ms, sys: 965 µs, total: 95.5 ms
Wall time: 95.5 ms


0.95695067264574

In [16]:
lr4 = LogisticRegression(max_iter=500)
%time lr4.fit(a_tr_tv2, b_tr)
pred4 = lr4.predict(a_te_tv2)
score4 = accuracy_score(b_te, pred4)
score4

CPU times: user 416 ms, sys: 488 ms, total: 904 ms
Wall time: 530 ms


0.9434977578475336

## Support Vector Machine의 Classifier

In [17]:
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score

In [18]:
svc1 = SVC()
svc1.fit(a_tr_cv1, b_tr)
pred5 = svc1.predict(a_te_cv1)
score5 = accuracy_score(b_te, pred5)
score5

0.9704035874439462

In [19]:
svc2 = SVC()
svc2.fit(a_tr_cv2, b_tr)
pred6 = svc2.predict(a_te_cv2)
score6 = accuracy_score(b_te, pred6)
score6

0.9704035874439462

In [20]:
svc3 = SVC()
svc3.fit(a_tr_tv1, b_tr)
pred7 = svc3.predict(a_te_tv1)
score7 = accuracy_score(b_te, pred7)
score7

0.9730941704035875

In [21]:
svc4 = SVC()
svc4.fit(a_tr_tv2, b_tr)
pred8 = svc4.predict(a_te_tv2)
score8 = accuracy_score(b_te, pred8)
score8

0.9713004484304932

# 2. Overview, 배우 감독으로 추천

In [22]:
!unzip movies_metadata.csv.zip
!unzip credits.csv.zip

Archive:  movies_metadata.csv.zip
  inflating: movies_metadata.csv     
Archive:  credits.csv.zip
  inflating: credits.csv             


In [23]:
import pandas as pd

mv_a = pd.read_csv('movies_metadata.csv', low_memory=False)
mv_a.head(2)

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,popularity,poster_path,production_companies,production_countries,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",21.946943,/rhIRbceoE9lR4veEXuwCC2wARtG.jpg,"[{'name': 'Pixar Animation Studios', 'id': 3}]","[{'iso_3166_1': 'US', 'name': 'United States o...",1995-10-30,373554033.0,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415.0
1,False,,65000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",,8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,17.015539,/vzmL6fP7aPKNKPRTFnZmiUfciyV.jpg,"[{'name': 'TriStar Pictures', 'id': 559}, {'na...","[{'iso_3166_1': 'US', 'name': 'United States o...",1995-12-15,262797249.0,104.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413.0


In [24]:
mv_a = mv_a[['title', 'overview', 'id']]
mv_a.head(2)

Unnamed: 0,title,overview,id
0,Toy Story,"Led by Woody, Andy's toys live happily in his ...",862
1,Jumanji,When siblings Judy and Peter discover an encha...,8844


In [25]:
mv_b = pd.read_csv('credits.csv', low_memory=False)
mv_b.head(2)

Unnamed: 0,cast,crew,id
0,"[{'cast_id': 14, 'character': 'Woody (voice)',...","[{'credit_id': '52fe4284c3a36847f8024f49', 'de...",862
1,"[{'cast_id': 1, 'character': 'Alan Parrish', '...","[{'credit_id': '52fe44bfc3a36847f80a7cd1', 'de...",8844
