# 1. sepal_width 회귀식, 전체에 대한 RMSE값 구하기

In [376]:
import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings('ignore')
from sklearn.metrics import mean_squared_error

In [377]:
from sklearn.datasets import load_iris
iris = load_iris()

In [378]:
data = iris.data
label = iris.target
columns = iris.feature_names
df = pd.DataFrame(data, columns = columns)
df.head()
X = df['sepal width (cm)']

In [379]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, label, test_size=0.2, random_state=2020)

In [380]:
from sklearn.linear_model import LinearRegression
sim_lr = LinearRegression()

sim_lr.fit(X_train.values.reshape(-1, 1), y_train)

y_pred = sim_lr.predict(X_test.values.reshape(-1, 1))

In [381]:
w = float(sim_lr.coef_)
b = float(sim_lr.intercept_)

In [382]:
print('sepal width 회귀식 : y = {0:.4f}x +{1:.4f}'.format(w, b))
y_pred_mse = mean_squared_error(y_pred, y_test)
y_pred_rmse = np.sqrt(y_pred_mse)
print('sepal width RMSE : ', y_pred_rmse)

sepal width 회귀식 : y = -0.7616x +3.3497
sepal width RMSE :  0.7215377526423293


# 2. 폐암 데이터 SVM, 결정 트리, 로지스틱 회귀로 예측하고 정확도 구하기

In [389]:
df2 = pd.read_csv('./data/ThoraricSurgery.csv')
df2.head(3)

Unnamed: 0,293,1,3.8,2.8,0,0.1,0.2,0.3,0.4,0.5,12,0.6,0.7,0.8,1.1,0.9,62,0.10
0,1,2,2.88,2.16,1,0,0,0,1,1,14,0,0,0,1,0,60,0
1,8,2,3.19,2.5,1,0,0,0,1,0,11,0,0,1,1,0,66,1
2,14,2,3.98,3.06,2,0,0,0,1,1,14,0,0,0,1,0,80,1


In [390]:
label = df2['0.10']
feature = df2.drop(['0.10'], axis = 1, inplace = False)

In [391]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(feature, label, test_size=0.2, random_state=2020)

## Support Vector Machine

In [411]:
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV
svc = SVC()
param_grid = {'C': [0.1, 1, 10, 100], 'gamma': [1, 0.1, 0.01, 0.001, 0.00001, 10]}

# Make grid search classifier
clf_grid = GridSearchCV(svc, param_grid, verbose=1)

# Train the classifier
clf_grid.fit(X_train, y_train)
y_pred = clf_grid.predict(X_test)
print('정확도 : ', accuracy_score(y_test, y_pred))


Fitting 5 folds for each of 24 candidates, totalling 120 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


정확도 :  0.8936170212765957


[Parallel(n_jobs=1)]: Done 120 out of 120 | elapsed:    1.1s finished


## Decision Tree

In [346]:
from sklearn.tree import DecisionTreeClassifier
dcl = DecisionTreeClassifier()
dcl.fit(X_train, y_train)
y_pred= dcl.predict(X_test)
from sklearn.metrics import accuracy_score
print('정확도 : ', accuracy_score(y_test, y_pred))

정확도 :  0.7659574468085106


## Logistic Regression

In [348]:
from sklearn.linear_model import LogisticRegression

lr_clf = LogisticRegression()
lr_clf.fit(X_train, y_train)
lr_preds = lr_clf.predict(X_test)

print('정확도 : ', accuracy_score(y_test, y_pred))

정확도 :  0.7659574468085106


## 3. Iris 데이터 2차원으로 축소 후 k = 2,3,4에 대해 군집화 후 성능 평가

In [349]:
from sklearn.datasets import load_iris
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

iris = load_iris()
# transform pandas to DataFrame
columns = ['sepal_length', 'sepal_width', 'petal_length', 'petal_width']
irisDF = pd.DataFrame(iris.data, columns=columns)
irisDF['target']=iris.target
irisDF.head(3)

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,target
0,5.1,3.5,1.4,0.2,0
1,4.9,3.0,1.4,0.2,0
2,4.7,3.2,1.3,0.2,0


In [350]:

from sklearn.preprocessing import StandardScaler

# 표준 정규 분포로 변환
iris_scaled = StandardScaler().fit_transform(irisDF.iloc[:, :-1])
from sklearn.decomposition import PCA
pca = PCA(n_components=2)
pca.fit(iris_scaled)
iris_pca=pca.transform(iris_scaled)
# 데이터 값 확인
pca_columns=['pca_component_1', 'pca_component_2']
irisDF = pd.DataFrame(iris_pca, columns = pca_columns)
irisDF['target'] = iris.target
irisDF.head(3)

Unnamed: 0,pca_component_1,pca_component_2,target
0,-2.264703,0.480027,0
1,-2.080961,-0.674134,0
2,-2.364229,-0.341908,0


## K = 2 일때

In [101]:
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_samples, silhouette_score

kmeans=KMeans(n_clusters=2, init='k-means++', max_iter=300, random_state=2020)
kmeans.fit(irisDF)
irisDF['cluster'] = kmeans.labels_
score_samples = silhouette_samples(iris.data, irisDF['cluster'])

irisDF['silhouette_coeff'] = score_samples

average_score = silhouette_score(iris.data, irisDF['cluster'])
print("정확도 : ", average_score)

정확도 :  0.6867350732769776


## K = 3 일때

In [102]:
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_samples, silhouette_score

kmeans=KMeans(n_clusters=3, init='k-means++', max_iter=300, random_state=2020)
kmeans.fit(irisDF)
irisDF['cluster'] = kmeans.labels_
score_samples = silhouette_samples(iris.data, irisDF['cluster'])

irisDF['silhouette_coeff'] = score_samples

average_score = silhouette_score(iris.data, irisDF['cluster'])
print("정확도 : ", average_score)

정확도 :  0.5016261862378945


## K = 4 일때

In [103]:
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_samples, silhouette_score

kmeans=KMeans(n_clusters=4, init='k-means++', max_iter=300, random_state=2020)
kmeans.fit(irisDF)
irisDF['cluster'] = kmeans.labels_
score_samples = silhouette_samples(iris.data, irisDF['cluster'])

irisDF['silhouette_coeff'] = score_samples

average_score = silhouette_score(iris.data, irisDF['cluster'])
print("정확도 : ", average_score)

정확도 :  0.3601654241594211


## 4. 영화 감성 분석 

In [528]:
from konlpy.tag import *

hannanum = Hannanum()
kkma = Kkma()
komoran = Komoran()
okt = Okt()

In [529]:
review_df = pd.read_csv('./data/midway.tsv', header=0, sep="\t", quoting=3)
review_df

Unnamed: 0.1,Unnamed: 0,평점,일시,감상평
0,0,9,2019.12.31 09:48,미드웨이가 재밌으면 추천 백두산이 재밌으면 비추
1,1,10,2019.12.31 10:41,저 해전이 있었기에 우리나라 광복도 가능 했음
2,2,10,2019.12.31 09:38,백두산 상영관 대폭줄이고 미드웨이 상영관 대폭늘려라
3,3,10,2019.12.31 09:14,방금 개봉했는데 1점 준애는 뭐냐 ㅋㅋ 일본놈이냐? 이제 광고돌고 있을 시간이다.
4,4,10,2019.12.31 19:58,교과서에는 미드웨이 해전에서 미국이 승리했다고 한 줄로 서술되어 있지만 단순히 한 ...
...,...,...,...,...
4820,4820,1,2019.12.31 09:11,대만에서 10월31일 개봉했었는데.성공못했음. 미국에서도 성적이 별로구..그이유가 ...
4821,4821,2,2020.01.05 20:41,아니 이게 재밌다는 사람은 뭐야?;그냥 국뽕->미국뽕 이 차이밖에 없는데 이 영화가...
4822,4822,2,2020.01.01 23:40,시간이 아까워요. 엉성한 그래서 감동도 교훈도 없는...ㅉㅉ
4823,4823,4,2019.12.31 15:37,"진주만보다 낫다고해서봤는데대실망...천조국,왜놈들 국뽕영화"


In [530]:
review_df['평점'] = review_df['평점'].apply(lambda x : 1 if x > 7 else 0) 

In [531]:
class_df = review_df['평점']
feature_df = review_df['감상평']

In [532]:
feature_df.fillna('', inplace=True)

## 정규 표현식

In [533]:
for i in range (0, len(feature_df)):
    import re
    hangul = re.compile('[^ ㄱ-ㅣ가-힣]+') # 한글과 띄어쓰기를 제외한 모든 글자
    feature_df[i] = hangul.sub(' ', feature_df[i]) # 한글과 띄어쓰기를 제외한 모든 부분을 제거


In [534]:
X_train, X_test, y_train, y_test = train_test_split(feature_df, class_df, test_size=0.2, random_state=2020)

In [535]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf_vect = TfidfVectorizer()
tfidf_vect.fit(X_train)
X_train_tfidf_vect = tfidf_vect.transform(X_train)
X_test_tfidf_vect = tfidf_vect.transform(X_test)

In [536]:
lr_clf = LogisticRegression(C=10)
lr_clf.fit(X_train_tfidf_vect, y_train)
pred = lr_clf.predict(X_test_tfidf_vect)
print('감성 분석 정확도 : ', accuracy_score(y_test, pred))


감성 분석 정확도 :  0.8704663212435233


## 한나눔

In [517]:
for i in range (0, len(feature_df)):    
    feature_df[i] = hannanum.nouns(feature_df[i])
    feature_df[i] = ' '.join(feature_df[i])

In [518]:
X_train, X_test, y_train, y_test = train_test_split(feature_df, class_df, test_size=0.2, random_state=2020)

In [475]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf_vect = TfidfVectorizer()
tfidf_vect.fit(X_train)
X_train_tfidf_vect = tfidf_vect.transform(X_train)
X_test_tfidf_vect = tfidf_vect.transform(X_test)

In [476]:
lr_clf = LogisticRegression(C=10)
lr_clf.fit(X_train_tfidf_vect, y_train)
pred = lr_clf.predict(X_test_tfidf_vect)
print('감성 분석 정확도 : ', accuracy_score(y_test, pred))


감성 분석 정확도 :  0.8621761658031089


## 꼬꼬마

In [481]:
for i in range (0, len(feature_df)):    
    feature_df[i] = kkma.nouns(feature_df[i])
    feature_df[i] = ' '.join(feature_df[i])

In [482]:
X_train, X_test, y_train, y_test = train_test_split(feature_df, class_df, test_size=0.2, random_state=2020)

In [483]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf_vect = TfidfVectorizer()
tfidf_vect.fit(X_train)
X_train_tfidf_vect = tfidf_vect.transform(X_train)
X_test_tfidf_vect = tfidf_vect.transform(X_test)

In [484]:
lr_clf = LogisticRegression(C=10)
lr_clf.fit(X_train_tfidf_vect, y_train)
pred = lr_clf.predict(X_test_tfidf_vect)
print('감성 분석 정확도 : ', accuracy_score(y_test, pred))


감성 분석 정확도 :  0.8590673575129534


## 코로만

In [491]:
for i in range (0, len(feature_df)):    
    feature_df[i] = komoran.nouns(feature_df[i])
    feature_df[i] = ' '.join(feature_df[i])

In [492]:
X_train, X_test, y_train, y_test = train_test_split(feature_df, class_df, test_size=0.2, random_state=2020)

In [493]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf_vect = TfidfVectorizer()
tfidf_vect.fit(X_train)
X_train_tfidf_vect = tfidf_vect.transform(X_train)
X_test_tfidf_vect = tfidf_vect.transform(X_test)

In [494]:
lr_clf = LogisticRegression(C=10)
lr_clf.fit(X_train_tfidf_vect, y_train)
pred = lr_clf.predict(X_test_tfidf_vect)
print('감성 분석 정확도 : ', accuracy_score(y_test, pred))


감성 분석 정확도 :  0.8632124352331606


## okt

In [499]:
for i in range (0, len(feature_df)):    
    feature_df[i] = okt.nouns(feature_df[i])
    feature_df[i] = ' '.join(feature_df[i])

In [500]:
X_train, X_test, y_train, y_test = train_test_split(feature_df, class_df, test_size=0.2, random_state=2020)

In [501]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf_vect = TfidfVectorizer()
tfidf_vect.fit(X_train)
X_train_tfidf_vect = tfidf_vect.transform(X_train)
X_test_tfidf_vect = tfidf_vect.transform(X_test)

In [502]:
lr_clf = LogisticRegression(C=10)
lr_clf.fit(X_train_tfidf_vect, y_train)
pred = lr_clf.predict(X_test_tfidf_vect)
print('감성 분석 정확도 : ', accuracy_score(y_test, pred))


감성 분석 정확도 :  0.8683937823834197
