In [1]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

In [2]:
# Read the labelled training set and the unlabelled test set in one pass,
# then the submission template that will later receive the predictions.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in ('./dataset/train.csv', './dataset/test.csv')
)
sample_submission = pd.read_csv('dataset/sample_submission.csv')

In [3]:
# Preview the first five raw training rows.
train.head(n=5)

Unnamed: 0,id,document,label
0,1,영상이나 음악이 이쁘다 해도 미화시킨 불륜일뿐,0
1,2,히치콕이 이 영화를 봤다면 분명 박수를 쳤을듯...,1
2,3,괜찮은 음악영화가 또 나왔군요!!! 따뜻한 겨울이 될 것 같아요~,1
3,4,아무래도 20년도지난작품이라 지금보기는너무유치하다,0
4,5,지금까지의 영화들이 그랬듯. 이 영화역시 일본에 대한 미화는 여전하다.,0


In [4]:
# Drop every character that is not Hangul (jamo ㄱ-ㅎ / ㅏ-ㅣ, syllables 가-힣)
# or a space; digits, Latin letters and punctuation are removed.
train['document'] = train['document'].str.replace(
    pat="[^ㄱ-ㅎㅏ-ㅣ가-힣 ]",
    repl="",
    regex=True,
)

In [5]:
# Preview the first five training rows again to check the cleaned text.
train.head(n=5)

Unnamed: 0,id,document,label
0,1,영상이나 음악이 이쁘다 해도 미화시킨 불륜일뿐,0
1,2,히치콕이 이 영화를 봤다면 분명 박수를 쳤을듯,1
2,3,괜찮은 음악영화가 또 나왔군요 따뜻한 겨울이 될 것 같아요,1
3,4,아무래도 년도지난작품이라 지금보기는너무유치하다,0
4,5,지금까지의 영화들이 그랬듯 이 영화역시 일본에 대한 미화는 여전하다,0


In [6]:
# Apply the same Hangul-and-space-only filter to the test documents so they
# are cleaned with exactly the same pattern as the training documents.
test['document'] = test['document'].str.replace(
    pat="[^ㄱ-ㅎㅏ-ㅣ가-힣 ]",
    repl="",
    regex=True,
)

In [7]:
# Preview the first five cleaned test rows.
test.head(n=5)

Unnamed: 0,id,document
0,1,시간 때우기 좋은 영화 지루함
1,2,훈훈한 정이 느껴지는 영화 가족끼리 드라마 보듯이 보면 딱
2,3,
3,4,멋있는 영화입니다 잊을 수 없는
4,5,너무 감동적이네요 펑펑 울었습니다


In [8]:
# TF-IDF vectorization: sweep the vocabulary size (max_features) and report the
# hold-out accuracy of a logistic regression for each setting.
X = train['document']
y = np.array(train.label)

# The split uses a fixed random_state and does not depend on max_features,
# so it is computed once instead of on every iteration.
docs_train, docs_eval, y_train, y_eval = train_test_split(X, y, test_size=0.2, random_state=2)

# Track the best setting so the conclusion does not have to be read off the log.
best_max_features, best_accuracy = None, -1.0

for max_features in range(100, 10000, 100):
    vectorizer = TfidfVectorizer(min_df=0.0, analyzer="char", sublinear_tf=True, ngram_range=(1, 3), max_features=max_features)

    # Fit the vocabulary/IDF on the training split only, then reuse it for the eval split.
    X_train = vectorizer.fit_transform(docs_train)
    X_eval = vectorizer.transform(docs_eval)

    lgs = LogisticRegression(class_weight='balanced')
    lgs.fit(X_train, y_train)

    # score() already predicts internally; no separate predict() call is needed.
    accuracy = lgs.score(X_eval, y_eval)
    print(f"max_features: {max_features}, Accuracy: {accuracy}" )

    # Strict '>' keeps the smallest max_features among ties.
    if accuracy > best_accuracy:
        best_max_features, best_accuracy = max_features, accuracy

print(f"best max_features: {best_max_features}, Accuracy: {best_accuracy}")

max_features: 100, Accuracy: 0.715
max_features: 200, Accuracy: 0.749
max_features: 300, Accuracy: 0.785
max_features: 400, Accuracy: 0.801
max_features: 500, Accuracy: 0.81
max_features: 600, Accuracy: 0.822
max_features: 700, Accuracy: 0.84
max_features: 800, Accuracy: 0.833
max_features: 900, Accuracy: 0.836
max_features: 1000, Accuracy: 0.844
max_features: 1100, Accuracy: 0.85
max_features: 1200, Accuracy: 0.848
max_features: 1300, Accuracy: 0.852
max_features: 1400, Accuracy: 0.852
max_features: 1500, Accuracy: 0.849
max_features: 1600, Accuracy: 0.859
max_features: 1700, Accuracy: 0.86
max_features: 1800, Accuracy: 0.859
max_features: 1900, Accuracy: 0.859
max_features: 2000, Accuracy: 0.86
max_features: 2100, Accuracy: 0.858
max_features: 2200, Accuracy: 0.859
max_features: 2300, Accuracy: 0.865
max_features: 2400, Accuracy: 0.867
max_features: 2500, Accuracy: 0.865
max_features: 2600, Accuracy: 0.863
max_features: 2700, Accuracy: 0.859
max_features: 2800, Accuracy: 0.862
max_fe

In [14]:
# Train the final model on the full training set and predict the test labels.
# NOTE(review): the original note here says the sweep peaked at max_features=9900
# (accuracy 0.881), yet 5100 is the value actually used below - confirm which
# setting is intended before submitting.
vectorizer = TfidfVectorizer(min_df=0.0, analyzer="char", sublinear_tf=True, ngram_range=(1, 3), max_features=5100)

X = train['document']
y = np.array(train.label)

# Vectorization: fit the vocabulary/IDF on all training documents, then apply
# the same fitted vectorizer to the test documents.
X_train = vectorizer.fit_transform(X)
X_test = vectorizer.transform(test['document'])

# Logistic regression. The original cell repeated this vectorize/fit/predict
# sequence a second time on the same data; the redundant copy has been removed.
lgs = LogisticRegression(class_weight='balanced')
lgs.fit(X_train, y)
predicted = lgs.predict(X_test)

In [15]:
# Reload the submission template so this cell always starts from a clean copy,
# overwrite its 'label' column with the model predictions, and save the result
# without the DataFrame index.
sample_submission = pd.read_csv(filepath_or_buffer='dataset/sample_submission.csv')
sample_submission.loc[:, 'label'] = predicted
sample_submission.to_csv(
    path_or_buf='sample_submission_day2_logistic3.csv',
    index=False,
)

In [18]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LSTM, Embedding
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from tensorflow.keras.models import load_model

In [17]:
pip install tensorflow

Collecting tensorflowNote: you may need to restart the kernel to use updated packages.

  Downloading tensorflow-2.7.0-cp38-cp38-win_amd64.whl (430.8 MB)
Collecting astunparse>=1.6.0
  Downloading astunparse-1.6.3-py2.py3-none-any.whl (12 kB)
Collecting flatbuffers<3.0,>=1.12
  Downloading flatbuffers-2.0-py2.py3-none-any.whl (26 kB)
Collecting tensorflow-estimator<2.8,~=2.7.0rc0
  Downloading tensorflow_estimator-2.7.0-py2.py3-none-any.whl (463 kB)
Collecting gast<0.5.0,>=0.2.1
  Using cached gast-0.4.0-py3-none-any.whl (9.8 kB)
Collecting opt-einsum>=2.3.2
  Downloading opt_einsum-3.3.0-py3-none-any.whl (65 kB)
Collecting keras<2.8,>=2.7.0rc0
  Downloading keras-2.7.0-py2.py3-none-any.whl (1.3 MB)
Collecting google-pasta>=0.1.1
  Using cached google_pasta-0.2.0-py3-none-any.whl (57 kB)
Collecting libclang>=9.0.1
  Downloading libclang-12.0.0-2-py2.py3-none-win_amd64.whl (13.0 MB)
Collecting absl-py>=0.4.0
  Downloading absl_py-1.0.0-py3-none-any.whl (126 kB)
Collecting tensorflow-io-