In [1]:
import numpy as np
import pandas as pd

In [2]:
# Location of the tab-separated movie review dataset, relative to the notebook.
DATA_PATH = './data/moviereviews.tsv'
# Read the file into a DataFrame, splitting fields on tab characters.
df = pd.read_csv(DATA_PATH, sep='\t')

In [3]:
# Preview the first five rows to check the column layout.
df.head(n=5)

Unnamed: 0,label,review
0,neg,how do films like mouse hunt get into theatres...
1,neg,some talented actresses are blessed with a dem...
2,pos,this has been an extraordinary year for austra...
3,pos,according to hollywood movies made in last few...
4,neg,my first press screening of 1998 and already i...


In [4]:
# Number of rows in the freshly loaded frame.
df.shape[0]

2000

In [8]:
# Optional spot check, left disabled: uncomment to print the full text of the
# review stored under index label 2.
# print(df['review'][2])

### 결측치 제거

In [10]:
# Count missing values per column before any cleaning.
df.isna().sum()

label      0
review    35
dtype: int64

In [11]:
# Remove every row that has a missing value in any column, modifying df in place.
df.dropna(axis=0, how='any', inplace=True)

In [12]:
# Re-count missing values per column to confirm none remain.
df.isna().sum()

label     0
review    0
dtype: int64

In [13]:
# Two sample strings for demonstrating str.isspace():
# ordinary text and a string holding a single space.
mystring, empty = 'hello', ' '

In [14]:
# A string with visible characters is not whitespace-only.
mystring.isspace()

False

In [15]:
# A string made up solely of whitespace is reported as blank.
empty.isspace()

True

In [16]:
# Collect the index labels of reviews that contain no visible text.
#
# The check is vectorized on the 'review' column alone, so it does not depend
# on how many columns the frame has. Stripping whitespace and comparing with
# '' flags both whitespace-only and empty strings; a missing value (NaN)
# compares unequal to '' and is therefore not flagged (and does not raise).
is_blank = df['review'].str.strip() == ''

# Plain Python list of index labels, ready to be passed to df.drop().
blanks = df.index[is_blank].tolist()

In [17]:
# Show the index labels that were flagged as blank reviews.
blanks

[57,
 71,
 147,
 151,
 283,
 307,
 313,
 323,
 343,
 351,
 427,
 501,
 633,
 675,
 815,
 851,
 977,
 1079,
 1299,
 1455,
 1493,
 1525,
 1531,
 1763,
 1851,
 1905,
 1993]

In [18]:
# Remove the blank reviews by index label, modifying df in place.
# errors='ignore' keeps this cell safe to re-run: on a second execution the
# labels are already gone, which would otherwise raise a KeyError.
df.drop(blanks, axis=0, errors='ignore', inplace=True)

In [19]:
# Number of rows left after dropping missing and blank reviews.
df.shape[0]

1938

### 데이터 분리

In [20]:
from sklearn.model_selection import train_test_split

In [21]:
# Model input: the review text column.
X = df.loc[:, 'review']

In [22]:
# Prediction target: the label column.
y = df.loc[:, 'label']

In [24]:
# Hold out 30% of the samples for evaluation; the fixed seed makes the
# split the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

### 파이프라인 생성

In [25]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC

In [27]:
# Two-step text classification pipeline:
#   'tfidf' - turns raw review text into TF-IDF feature vectors
#   'clf'   - linear support vector classifier trained on those vectors
# random_state seeds the classifier so repeated fits are reproducible; 42 is
# the same seed used for the train/test split.
text_clf = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('clf', LinearSVC(random_state=42)),
])

In [28]:
# Fit the vectorizer and the classifier on the training split.
text_clf.fit(X=X_train, y=y_train)

Pipeline(memory=None,
     steps=[('tfidf', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,...ax_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0))])

### 예측

In [29]:
# Predict a label for every review in the held-out test split.
predictions = text_clf.predict(X=X_test)

In [30]:
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score

In [31]:
# Confusion matrix comparing the true test labels with the predictions.
cm = confusion_matrix(y_true=y_test, y_pred=predictions)
print(cm)

[[235  47]
 [ 41 259]]


In [32]:
# Per-class precision, recall, F1 score and support on the test split.
report = classification_report(y_true=y_test, y_pred=predictions)
print(report)

              precision    recall  f1-score   support

         neg       0.85      0.83      0.84       282
         pos       0.85      0.86      0.85       300

   micro avg       0.85      0.85      0.85       582
   macro avg       0.85      0.85      0.85       582
weighted avg       0.85      0.85      0.85       582



In [33]:
# Overall fraction of test reviews whose label was predicted correctly.
accuracy = accuracy_score(y_true=y_test, y_pred=predictions)
print(accuracy)

0.8487972508591065
