In [1]:
import numpy as np
import pandas as pd

# Load the tab-separated movie review corpus; read_table uses a tab separator by default.
df = pd.read_table('moviereviews.tsv')
# Preview the first five rows to confirm that the columns were parsed as expected.
df.head(5)

Unnamed: 0,label,review
0,neg,how do films like mouse hunt get into theatres...
1,neg,some talented actresses are blessed with a dem...
2,pos,this has been an extraordinary year for austra...
3,pos,according to hollywood movies made in last few...
4,neg,my first press screening of 1998 and already i...


In [2]:
# Count the missing values in each column before any cleaning is applied.
df.isna().sum()

label      0
review    35
dtype: int64

In [3]:
# Discard every row that has a missing value in any column.
# The result is rebound to `df` rather than using inplace=True, so the frame that
# earlier cells displayed is not silently mutated and the call stays chainable.
df = df.dropna()

# Number of rows that remain after removing the NaN rows.
len(df)

1965

In [4]:
# Flag reviews that consist solely of whitespace characters.
# A boolean mask over the 'review' column replaces the row-by-row itertuples() loop,
# so this cell no longer depends on the frame having exactly two columns.
# The isinstance check keeps non-string entries (e.g. NaN) away from str.isspace().
is_blank = df['review'].map(lambda rv: isinstance(rv, str) and rv.isspace()).astype(bool)

# Index labels (not positions) of the whitespace-only reviews, as a plain list.
blanks = df.index[is_blank.to_numpy()].tolist()

print(len(blanks), 'blanks: ', blanks)

27 blanks:  [57, 71, 147, 151, 283, 307, 313, 323, 343, 351, 427, 501, 633, 675, 815, 851, 977, 1079, 1299, 1455, 1493, 1525, 1531, 1763, 1851, 1905, 1993]


In [5]:
# Remove the whitespace-only reviews identified in `blanks`.
# errors='ignore' makes the cell safe to re-run: once the labels have been dropped,
# a second drop of the same labels would otherwise raise KeyError.
# Rebinding instead of inplace=True avoids mutating the frame behind earlier outputs.
df = df.drop(index=blanks, errors='ignore')
len(df)

1938

In [6]:
# Check the class balance of the cleaned data set.
df.loc[:, 'label'].value_counts()

label
neg    969
pos    969
Name: count, dtype: int64

In [7]:
from sklearn.model_selection import train_test_split

# Features are the raw review texts; targets are the matching labels.
X, y = df['review'], df['label']

# Hold out 33% of the reviews for evaluation; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=42,
)

In [8]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
# Linear SVC:
# TF-IDF vectorization followed by a linear support-vector classifier, bundled in a
# single Pipeline so that raw text can be passed directly to fit() and predict().
# random_state is pinned because LinearSVC draws pseudo-random numbers when its dual
# solver is used; without a seed, repeated fits are not guaranteed to be identical.
text_clf_lsvc = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('clf', LinearSVC(random_state=42)),
])

In [9]:
# Fit the vectorizer and the classifier on the training split only.
text_clf_lsvc.fit(X_train, y=y_train)

In [10]:
# Predict a label for every held-out review
predictions = text_clf_lsvc.predict(X=X_test)

In [11]:
# Confusion matrix for the held-out split: rows are true labels, columns are predicted labels
from sklearn import metrics
print(metrics.confusion_matrix(y_true=y_test, y_pred=predictions))

[[259  49]
 [ 49 283]]


In [12]:
# Per-class precision, recall, F1 and support on the held-out split
print(metrics.classification_report(y_true=y_test, y_pred=predictions))

              precision    recall  f1-score   support

         neg       0.84      0.84      0.84       308
         pos       0.85      0.85      0.85       332

    accuracy                           0.85       640
   macro avg       0.85      0.85      0.85       640
weighted avg       0.85      0.85      0.85       640



In [13]:
# Fraction of held-out reviews whose predicted label matches the true label
print(metrics.accuracy_score(y_true=y_test, y_pred=predictions))

0.846875


In [16]:
# Free-form text to classify with the fitted pipeline.
your_sentence = "Tthe script becomes so unspeakably bad that the best line poor lee evens can utter after another"

# The single sentence is wrapped in a list so that it is passed as a one-document batch.
predicted_sentiment = text_clf_lsvc.predict(X=[your_sentence])
# Show the label predicted for that one document
print("Predicted Sentiment:", predicted_sentiment[0])

Predicted Sentiment: neg
