In [2]:
import pandas

path = "polarity_dataframe.csv"

df = pandas.read_csv(path)
print(df.head())

review = df.review.tolist()
y = df.pos.tolist()

   Unnamed: 0                                             review  pos
0           0  when the haunting arrived in theaters , all i ...    0
1           1  movies can do the two big es very well : educa...    0
2           2   " america's sweethearts " has an intriguing p...    0
3           3  remember back in the mid 1990s when crime and ...    0
4           4  when i originally saw the trailer for " analyz...    0


It is needed to clean the data from unneeded words: "stop words", inintelligible words ("______", "0009f", ...), |words| <= 2,3.

In [13]:
def contains_underscore(word):
    return any(i == "_" for i in word)

def contains_number(word):
    return any(i.isdigit() for i in word)

def small_length(word):
    return len(word) <= 3

def remove_word(word):
    small = small_length(word)
    underscore = contains_underscore(word)
    number = contains_number(word)
    return small or underscore or number

# faire en sorte de retirer des review 
# les mots répondant True à "remove_word"
cleaned_review = []
for sentence in review:
    words = []
    for w in sentence.split(" "):
        w = w.strip()
        if not remove_word(w):
            words.append(w)
    cleaned_review.append(" ".join(words))
print("New sentence: \n{}".format(cleaned_review[0]))

New sentence: 
when haunting arrived theaters kept hearing about overdone special effects fact that very often unseen bumps night horror film scarier than those that face courtesy special effects while agree that this remake haunting goes overboard visual effects department don't think that they completely blame this movie's failure appears that some people have failed take into account that original haunting unseen terrors about scary dust bunny special effects story isn't least scary aren't going with very frightening movie thing that interested most about this movie caretaker this building played bruce dern dern always great even though have only about minutes screen time still most interesting element movie through seemingly endless albeit fairly impressive special effects kept wishing that this movie about dern's caretaker dimensional characters that populated cast never good sign when player best part movie liam neeson plays scientist conducting experiments fear decides best resu

In [11]:
from sklearn.model_selection import train_test_split

X_train, X_test = train_test_split(cleaned_review)
y_train, y_test = train_test_split(y)
print(len(X_train))
print(len(X_test))

1500
500


In [10]:
from sklearn.feature_extraction.text import CountVectorizer

vect = CountVectorizer(stop_words="english")
vect.fit(X_train)
X_train_dtm = vect.transform(X_train)

X_test_dtm = vect.transform(X_test)
print(len(vect.get_feature_names()))
pandas.DataFrame(X_train_dtm.toarray(), columns=vect.get_feature_names())

33807


Unnamed: 0,aa,aaaaaaaaah,aaaaaaaahhhh,aaaaaah,aaaahhhs,aahs,aaliyah,aamir,aardman,aaron,...,zukovsky,zulu,zundel,zurg,zus,zweibel,zwick,zwigoff,zycie,zzzzzzz
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1495,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1496,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1497,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1498,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [12]:
from sklearn.naive_bayes import MultinomialNB

nb = MultinomialNB()
%time nb.fit(X_train_dtm, y_train)

y_pred_class = nb.predict(X_test_dtm)

from sklearn import metrics

print(metrics.accuracy_score(y_test, y_pred_class==1))
print(pandas.DataFrame(metrics.confusion_matrix(y_test, y_pred_class==1), columns=["0","1"]))

y_pred_prob = nb.predict_proba(X_test_dtm)[:, 1]
auc = metrics.roc_auc_score(y_test, y_pred_prob)

#print(y_pred_prob)
print(auc)

CPU times: user 7.39 ms, sys: 1.21 ms, total: 8.6 ms
Wall time: 6.48 ms
0.504
     0    1
0  128  122
1  126  124
0.537736


In [14]:
import sklearn
print(sklearn.__version__)

0.24.2
