In [1]:
import warnings
# NOTE(review): with no category or module given, this hides every warning
# for the rest of the notebook, including any raised by the cells below.
# Consider narrowing it to the specific categories that are known noise.
warnings.filterwarnings('ignore')

In [2]:
import numpy as np
import pandas as pd

import glob
import os
import string

from html.parser import HTMLParser

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.feature_extraction.text import TfidfVectorizer

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem.lancaster import LancasterStemmer

# **Формирование DataFrame**

In [3]:
# Collect the review file paths of each split/label directory, sorted by
# path so the listing order is deterministic.
train_pos_files, train_neg_files, test_pos_files, test_neg_files = [
    sorted(glob.glob(os.path.join(review_dir, "*.txt")))
    for review_dir in (
        "aclImdb/train/pos",
        "aclImdb/train/neg",
        "aclImdb/test/pos",
        "aclImdb/test/neg",
    )
]

In [4]:
# Report how many review files were found for each split/label pair.
print(
    f"train pos files count: {len(train_pos_files)}",
    f"train neg files count: {len(train_neg_files)}",
    f"test pos files count: {len(test_pos_files)}",
    f"test neg files count: {len(test_neg_files)}",
    sep="\n",
)

train pos files count: 12500
train neg files count: 12500
test pos files count: 12500
test neg files count: 12500


In [5]:
# Pool the original train and test listings per label (train paths first);
# the notebook makes its own train/test split later on.
pos_files = [*train_pos_files, *test_pos_files]
neg_files = [*train_neg_files, *test_neg_files]

In [6]:
# One label per review file: 1 for a positive review, 0 for a negative one.
# The lengths are taken from the file lists (instead of a hard-coded 25000)
# so labels and texts always line up, whatever number of files was found.
pos_labels = np.full(len(pos_files), 1, dtype=int)
neg_labels = np.full(len(neg_files), 0, dtype=int)

In [7]:
# Read every positive review into memory, in the order of pos_files.
pos_text = []

for file in pos_files:
    # Decode explicitly as UTF-8: without an encoding argument open() uses
    # the platform default, which is not UTF-8 everywhere.
    # NOTE(review): assumes the review files are UTF-8 encoded — confirm.
    with open(file, encoding="utf-8") as f:
        pos_text.append(f.read())

# dtype=object keeps plain Python strings. Letting NumPy infer the dtype
# would create a fixed-width unicode array in which every element is padded
# to the length of the longest review.
pos_text = np.array(pos_text, dtype=object)

In [8]:
# Read every negative review into memory, in the order of neg_files.
neg_text = []

for file in neg_files:
    # Decode explicitly as UTF-8: without an encoding argument open() uses
    # the platform default, which is not UTF-8 everywhere.
    # NOTE(review): assumes the review files are UTF-8 encoded — confirm.
    with open(file, encoding="utf-8") as f:
        neg_text.append(f.read())

# dtype=object keeps plain Python strings. Letting NumPy infer the dtype
# would create a fixed-width unicode array in which every element is padded
# to the length of the longest review.
neg_text = np.array(neg_text, dtype=object)

In [9]:
# Assemble the full dataset: positive rows first, then negative rows, with
# the label in 'sentiment' and the raw review in 'text'.
reviews_df = pd.DataFrame(
    dict(
        sentiment=np.concatenate([pos_labels, neg_labels]),
        text=np.concatenate([pos_text, neg_text]),
    )
)

In [10]:
# Preview the first rows; positives were concatenated first, so these are
# positive reviews.
reviews_df.head(3)

Unnamed: 0,sentiment,text
0,1,Bromwell High is a cartoon comedy. It ran at t...
1,1,Homelessness (or Houselessness as George Carli...
2,1,Brilliant over-acting by Lesley Ann Warren. Be...


In [11]:
# Preview the last rows; negatives were concatenated last, so these are
# negative reviews.
reviews_df.tail(3)

Unnamed: 0,sentiment,text
49997,0,The basic genre is a thriller intercut with an...
49998,0,Four things intrigued me as to this film - fir...
49999,0,David Bryce's comments nearby are exceptionall...


# **Перемешивание DataFrame** 

In [12]:
# Shuffle the rows with a fixed seed so that the row order, and therefore
# the seeded train/test split made later, is identical on every run.
# The permutation is built over row positions (0..len-1), which is what
# .iloc expects, rather than over index labels.
reviews_df = reviews_df.iloc[np.random.default_rng(42).permutation(len(reviews_df))]

In [13]:
# Inspect the first rows after shuffling; the original index labels are kept.
reviews_df.head(10)

Unnamed: 0,sentiment,text
10885,1,"Funny, sexy, hot!!! There is no real plot but ..."
13700,1,The Falcon and the Snowman is based on a true ...
24770,1,"""Radio Flyer"" is one of my most loved American..."
6269,1,"Ang Lee clearly likes to ease into a film, to ..."
9175,1,WOW! What - a - movie !!!!!!!!!!! I'm not at a...
8996,1,"A very silly movie, this starts with a soft po..."
23774,1,Asterix and the Vikings is the first animated ...
904,1,Given the opposite circumstance of 2009 where ...
22792,1,After watching this film I experienced a new s...
12994,1,It plays like your usual teenage-audience T&A ...


In [14]:
# Inspect the last rows after shuffling; the original index labels are kept.
reviews_df.tail(10)

Unnamed: 0,sentiment,text
44638,0,I am a big fan of British films in general but...
45334,0,"Alright, how someone can actually think this m..."
37788,0,"Although not a big Coen brothers fan, I am an ..."
22326,1,I think that Never Been Kissed was a totally a...
4290,1,This is widely viewed in Australia as one of t...
44773,0,Despite John Travolta's statements in intervie...
46114,0,The ABC gears up it's repertory company for an...
48738,0,I've never understood this type of spoof movie...
18079,1,"So it's not an award winner, so what? Have you..."
46637,0,"and generally speaking, you will eventually ha..."


# **Удаление html-разметки**

In [15]:
class ReviewsHTMLParser(HTMLParser):
    """HTML parser that keeps only the text content of what it is fed.

    Every text chunk reported by the parser is appended to ``self.text``
    preceded by a single space; the tags themselves are discarded.
    """

    def __init__(self):
        super().__init__()
        # Accumulated text; each chunk is stored as " " + chunk.
        self.text = ""

    def handle_data(self, data):
        """Append one text chunk reported by the parser."""
        self.text += " " + data

    def get_data(self):
        """Return the collected text without the leading separator space.

        The parser is closed first. ``feed()`` may hold back trailing input
        (for example text that ends in an unterminated ``&...`` sequence, or
        an incomplete tag) while it waits for more data; ``close()`` forces
        that tail through ``handle_data`` so it is not silently dropped.
        """
        self.close()
        return self.text[1:]

In [16]:
def strip_tags(text):
    """Return ``text`` with its HTML tags removed.

    The text is run through a fresh ``ReviewsHTMLParser`` and whatever text
    content the parser collected is returned.
    """
    parser = ReviewsHTMLParser()
    parser.feed(text)
    # feed() may buffer the end of the input (e.g. text ending in an
    # unterminated "&..." sequence or an incomplete tag) while waiting for
    # more data; close() flushes it so the tail of the review is not lost.
    parser.close()
    return parser.get_data()

In [17]:
# Strip HTML markup from every review, element by element.
reviews_df["text"] = reviews_df["text"].map(strip_tags)

# **Дополнительная очистка**

In [18]:
def clear_text(text):
    """Return ``text`` with every apostrophe replaced by a single space."""
    return " ".join(text.split("'"))

In [19]:
# Replace apostrophes with spaces in every review, element by element.
reviews_df["text"] = reviews_df["text"].map(clear_text)

In [20]:
def remove_punctuation(text):
    """Return ``text`` with every character of ``string.punctuation`` deleted."""
    # Mapping a code point to None makes str.translate drop that character.
    deletions = dict.fromkeys(map(ord, string.punctuation))
    return text.translate(deletions)

In [21]:
# Delete punctuation characters from every review, element by element.
reviews_df["text"] = reviews_df["text"].map(remove_punctuation)

# **Разделение датасета**

In [22]:
# Labels and texts as NumPy arrays, in the current (shuffled) row order.
y = reviews_df["sentiment"].to_numpy()
# The texts are overwritten element by element by stop_words_stemmer_apply
# further down. to_numpy() does not promise a copy by default, so request
# one explicitly; otherwise those writes could land in reviews_df itself.
x = reviews_df["text"].to_numpy(copy=True)

# **Применение nltk**

In [23]:
def stop_words_stemmer_apply(x, stop_words, st):
    """Remove stop words from each text in ``x`` and stem what remains.

    Each element is tokenized with ``word_tokenize``; tokens whose
    lower-cased form is in ``stop_words`` are dropped, the remaining tokens
    are passed through ``st.stem`` and joined back with single spaces.

    Args:
        x: indexable, sized collection of strings. It is modified in place:
            every element is replaced by its processed version.
        stop_words: container supporting ``in`` with the words to drop
            (compared against lower-cased tokens).
        st: object exposing a ``stem(word)`` method.

    Returns:
        The same ``x`` object that was passed in, after modification.
    """
    for idx in range(len(x)):
        stems = []
        for token in word_tokenize(x[idx]):
            if token.lower() in stop_words:
                continue
            stems.append(st.stem(token))
        x[idx] = " ".join(stems)
    return x

In [24]:
# English stop-word list as a set, used for membership tests on tokens.
# NOTE(review): presumably requires the NLTK 'stopwords' corpus (and the
# tokenizer data used by word_tokenize) to be downloaded already — nothing
# in this notebook downloads them; confirm for fresh environments.
stop_words = set(stopwords.words('english'))

In [25]:
# Stemmer instance handed to stop_words_stemmer_apply below.
st = LancasterStemmer()

In [26]:
# Stop-word removal and stemming over all reviews. The function writes its
# results into the array it receives and returns that same array, so
# re-running this cell processes already-stemmed text again.
x = stop_words_stemmer_apply(x, stop_words, st)

# **Классификация через LogisticRegression**

In [27]:
# TF-IDF vectorizer with default settings.
vectorizer = TfidfVectorizer()

In [28]:
# Learn the vocabulary/IDF weights and turn the texts into a feature matrix.
# From here on `x` is the matrix, no longer the array of strings, so this
# cell cannot simply be re-run without re-running the cells above it.
# NOTE(review): the vectorizer is fitted on all rows before the train/test
# split below, so documents that end up in the test split (and in the
# cross-validation hold-out folds) contribute to the vocabulary and IDF
# weights. Consider fitting on the training part only, e.g. via a Pipeline.
x = vectorizer.fit_transform(x)
x.shape

(50000, 111855)

In [29]:
# Hold out 25% of the rows for testing; the split itself uses a fixed seed.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.25, random_state=42)

In [30]:
# Logistic regression with default settings, scored with 3-fold
# cross-validation on the training split.
model = LogisticRegression()

scores = cross_val_score(estimator=model, X=x_train, y=y_train, cv=3)
print("Средняя точность: %0.2f" % (scores.mean(),))

Средняя точность: 0.88


In [31]:
# Fit the model on the whole training split before scoring it on the
# held-out split in the next cell.
model.fit(x_train, y_train)

In [32]:
# Predict on the held-out split and report the share of correct labels.
y_pred = model.predict(x_test)

accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print("Точность предсказания на тестовой выборке: %0.2f" % (accuracy,))

Точность предсказания на тестовой выборке: 0.89
