In [None]:
import pandas as pd
import numpy as np
import os

# Build one DataFrame of (review, sentiment) rows from the extracted aclImdb
# folders, shuffle it, and write it to movie_data.csv.
os.makedirs("aclImdb_v1", exist_ok=True)
# The machine-specific location stays as the default; set the ACLIMDB_DIR
# environment variable to read the dataset from somewhere else.
basepath = os.environ.get("ACLIMDB_DIR", r"C:\Users\rsocc\Downloads\aclImdb_v1\aclImdb")

labels = {'pos': 1, 'neg': 0}

# Collect rows in a plain list and build the frame once: calling pd.concat
# inside the loop copies every previously loaded row again for each file.
records = []
for split_name in ('test', 'train'):
    for label_name in ('pos', 'neg'):
        review_dir = os.path.join(basepath, split_name, label_name)
        for fname in sorted(os.listdir(review_dir)):
            with open(os.path.join(review_dir, fname), 'r', encoding='utf-8') as infile:
                records.append((infile.read(), labels[label_name]))

df = pd.DataFrame(records, columns=['review', 'sentiment'])

df = df.sample(frac=1, random_state=0)  # Shuffle dataset
# A second seeded permutation follows the shuffle above. It is kept so the row
# order written to movie_data.csv is the same as before this rewrite.
np.random.seed(0)
df = df.reindex(np.random.permutation(df.index))
df.to_csv('movie_data.csv', index=False, encoding='utf-8')
df.head(3)
df.shape
#50k, 2

In [None]:
import re
from nltk.stem.porter import PorterStemmer
from nltk.corpus import stopwords
import nltk
# Stemmer instance used by tokenizer_porter.
porter = PorterStemmer()

# The stop-word corpus has to be downloaded before the English list is read.
nltk.download('stopwords')
stop = stopwords.words('english')

def preprocess_text(text):
    """Clean a raw review into lowercase words separated by single spaces.

    HTML tags are replaced by a space rather than deleted, so the words on
    either side of a tag (e.g. ``great<br />movie``) are not fused into one
    token. Every run of non-word characters is then collapsed to one space and
    the text is lower-cased.

    Args:
        text: Raw review string, possibly containing HTML markup.

    Returns:
        The cleaned string, with no leading or trailing whitespace.
    """
    text = re.sub('<[^>]*>', ' ', text)  # Replace HTML tags with a separator
    text = re.sub(r'\W+', ' ', text.lower())  # Remove punctuation & lowercase
    # Tags or punctuation at either end leave a stray space behind.
    return text.strip()

def tokenizer(text):
    """Break ``text`` into a list of tokens, splitting on runs of whitespace."""
    tokens = text.split()
    return tokens

def tokenizer_porter(text):
    """Tokenise ``text``, drop stop words, and Porter-stem what is left.

    Args:
        text: String to split on whitespace via ``tokenizer``.

    Returns:
        List of stemmed tokens, in their original order, excluding every token
        that appears in the module-level ``stop`` list.
    """
    # ``stop`` is a list, so testing each token against it scans the whole
    # list; a set built once per call gives the same answer with O(1) lookups.
    stop_words = set(stop)
    return [porter.stem(word) for word in tokenizer(text) if word not in stop_words]

# Replace every review with its cleaned form (element-wise over the column).
df['review'] = df['review'].map(preprocess_text)


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF vectoriser: accent stripping, lower-casing and the built-in
# preprocessor are switched off; tokenisation is delegated to tokenizer_porter
# and the NLTK stop-word list is passed through as stop_words.
tfidf = TfidfVectorizer(
    strip_accents=None,
    lowercase=False,
    preprocessor=None,
    tokenizer=tokenizer_porter,
    stop_words=stop,
)

# Learn the vocabulary from every review in df and vectorise them.
X = tfidf.fit_transform(df['review'].values)


In [None]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline

# Split the review text itself, not the TF-IDF matrix X. The pipeline below
# starts with the `tfidf` vectoriser, so it must be fed strings: handing it
# already-vectorised rows makes tokenizer_porter fail on `.split()`. Splitting
# before vectorising also means the vocabulary and IDF weights used by the
# model are learned from the training half only.
X_train, X_test, y_train, y_test = train_test_split(
    df['review'].values, df['sentiment'].values, test_size=0.5, random_state=0)

# Vectoriser + classifier in one estimator, so fit/score/predict all accept
# review strings.
model = Pipeline([('vectorizer', tfidf), ('classifier', LogisticRegression(solver='liblinear', random_state=0))])
model.fit(X_train, y_train)

print('Test Accuracy: %.3f' % model.score(X_test, y_test))


In [None]:
from sklearn.model_selection import GridSearchCV

# Candidate settings for the pipeline's 'classifier' step: three values of C
# crossed with both penalty types.
param_grid = {
    'classifier__C': [1.0, 10.0, 100.0],
    'classifier__penalty': ['l1', 'l2'],
}

# 5-fold cross-validated search scored by accuracy.
gs = GridSearchCV(estimator=model, param_grid=param_grid, scoring='accuracy', cv=5, verbose=2)
gs.fit(X_train, y_train)

print('Best Parameters:', gs.best_params_)
print('Best Accuracy:', gs.best_score_)


In [None]:
from sklearn.linear_model import SGDClassifier
from sklearn.feature_extraction.text import HashingVectorizer

# Hashing vectoriser: needs no fitted vocabulary, so it can transform
# mini-batches independently of each other.
vect = HashingVectorizer(decode_error='ignore', n_features=2**21, preprocessor=None, tokenizer=tokenizer_porter)

# The logistic loss is spelled 'log_loss' in newer scikit-learn releases and
# 'log' in older ones; newer releases reject the old spelling at fit time.
# Pick whichever name the installed version knows (defaulting to the new one
# if the loss table cannot be inspected).
_known_losses = getattr(SGDClassifier, 'loss_functions', {'log_loss': None})
_log_loss_name = 'log_loss' if 'log_loss' in _known_losses else 'log'
clf = SGDClassifier(loss=_log_loss_name, random_state=1)

def stream_docs(path):
    """Lazily yield ``(text, label)`` pairs from the CSV file at ``path``.

    The file is expected to have a header row followed by rows whose first
    column is the review text and whose second column is an integer label.
    Rows are parsed with the ``csv`` module, so quoted fields are unquoted,
    doubled quotes are collapsed, and fields containing commas or line breaks
    are read as a single value.

    Args:
        path: Location of the CSV file.

    Yields:
        Tuples of ``(text, label)`` with ``label`` converted to ``int``.
    """
    import csv  # local import keeps this definition self-contained

    # newline='' lets the csv reader handle line endings inside quoted fields.
    with open(path, 'r', encoding='utf-8', newline='') as handle:
        reader = csv.reader(handle)
        next(reader, None)  # Skip header (tolerates a completely empty file)
        for row in reader:
            if not row:  # blank line
                continue
            yield row[0], int(row[1])


def get_minibatch(doc_stream, size):
    """Take up to ``size`` documents from ``doc_stream``.

    Args:
        doc_stream: Iterator of ``(text, label)`` pairs, e.g. ``stream_docs(...)``.
        size: Maximum number of documents to pull.

    Returns:
        ``(docs, labels)`` as two lists of equal length. The lists are shorter
        than ``size`` when the stream runs out part-way, and empty once it is
        exhausted.
    """
    docs, y = [], []
    for _ in range(size):
        try:
            text, label = next(doc_stream)
        except StopIteration:
            break
        docs.append(text)
        y.append(label)
    return docs, y

# Online (mini-batch) training: stream rows from movie_data.csv and update the
# classifier incrementally, for at most 45 batches of 1000 documents each.
# NOTE: X_train / y_train / X_test / y_test are rebound here, replacing the
# values the earlier train_test_split cell assigned to the same names.
doc_stream = stream_docs(path='movie_data.csv')
for _ in range(45):
    # get_minibatch is called with the stream and a batch size and its result
    # is unpacked into (texts, labels).
    X_train, y_train = get_minibatch(doc_stream, size=1000)
    if not X_train:  # falsy batch: nothing left to train on, stop early
        break
    X_train = vect.transform(X_train)
    # classes is passed on every call so both labels are known to partial_fit.
    clf.partial_fit(X_train, y_train, classes=[0, 1])

# Evaluate on test set
# The same stream is reused, so these are the next 5000 documents after the
# ones consumed by the training loop above.
X_test, y_test = get_minibatch(doc_stream, size=5000)
X_test = vect.transform(X_test)
print('Online Learning Accuracy: %.3f' % clf.score(X_test, y_test))


In [None]:
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer

# Bag-of-words counts for topic modelling: English stop words removed, terms
# occurring in more than 10% of reviews dropped, vocabulary capped at 5000.
count = CountVectorizer(stop_words='english', max_df=.1, max_features=5000)
X = count.fit_transform(df['review'].values)

# Fit a 10-topic LDA model on the count matrix.
lda = LatentDirichletAllocation(n_components=10, random_state=123, learning_method='batch')
X_topics = lda.fit_transform(X)

# Newer scikit-learn releases expose the vocabulary through
# get_feature_names_out() and no longer provide get_feature_names(); use the
# new accessor when it exists and fall back to the old one otherwise.
if hasattr(count, 'get_feature_names_out'):
    feature_names = count.get_feature_names_out()
else:
    feature_names = count.get_feature_names()

# Print the five highest-weighted terms of every topic.
for topic_idx, topic in enumerate(lda.components_):
    print("Topic %d:" % (topic_idx + 1))
    print(" ".join([feature_names[i] for i in topic.argsort()[:-6:-1]]))


In [None]:
import joblib
# Round-trip the fitted pipeline through disk and run a sample prediction.
joblib.dump(model, 'sentiment_model.pkl')
model = joblib.load('sentiment_model.pkl')
# The reviews in df were cleaned with preprocess_text before the vectoriser
# saw them, and the vectoriser itself does no lower-casing (lowercase=False),
# so new text must go through the same cleaning or tokens such as "This" and
# "fantastic!" will not match anything learned during training.
print(model.predict([preprocess_text("This movie was fantastic!")]))


In [None]:
#train model
import pickle

# Train your model (Logistic Regression, for example)
# `train_your_model` is a placeholder that this notebook never defines, so an
# unconditional call raises NameError. Call it only when a function of that
# name has been supplied; otherwise the `clf` already fitted by the
# online-learning cell is the object that gets saved.
if callable(globals().get('train_your_model')):
    clf = train_your_model()  # Placeholder function

# Save model and stop words
dest = os.path.join('movieclassifier', 'pkl_objects')
os.makedirs(dest, exist_ok=True)

# Context managers make sure each file is flushed and closed after the dump.
with open(os.path.join(dest, 'classifier.pkl'), 'wb') as clf_file:
    pickle.dump(clf, clf_file)
with open(os.path.join(dest, 'stopwords.pkl'), 'wb') as stop_file:
    pickle.dump(stop, stop_file)
