In [3]:
import pandas as pd
import re
from bs4 import BeautifulSoup
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer

In [6]:
# Labelled training reviews, read from a tab-separated file.
train_df = pd.read_csv('labeledTrainData.tsv', sep='\t')

In [8]:
# Test reviews to be scored, read from a tab-separated file.
test_df = pd.read_csv('testData.tsv', sep='\t')

In [12]:
# A set gives constant-time membership tests. cleanup_review checks every
# token against STOPWORDS, and scanning a list for each one is needlessly slow.
STOPWORDS = set(stopwords.words("english"))
lemmatizer = WordNetLemmatizer()

In [14]:
def cleanup_review(raw_review):
    '''
    Turn one raw review into a normalised, space-separated token string.

    Steps:
        1. Strip HTML markup, keeping only the text content.
        2. Replace every character that is not an ASCII letter with a space
           (this drops digits and punctuation).
        3. Lower-case the text and split it on whitespace.
        4. Discard tokens that appear in STOPWORDS.
        5. Lemmatize each remaining token with the module-level lemmatizer.

    Args:
        raw_review: The review text, possibly containing HTML tags.

    Returns:
        The surviving lemmas joined by single spaces.
    '''
    text_only = BeautifulSoup(raw_review, 'lxml').get_text()
    letters_only = re.sub("[^A-Za-z]", " ", text_only)
    # Filter and lemmatize in a single pass over the lower-cased tokens.
    lemmas = (
        lemmatizer.lemmatize(token)
        for token in letters_only.lower().split()
        if token not in STOPWORDS
    )
    return " ".join(lemmas)

In [15]:
# Sanity check: the digits and the stopword "of" are dropped, and "spoons" is
# lemmatized to "spoon".
cleanup_review("10 spoons of sugar")

'spoon sugar'

In [16]:
# Clean every training review. A comprehension replaces the manual append loop
# and keeps the loop variable out of the notebook's global namespace.
cleaned_train_review = [cleanup_review(raw_text) for raw_text in train_df['review']]

In [17]:
# Empty placeholder for the cleaned test reviews; the next cell rebuilds this
# list from scratch, so this cell is redundant.
cleaned_test_review =[]

In [18]:
# Clean every test review with the same pipeline used for the training data.
# A comprehension replaces the manual append loop and keeps the loop variable
# out of the notebook's global namespace.
cleaned_test_review = [cleanup_review(raw_text) for raw_text in test_df['review']]

In [25]:
# Length in characters of the first cleaned training review (each entry is a
# single space-joined string).
len(cleaned_train_review[0])

1404

In [49]:
# Bag-of-words vectorizer; max_features=6000 caps the size of the vocabulary.
# stop_words=None applies no stopword filtering here, presumably because
# cleanup_review has already removed stopwords (confirm).
vectorizer  = CountVectorizer(max_features=6000,
                             stop_words=None)

In [50]:
# Fit the vectorizer on the training reviews and, in the same pass, produce
# their feature matrix.
train_features = vectorizer.fit_transform(cleaned_train_review)

In [51]:
# Keep the sparse count matrix as-is. Copying it into a dense DataFrame
# materialises every zero of a documents x 6000 table and wastes a large
# amount of memory, while scikit-learn estimators accept the sparse matrix
# directly.
X_train = train_features
# Target labels, one per training review.
y_train = train_df['sentiment']

In [52]:
# Encode the test reviews with the vocabulary already learned from the
# training data (transform only, no refitting).
test_features = vectorizer.transform(cleaned_test_review)
# Keep the sparse matrix rather than densifying it into a DataFrame; the
# classifier can predict on it directly and it uses far less memory.
X_test = test_features

In [53]:
from sklearn.linear_model import LogisticRegression

In [54]:
# Logistic regression solved with liblinear. C is scikit-learn's inverse
# regularisation strength, so 0.05 regularises more heavily than the default
# of 1.0.
classifier = LogisticRegression(solver='liblinear', C=0.05)

In [55]:
# Train on the bag-of-words features and their sentiment labels.
classifier.fit(X_train,y_train)

LogisticRegression(C=0.05, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='liblinear',
          tol=0.0001, verbose=0, warm_start=False)

In [56]:
from sklearn.metrics import accuracy_score
# Scored on the very rows the model was fitted on, so this is an optimistic
# figure rather than an estimate of accuracy on unseen reviews.
print("training set accuracy :", accuracy_score(y_train, classifier.predict(X_train)))

training set accuracy : 0.93372


In [57]:
# Predicted sentiment label for every test review.
y_final = classifier.predict(X_test)

In [58]:
# Pair each test review id with its predicted sentiment.
submission =pd.DataFrame({'id': test_df['id'], 'sentiment':y_final})

In [59]:
# index=False leaves the DataFrame's row index out of the file, so only the
# id and sentiment columns are written.
submission.to_csv('submission.csv', index=False)