In [1]:
import re
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt 
import seaborn as sns
import string
import nltk
import warnings 
# Silence DeprecationWarning for the rest of the session so library
# deprecation notices do not clutter cell output.
# NOTE(review): this also hides deprecations that later become hard errors.
warnings.filterwarnings("ignore", category=DeprecationWarning)
# Render matplotlib figures directly in the notebook output.
%matplotlib inline

In [2]:
# Load the labeled reviews from the tab-separated file. The first row holds
# the column names, and quoting=3 (csv.QUOTE_NONE) keeps quote characters as
# literal text instead of treating them as field quoting.
data = pd.read_csv(
    "word2vec-nlp-tutorial/labeledTrainData.tsv",
    sep="\t",
    header=0,
    quoting=3,
)

# Preview the first five rows.
data.head(n=5)

Unnamed: 0,id,sentiment,review
0,"""5814_8""",1,"""With all this stuff going down at the moment ..."
1,"""2381_9""",1,"""\""The Classic War of the Worlds\"" by Timothy ..."
2,"""7759_3""",0,"""The film starts with a manager (Nicholas Bell..."
3,"""3630_4""",0,"""It must be assumed that those who praised thi..."
4,"""9495_8""",1,"""Superbly trashy and wondrously unpretentious ..."


In [3]:
# Replace every character that is not an ASCII letter (digits, punctuation,
# quotes, ...) with a single space, keeping the raw text in `review` and the
# cleaned text in a separate `tidy_review` column.
#
# regex=True is spelled out on purpose: since pandas 2.0 `Series.str.replace`
# defaults to literal matching, in which case the bracket expression would be
# searched for verbatim and the reviews would come back unchanged.
data['tidy_review'] = data['review'].str.replace("[^a-zA-Z]", " ", regex=True)
data.head()

Unnamed: 0,id,sentiment,review,tidy_review
0,"""5814_8""",1,"""With all this stuff going down at the moment ...",With all this stuff going down at the moment ...
1,"""2381_9""",1,"""\""The Classic War of the Worlds\"" by Timothy ...",The Classic War of the Worlds by Timothy ...
2,"""7759_3""",0,"""The film starts with a manager (Nicholas Bell...",The film starts with a manager Nicholas Bell...
3,"""3630_4""",0,"""It must be assumed that those who praised thi...",It must be assumed that those who praised thi...
4,"""9495_8""",1,"""Superbly trashy and wondrously unpretentious ...",Superbly trashy and wondrously unpretentious ...


In [11]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# NLTK's English stop-word list, held in a set for fast membership tests.
stop_words = set(stopwords.words('english'))


def _drop_stop_words(text):
    """Tokenize ``text`` and re-join the tokens that are not stop words.

    The comparison is done on the lower-cased token, while the token itself is
    kept in its original case.
    """
    kept_tokens = [
        token for token in word_tokenize(text)
        if token.lower() not in stop_words
    ]
    return ' '.join(kept_tokens)


# Overwrite tidy_review with its stop-word-free version.
data['tidy_review'] = data['tidy_review'].apply(_drop_stop_words)



In [12]:
# Preview the first five rows to inspect the current tidy_review column.
data.head(n=5)

Unnamed: 0,id,sentiment,review,tidy_review
0,"""5814_8""",1,"""With all this stuff going down at the moment ...",stuff going moment MJ started listening music ...
1,"""2381_9""",1,"""\""The Classic War of the Worlds\"" by Timothy ...",Classic War Worlds Timothy Hines entertaining ...
2,"""7759_3""",0,"""The film starts with a manager (Nicholas Bell...",film starts manager Nicholas Bell giving welco...
3,"""3630_4""",0,"""It must be assumed that those who praised thi...",must assumed praised film greatest filmed oper...
4,"""9495_8""",1,"""Superbly trashy and wondrously unpretentious ...",Superbly trashy wondrously unpretentious explo...


In [None]:
# Import the one name that is needed instead of `import *`, so the cell does
# not silently pull unrelated names into the notebook namespace.
from nltk.stem.porter import PorterStemmer

stemmer = PorterStemmer()


def _stem_review(text):
    """Return ``text`` with every whitespace-separated token Porter-stemmed.

    Tokens are split on whitespace, stemmed one by one and re-joined with a
    single space.
    """
    return ' '.join(stemmer.stem(token) for token in text.split())


# Stem each review in a single pass. The previous version re-joined the tokens
# with `for i in range(len(...))` and `series[i] = ...`, which is a label-based
# lookup that only works while the index is exactly 0..n-1, and is slower than
# letting `apply` do the iteration.
tokenized_review = data['tidy_review'].apply(_stem_review)

# Overwrite tidy_review with the stemmed text used for feature extraction.
data['tidy_review'] = tokenized_review

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Vectorizer settings: ignore terms that occur in more than 90% of the reviews
# or in fewer than 2 reviews, cap the vocabulary at 1000 features, and apply
# scikit-learn's built-in English stop-word list.
tfidf_vectorizer = TfidfVectorizer(
    stop_words='english',
    max_features=1000,
    min_df=2,
    max_df=0.90,
)

# TF-IDF feature matrix, one row per review.
# NOTE(review): the vectorizer is fit on every review here, before the
# train/validation split performed later in the notebook, so the vocabulary
# and IDF weights are learned from validation text too.
tfidf = tfidf_vectorizer.fit_transform(data['tidy_review'])

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score

# Probability cut-off for predicting the positive class. Named here so the
# value and its description cannot drift apart.
POSITIVE_THRESHOLD = 0.25

# Hold out 25% of the rows for validation; random_state fixes the shuffle so
# the split is reproducible.
xtrain_tfidf, xvalid_tfidf, ytrain, yvalid = train_test_split(
    tfidf, data['sentiment'], random_state=42, test_size=0.25
)

lreg = LogisticRegression()
lreg.fit(xtrain_tfidf, ytrain)  # train on the training split only

# Class probabilities for the validation split, one column per class.
prediction = lreg.predict_proba(xvalid_tfidf)

# Label a review 1 when the probability in column 1 is at least
# POSITIVE_THRESHOLD, else 0. Column 1 is the positive class assuming
# `sentiment` only holds 0 and 1, as in the preview -- TODO confirm.
prediction_int = prediction[:, 1] >= POSITIVE_THRESHOLD
# Builtin `int` replaces the `np.int` alias, which was removed in NumPy 1.24
# and now raises AttributeError.
prediction_int = prediction_int.astype(int)

f1_score(yvalid, prediction_int)

In [None]:
from sklearn import svm

# Support-vector classifier with C=5 and gamma=1 (all other settings left at
# their defaults), trained on the same training split as the logistic model.
svmClf = svm.SVC(gamma=1, C=5).fit(xtrain_tfidf, ytrain)

# Hard class predictions for the validation split, scored with F1.
svmPrediction = svmClf.predict(xvalid_tfidf)
f1_score(yvalid, svmPrediction)