In [None]:
import numpy as np 
import pandas as pd 
from sklearn.model_selection import train_test_split 
from sklearn.metrics import accuracy_score

In [None]:
# Read the IMDB movie-review CSV into a DataFrame and preview its first rows.
reviews_csv = '../input/imdb-dataset-of-50k-movie-reviews/IMDB Dataset.csv'
df = pd.read_csv(reviews_csv)
df.head()

In [None]:
# Encode the sentiment labels as integers: 'positive' -> 1, 'negative' -> 0.
# Series.map turns every value that is not a key of the dict into NaN, so
# running this cell a second time on the already-encoded column would wipe
# the labels. The dtype guard makes the cell safe to re-run.
if not pd.api.types.is_numeric_dtype(df['sentiment']):
    df['sentiment'] = df['sentiment'].map({'positive':1,'negative':0})

In [None]:
# Show how many reviews fall into each sentiment class.
df['sentiment'].value_counts()

In [None]:
import string

In [None]:
def sentence_to_vec(s, embedding_dict, stopwords, tokenizer, dim=None):
    """Embed a sentence as the L2-normalised sum of its word vectors.

    The text is lower-cased and tokenised. Tokens that are stop words or that
    are not purely alphabetic are dropped; the vectors of the remaining tokens
    that appear in ``embedding_dict`` are summed and scaled to unit length.

    Parameters
    ----------
    s : object
        Text to embed; it is converted with ``str`` before processing.
    embedding_dict : mapping
        Maps a (lower-case) word to its embedding vector.
    stopwords : iterable of str
        Tokens to ignore.
    tokenizer : callable
        Takes a string and returns an iterable of string tokens.
    dim : int, optional
        Length of the zero vector returned when no token has an embedding.
        When omitted it is taken from a vector in ``embedding_dict`` and
        falls back to 300 if that cannot be determined.

    Returns
    -------
    numpy.ndarray
        Unit-length sentence vector, or an all-zero vector when no token has
        an embedding or when the summed vector has zero norm.
    """
    # Set membership is O(1); a list would be scanned once per token.
    if isinstance(stopwords, (set, frozenset)):
        stopword_set = stopwords
    else:
        stopword_set = set(stopwords)

    # isalpha() already rejects every punctuation-only token, so a separate
    # punctuation filter is not needed.
    tokens = [
        token
        for token in tokenizer(str(s).lower())
        if token.isalpha() and token not in stopword_set
    ]
    found = [embedding_dict[token] for token in tokens if token in embedding_dict]

    if not found:
        if dim is None:
            dim = _infer_embedding_dim(embedding_dict)
        return np.zeros(dim)

    total = np.array(found).sum(axis=0)
    norm = np.sqrt((total ** 2).sum())
    if norm == 0:
        # Vectors cancelled out exactly; dividing would produce NaNs.
        return np.zeros_like(total, dtype=float)
    return total / norm


def _infer_embedding_dim(embedding_dict, default=300):
    """Return the length of one vector in ``embedding_dict``, else ``default``."""
    try:
        sample = next(iter(embedding_dict.values()), None)
    except AttributeError:
        # Mapping-like object without .values(); keep the historical default.
        return default
    return default if sample is None else len(sample)

In [None]:
def load_embeddings(file):
    """Load whitespace-separated word embeddings from a text file.

    Each non-blank line is expected to hold a word followed by its vector
    components, all separated by whitespace.

    Parameters
    ----------
    file : str or path-like
        Path of the embeddings text file.

    Returns
    -------
    dict
        Maps each word to a 1-D ``numpy.ndarray`` of floats.

    Raises
    ------
    ValueError
        If a vector component cannot be parsed as a float.
    """
    gloveModel = {}
    # The context manager closes the file even if parsing fails. The explicit
    # encoding keeps non-ASCII words readable regardless of the platform locale.
    with open(file, 'r', encoding='utf-8') as f:
        for line in f:
            fields = line.split()
            if not fields:
                # Blank line (for example a trailing newline): nothing to parse.
                continue
            gloveModel[fields[0]] = np.array([float(value) for value in fields[1:]])

    return gloveModel

In [None]:
# Parse the GloVe text file (glove.6B.300d.txt) into a word -> vector dict.
print('Loading embeddings')
glove_path = '../input/glove-embeddings/glove.6B.300d.txt'
embeddings = load_embeddings(glove_path)

In [None]:
from nltk.tokenize import word_tokenize

In [None]:
# Turn every review into one sentence vector (no stop-word filtering, NLTK
# tokenisation) and stack the results into a 2-D feature array.
vectors = np.array([
    sentence_to_vec(
        s=text,
        embedding_dict=embeddings,
        stopwords=[],
        tokenizer=word_tokenize,
    )
    for text in df.review.values
])

# Target labels, in the same row order as `vectors`.
y = df.sentiment.values
print('Done')

In [None]:
# Hold out 20% of the reviews for evaluation, keeping the class ratio equal in
# both parts (stratify=y). A fixed random_state makes the split, and therefore
# the reported accuracy, identical on every Restart & Run All.
x_train,x_test,y_train,y_test = train_test_split(
    vectors,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

In [None]:
from sklearn.linear_model import LogisticRegression
# Logistic-regression classifier created with scikit-learn's default settings.
logreg = LogisticRegression()

In [None]:
# Fit the classifier on the training sentence vectors and their labels.
logreg.fit(X=x_train, y=y_train)

In [None]:
# Predict a sentiment label for every held-out review.
y_pred = logreg.predict(X=x_test)

In [None]:
# Fraction of held-out reviews whose predicted label matches the true label.
test_accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print('Accuracy of the model : {:.3f}'.format(test_accuracy))