In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import nltk
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelBinarizer
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from wordcloud import WordCloud,STOPWORDS
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize,sent_tokenize
from bs4 import BeautifulSoup
import spacy
import re,string,unicodedata
from nltk.tokenize.toktok import ToktokTokenizer
from nltk.stem import LancasterStemmer,WordNetLemmatizer
from sklearn.linear_model import LogisticRegression,SGDClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from textblob import TextBlob
from textblob import Word
from sklearn.metrics import classification_report,confusion_matrix,accuracy_score

In [2]:
#importing the training data
train_data=pd.read_csv('C:/Users/sdandir/Desktop/projects/NLP/imdb_train.csv')
print(train_data.shape)
train_data.head(10)

(40000, 2)


Unnamed: 0,review,sentiment
0,We had STARZ free weekend and I switched on th...,negative
1,I'll admit that this isn't a great film. It pr...,negative
2,I finally found a version of Persuasion that I...,positive
3,The BBC surpassed themselves with the boundari...,positive
4,"Much praise has been lavished upon Farscape, b...",negative
5,"Of course the plot, script, and, especially ca...",positive
6,This is one of those road movies that would li...,negative
7,What an uninteresting hodge-podge. It could ha...,negative
8,Only a handful of the segments are engaging he...,negative
9,THE GREEN BUTCHERS (Anders Thomas Jensen - Den...,positive


In [3]:
#importing the training data
test_data=pd.read_csv('C:/Users/sdandir/Desktop/projects/NLP/imdb_test_data.csv')
print(test_data.shape)
test_data.head(10)

(10000, 2)


Unnamed: 0,id,review
0,1,Not only is this movie a great film for basic ...
1,2,"Waitress: Honey, here's them eggs you ordered...."
2,3,Many mystery stories follow the standard whodu...
3,4,A space ship cruising through the galaxy encou...
4,5,My favorite film this year. Great characters a...
5,6,"From the beginning of the movie, it gives the ..."
6,7,This is not the worst film I have seen of Pete...
7,8,Many consider BEAST STABLE to be the last of t...
8,9,This is a cult film for many reasons. First be...
9,10,This movie is horrible. Everything in it has b...


In [4]:
#Tokenization of text
tokenizer=ToktokTokenizer()
#Setting English stopwords
stopword_list=nltk.corpus.stopwords.words('english')

In [5]:
#Removing the html strips

def strip_html(text):
    soup = BeautifulSoup(text, "html.parser")
    return soup.get_text()

#Removing the square brackets
def remove_between_square_brackets(text):
    return re.sub('\[[^]]*\]', '', text)

#Removing the noisy text
def denoise_text(text):
    text = strip_html(text)
    text = remove_between_square_brackets(text)
    return text
#Apply function on review column
train_data['review']=train_data['review'].apply(denoise_text)
test_data['review']=test_data['review'].apply(denoise_text)

In [6]:
#Define function for removing special characters
def remove_special_characters(text, remove_digits=True):
    pattern=r'[^a-zA-z0-9\s]'
    text=re.sub(pattern,'',text)
    return text
#Apply function on review column
train_data['review']=train_data['review'].apply(remove_special_characters)
test_data['review']=test_data['review'].apply(remove_special_characters)

In [7]:
#Stemming the text
def simple_stemmer(text):
    ps=nltk.porter.PorterStemmer()
    text= ' '.join([ps.stem(word) for word in text.split()])
    return text
#Apply function on review column
train_data['review']=train_data['review'].apply(simple_stemmer)
test_data['review']=test_data['review'].apply(simple_stemmer)

In [8]:
#set stopwords to english
stop=set(stopwords.words('english'))
print(stop)

#removing the stopwords
def remove_stopwords(text, is_lower_case=False):
    tokens = tokenizer.tokenize(text)
    tokens = [token.strip() for token in tokens]
    if is_lower_case:
        filtered_tokens = [token for token in tokens if token not in stopword_list]
    else:
        filtered_tokens = [token for token in tokens if token.lower() not in stopword_list]
    filtered_text = ' '.join(filtered_tokens)    
    return filtered_text
#Apply function on review column
train_data['review']=train_data['review'].apply(remove_stopwords)
test_data['review']=test_data['review'].apply(remove_stopwords)

{'no', "you'd", "you've", 'some', 're', 'whom', 'were', 'weren', "weren't", 'how', 'off', 'the', 'again', "wouldn't", 'my', 'or', 'mightn', 'ma', 'ourselves', 'ours', 'but', 'above', 'why', 'once', 'which', 'd', 'y', 'haven', 'and', 'am', 'about', 'should', 'has', 'then', 'from', 'nor', 'being', 'here', 'aren', 've', 'shan', 'it', 'so', 'most', 'their', 'between', 'just', "hasn't", 'wasn', 'such', 'himself', 'needn', 'hasn', 'on', 'if', 'i', 'll', 'itself', 'didn', 'wouldn', "needn't", 'while', 'themselves', 'where', 'ain', 'isn', "should've", 'herself', 'when', 'him', 'there', 'same', 'does', 'with', 'too', 'over', 'these', 'been', 'further', 'of', "hadn't", 'theirs', 'do', "shouldn't", 'your', "it's", 'she', 'after', 'down', 'that', 'can', 'o', 'doesn', 'other', 'me', 'what', 'all', "mightn't", "haven't", 'are', 'doing', "don't", 'having', 'don', 'through', 'into', "aren't", 'very', 'did', 'both', 'shouldn', 'more', 'in', 'to', "that'll", 'under', 'our', 'during', 'before', 'against'

In [9]:
#normalized train reviews
norm_train_reviews=train_data.review[:40000]
norm_train_reviews[2]

'final found version persuas like ann doesnt look like sculleri maid thi version veri thin age pretti woman quit like describ book captain wentworth doesnt look like 50 doe look perpetu angri rather describ book hasnt age much ann quit handsom play part convict realismthat act believ creat real charact wa like charact book came life havent seen thi version urg find order request either bookstor librari must worth price worth wait watch 1995 version 2007 version thi one tower two whi isnt rate higher beyond comprehens book convey tender relationship thi movi make book come life'

In [10]:
#Normalized test reviews
norm_test_reviews=test_data.review[:10000]
norm_test_reviews[7005]

'today point view quit ridicul rate thi film 18 X US film ha sexual yet sublim erot stori tell pictur rather innoc throughout movi feel see spirit late 60 earli 70 fashion dialogu typic experiment cinematographi light thi exactli part make worth see'

In [11]:
#Count vectorizer for bag of words
cv=CountVectorizer(min_df=0,max_df=1,binary=False,ngram_range=(1,3))
#transformed train reviews
cv_train_reviews=cv.fit_transform(norm_train_reviews)
#transformed test reviews
cv_test_reviews=cv.transform(norm_test_reviews)

print('BOW_cv_train:',cv_train_reviews.shape)
print('BOW_cv_test:',cv_test_reviews.shape)
#vocab=cv.get_feature_names()-toget feature names

BOW_cv_train: (40000, 6207362)
BOW_cv_test: (10000, 6207362)


In [18]:

sentiment_data=train_data['sentiment']
print(sentiment_data.shape)

(40000,)


In [None]:
#training the model
lr=LogisticRegression(penalty='l2',max_iter=500,C=1,random_state=42)
#Fitting the model for Bag of words
lr_bow=lr.fit(cv_train_reviews,sentiment_data)
print(lr_bow)

In [None]:
#Predicting the model for bag of words
lr_bow_predict=lr.predict(cv_test_reviews)
print(lr_bow_predict)

In [None]:
predictions = pd.DataFrame()
predictions.insert(loc = 0 , column='id',value=test_data['id'])
predictions.insert(loc=1, column='sentiment',value=lr_bow_predict)

In [None]:
predictions.to_csv('secondsub.csv')

In [12]:
import keras

In [13]:
from keras.utils import to_categorical
from keras import models
from keras import layers

In [14]:
import matplotlib
import matplotlib.pyplot as plt
