In [None]:
import os

# Walk the Kaggle input directory and print the full path of every file found.
for folder, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

# Importing Libraries

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns

import re
from keras.preprocessing.text import text_to_word_sequence
import nltk
from nltk.corpus import stopwords 
from nltk.tokenize import word_tokenize 
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import cross_validate
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score

# Silence DeprecationWarning messages so they do not clutter the cell outputs.
import warnings
warnings.simplefilter('ignore', category=DeprecationWarning)

# Load Dataset

In [None]:
# Read the IMDB reviews CSV from the Kaggle input directory into a DataFrame.
IMDB = pd.read_csv(
    filepath_or_buffer="/kaggle/input/imdb-dataset-of-50k-movie-reviews/IMDB Dataset.csv"
)

# Data Overview

In [None]:
# Preview the first five rows of the dataset.
IMDB.head(n=5)

In [None]:
# Report the (rows, columns) shape followed by the column labels, one per line.
print(IMDB.shape, IMDB.columns, sep="\n")

The IMDB dataset has 50000 rows and 2 columns.

In [None]:
# Show the dtype pandas assigned to each column of the dataset.
IMDB.dtypes

**Data Description**:

review:    review of the movie

sentiment: sentiment for the review, positive or negative

In [None]:
# Count the missing values in each column.
IMDB.isna().sum()

In [None]:
# Summary statistics for every column, including the non-numeric ones.
IMDB.describe(include="all")

In [None]:
# Bar chart of the number of reviews per sentiment label, plus the exact counts.
sns.countplot(data=IMDB, x="sentiment")
print(IMDB["sentiment"].value_counts())

The numbers of positive and negative reviews are equal (25,000 each), so the classes are balanced.

Let's see how the reviews are written. We can expect them to contain a lot of punctuation, hyperlinks, etc.

In [None]:
# Print two sample reviews (rows 3 and 1000) to inspect the raw text.
print(IMDB.loc[3, "review"], IMDB.loc[1000, "review"], sep="\n")

Let's clean the data.

# Data Preprocessing

**Basic cleaning**

In [None]:
# Patterns are written as raw strings so that regex syntax is never misread as a
# (possibly invalid) Python string escape, and compiled once for reuse.
HTML_TAG_RE = re.compile(r'<.*?>')
PUNCTUATION_RE = re.compile(r'[,.!?:()"]')
NON_LETTER_RE = re.compile(r'[^a-zA-Z]')


def clean_review(text):
    """Return ``text`` reduced to lower-case ASCII letters separated by spaces.

    Steps, in order:
      1. replace HTML-style markup such as ``<br />`` with a space;
      2. delete the punctuation marks ``, . ! ? : ( ) "``;
      3. replace every remaining character that is not a letter with a space;
      4. strip leading/trailing whitespace and convert to lower case.

    Parameters
    ----------
    text : str
        The raw review text.

    Returns
    -------
    str
        The cleaned review.
    """
    text = HTML_TAG_RE.sub(' ', text)
    text = PUNCTUATION_RE.sub('', text)
    text = NON_LETTER_RE.sub(' ', text)
    # Strip last: the substitution above can itself introduce spaces at either end.
    return text.strip().lower()


# One pass over the column, applying every cleaning step to each review.
IMDB['review'] = IMDB['review'].apply(clean_review)


**Tokenization**

In [None]:
# Split every cleaned review into its list of word tokens.
words = IMDB['review'].apply(text_to_word_sequence)

**Stop word removal**

In [None]:
# English stop words, held in a set for fast membership tests.
stop_words = set(stopwords.words('english'))


def drop_stop_words(tokens):
    """Return the tokens that are not English stop words, in their original order."""
    return [token for token in tokens if token not in stop_words]


filtered_words = words.apply(drop_stop_words)

# Re-assemble each filtered token list into a single space-separated string.
IMDB['review'] = filtered_words.apply(" ".join)

**Lemmatization**

In [None]:
from nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()

# WordNetLemmatizer.lemmatize() works on a single word. Given a whole review it
# treats the entire text as one token, so the individual words are not lemmatized.
# Lemmatize word by word and join the results back together with single spaces.
IMDB['review'] = IMDB['review'].apply(
    lambda review: " ".join(lemmatizer.lemmatize(word) for word in review.split())
)



In [None]:
# Show the review in row 3 again, now after preprocessing.
print(IMDB.loc[3, "review"])

Let's convert the sentiment values to numeric values.

In [None]:
# Encode the labels as integers: positive -> 1, negative -> 0.
# The mapping also sends 1 -> 1 and 0 -> 0, so re-running this cell leaves labels
# that are already encoded unchanged instead of turning them all into 0.
# A label outside the mapping becomes NaN, which makes astype() raise rather than
# silently counting the review as negative.
SENTIMENT_CODES = {'positive': 1, 'negative': 0, 1: 1, 0: 0}
IMDB['sentiment'] = IMDB['sentiment'].map(SENTIMENT_CODES).astype('int64')


# Training model

**Pipeline:**

Scikit-learn pipelines are a simple yet very useful tool for managing machine learning workflows.

***advantages:***

* clean code
* fewer bugs

In [None]:
from sklearn.pipeline import make_pipeline

# TF-IDF features feed a linear-kernel SVM. Wrapping both in one pipeline lets
# cross-validation fit the vectorizer together with the classifier on each fold.
# NOTE(review): SVC(kernel='linear') may be slow on this many reviews; LinearSVC
# could be a faster alternative — confirm it gives comparable accuracy before switching.
clf = make_pipeline(TfidfVectorizer(), SVC(kernel='linear'))

# 5-fold cross-validation, scored by accuracy.
scores = cross_validate(clf, IMDB["review"], IMDB["sentiment"], scoring=['accuracy'], cv=5)

# Report the per-fold and mean accuracy so the notebook ends with its result.
fold_accuracy = scores['test_accuracy']
print("Accuracy per fold:", fold_accuracy)
print("Mean accuracy: %.4f (std %.4f)" % (fold_accuracy.mean(), fold_accuracy.std()))