In [None]:
import pandas as pd
import numpy as np
import re
from bs4 import BeautifulSoup

from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD as TSVD
from sklearn.svm import SVC, LinearSVC
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import cross_val_score, cross_val_predict

import spacy
from spacy.lang.en.stop_words import STOP_WORDS as stopwords

In [None]:
data = pd.read_csv('../input/imdb-dataset-of-50k-movie-reviews/IMDB Dataset.csv')

In [None]:
data.head()

In [None]:
data.shape

In [None]:
# Take explicit copies: the cells below assign to train['review'], and doing
# that on a bare slice of `data` is a chained assignment (SettingWithCopyWarning).
train = data.iloc[:25000].copy()
test = data.iloc[25000:].copy()

In [None]:
train.shape, test.shape

In [None]:
train.head()

In [None]:
nlp = spacy.load('en_core_web_lg')

In [None]:
# Normalise case: cast every review to str, then lowercase it in one vectorised step
train['review'] = train['review'].astype(str).str.lower()

In [None]:
data.head()

### Contractions Expansion

In [None]:
!pip install contractions

In [None]:
import contractions

In [None]:
contractions_dict = contractions.contractions_dict
contractions_dict

In [None]:
def contraction_expansion(x, mapping=None):
    """Expand contractions found in ``x``.

    Matching is done in a single pass, longest key first, and only on whole
    tokens (a key must not be preceded or followed by a word character).
    This avoids two problems of chained ``str.replace`` calls: a short key
    such as "can't" rewriting part of a longer one ("can't've"), and keys
    being replaced in the middle of an unrelated token.

    Parameters
    ----------
    x : object
        Text to process. Non-string values are returned unchanged.
    mapping : dict, optional
        Contraction -> expansion table. Defaults to the module-level
        ``contractions_dict``.

    Returns
    -------
    object
        The expanded string, or ``x`` itself when it is not a string.
    """
    if not isinstance(x, str):
        return x

    if mapping is None:
        mapping = contractions_dict

    # Longest keys first so the alternation prefers the most specific match;
    # empty keys are skipped because they would match at every position.
    keys = sorted((key for key in mapping if key), key=len, reverse=True)
    if not keys:
        return x

    pattern = r"(?<!\w)(?:" + "|".join(re.escape(key) for key in keys) + r")(?!\w)"

    return re.sub(pattern, lambda match: mapping[match.group(0)], x)

In [None]:
# Expand contractions in every review (function passed directly, no lambda wrapper)
train['review'] = train['review'].map(contraction_expansion)

In [None]:
train.head()

### Removing Emails

In [None]:
# Compiled once. Word boundaries (instead of ^...$) let the address be found
# anywhere inside a longer text, not only when it is the entire string.
_EMAIL_PATTERN = re.compile(r"\b[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+\b")


def remove_emails(x):
    """Return ``x`` with every e-mail address removed.

    Parameters
    ----------
    x : str
        Text that may contain e-mail addresses.

    Returns
    -------
    str
        ``x`` with each matched address replaced by the empty string;
        surrounding whitespace and punctuation are left as they were.
    """
    return _EMAIL_PATTERN.sub('', x)

In [None]:
# Strip e-mail addresses from every review
train['review'] = train['review'].map(remove_emails)

In [None]:
train.sample(5)

### Removing HTML Tags

In [None]:
# Strip HTML markup. The space separator keeps the words on either side of a
# tag (e.g. "great<br />movie") from being glued together by get_text().
train['review'] = train['review'].apply(
    lambda x: BeautifulSoup(x, 'lxml').get_text(separator=' ').strip()
)

In [None]:
# Purely positional lookup (row 6005, first column); avoids integer-key
# indexing on the label-indexed row Series.
train.iloc[6005, 0]

In [None]:
train.sample(5)

### Removing Special Characters

In [None]:
def RemoveSpecialChars(x):
    """Replace special characters with spaces and normalise whitespace.

    Every run of characters that is neither a word character (``\\w``) nor a
    space is replaced by a single space, so tokens separated only by
    punctuation or a newline (``"loved.it"``, ``"line1\\nline2"``) stay
    separate words instead of being fused. Whitespace is then collapsed and
    trimmed.

    Parameters
    ----------
    x : str
        Text to clean.

    Returns
    -------
    str
        Cleaned text with single spaces between tokens.
    """
    # Substitute a space rather than '' so neighbouring words are not merged.
    x = re.sub(r'[^\w ]+', ' ', x)
    return ' '.join(x.split())

In [None]:
# Clean special characters out of every review
train['review'] = train['review'].map(RemoveSpecialChars)

In [None]:
train.sample(5)

In [None]:
# Purely positional lookup (row 6005, first column); avoids integer-key
# indexing on the label-indexed row Series.
train.iloc[6005, 0]

### Lemmatization

In [None]:
def lemme(x):
    """Return ``x`` with each token replaced by its lemma.

    ``x`` is cast to ``str`` and run through the module-level ``nlp``
    pipeline. Tokens whose lemma is ``'-PRON-'`` or ``'be'`` keep their
    original text; all resulting pieces are joined with single spaces.
    """
    keep_surface_form = ('-PRON-', 'be')
    tokens = nlp(str(x))

    return ' '.join(
        tok.text if tok.lemma_ in keep_surface_form else tok.lemma_
        for tok in tokens
    )

In [None]:
%%time
train['review'] = train['review'].map(lemme)

In [None]:
train.sample(5)

### Tokenization using Text Blob

### Removing Stop Words

In [None]:
stopwords

In [None]:
len(stopwords)

In [None]:
def RemoveStopWords(x):
    """Return ``x`` without the whitespace-separated words found in ``stopwords``.

    The remaining words keep their order and are joined with single spaces.
    """
    kept = []
    for token in x.split():
        if token in stopwords:
            continue
        kept.append(token)

    return ' '.join(kept)

In [None]:
# Purely positional lookup (row 6005, first column); avoids integer-key
# indexing on the label-indexed row Series.
x = train.iloc[6005, 0]

In [None]:
# EXAMPLE CODE

print(x)
print()
print("length of x: ",len(x))

In [None]:
x1 = RemoveStopWords(x)
x1

In [None]:
len(x1)

In [None]:
%%time

train['review'] = train['review'].map(RemoveStopWords)

In [None]:
train.sample(5)

### Removing Rare Words

In [None]:
text = ' '.join(train['review'])

In [None]:
#text

In [None]:
len(text)

In [None]:
# Creating Frequency

text_series = pd.Series(text.split())

In [None]:
freq_comm = text_series.value_counts()

In [None]:
freq_comm

In [None]:
# Number of least-frequent vocabulary entries treated as "rare".
N_RARE_WORDS = 82000

# value_counts() orders by descending count, so the tail holds the rarest words.
# tail() keeps the very last entry too, which a `[-82000:-1]` slice would drop.
rare_words = freq_comm.tail(N_RARE_WORDS)
'rockumentarie' in rare_words

In [None]:
rare_words

In [None]:
# Drop every word that is listed in the index of `rare_words`
def _drop_rare(review):
    return ' '.join(word for word in review.split() if word not in rare_words)

train['review'] = train['review'].map(_drop_rare)

In [None]:
train['review'].sample(5)

### Converting the Data into Vector

In [None]:
train['sentiment'].value_counts()

In [None]:
X = train['review']
y = train['sentiment']

In [None]:
tfidf = TfidfVectorizer()

In [None]:
X = tfidf.fit_transform(X)

In [None]:
X.shape

In [None]:
X

### Splitting Data into Training and Testing sets

In [None]:
# Split the cleaned text first and fit TF-IDF on the training portion only.
# Fitting the vectorizer on all rows before splitting lets the held-out rows
# influence the vocabulary and IDF weights (data leakage), which inflates the
# validation scores. Same test_size / random_state / stratify as before, so the
# row partition is unchanged.
reviews_train, reviews_test, y_train, y_test = train_test_split(
    train['review'], y, test_size = 0.2, random_state = 4, stratify = y
)

tfidf = TfidfVectorizer()
X_train = tfidf.fit_transform(reviews_train)
X_test = tfidf.transform(reviews_test)

In [None]:
X_train.shape, X_test.shape

### Dimensionality reduction using Truncated Singular Value Decomposition

In [None]:
%%time

#tsvd = TSVD(n_components=10000, random_state=4)
#X_train_tsvd = tsvd.fit_transform(X_train)

In [None]:
#sum(tsvd.explained_variance_)

### Using SVC for Classification

In [None]:
#clf_svc = SVC()

In [None]:
%%time

#scores = cross_val_score(clf_svc, X_train, y_train, cv=6, n_jobs=-1)

In [None]:
#scores

### Using Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression

In [None]:
clf_lr = LogisticRegression()

In [None]:
X_train

In [None]:
%%time

scores = cross_val_score(clf_lr, X_train, y_train, cv=10, n_jobs=4)

In [None]:
scores

In [None]:
scores.mean()

In [None]:
clf_lr.fit(X_train, y_train)

In [None]:
y_test_pred = clf_lr.predict(X_test)

In [None]:
print(classification_report(y_test, y_test_pred))

In [None]:
confusion_matrix(y_test, y_test_pred)

In [None]:
clf_lr.predict(tfidf.transform(['American Psycho deserved an Oscar, they were robbed']))

In [None]:
def preprocess_review(text):
    """Apply to one raw review the same cleaning steps the training reviews received.

    Rare-word removal is not repeated here: words that were dropped before the
    vectorizer was fitted are not in its vocabulary, so transform() ignores them.
    """
    text = str(text).lower()
    text = contraction_expansion(text)
    text = remove_emails(text)
    # Keep this step identical to the HTML-stripping cell used for `train`.
    text = BeautifulSoup(text, 'lxml').get_text().strip()
    text = RemoveSpecialChars(text)
    text = lemme(text)
    return RemoveStopWords(text)


# The model was trained on cleaned, lemmatized text, so the held-out reviews
# must go through the same pipeline before vectorizing. Lemmatizing 25k reviews
# takes about as long as it did for `train`.
test_reviews_clean = test['review'].apply(preprocess_review)
y_real_pred = clf_lr.predict(tfidf.transform(test_reviews_clean))

In [None]:
print(classification_report(test['sentiment'], y_real_pred))

In [None]:
clf_lr.predict(tfidf.transform(["What hell was that, it's a masterpiece"]))