In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import nltk
import re
import string
import warnings
warnings.filterwarnings('ignore')
import matplotlib.pyplot as plt
nltk.download('wordnet')

### Extracting the Data

In [None]:
df= pd.read_csv('../input/imdb-dataset-of-50k-movie-reviews/IMDB Dataset.csv')

In [None]:
df.head()

In [None]:
df.info()

In [None]:
df.shape

### Looking at the distribution of Sentiment

In [None]:
df.sentiment.value_counts().plot(kind='pie', autopct='%1.0f%%', colors=['red','green'])

### Pre Processing

#### Total Word Counts in each sentiment

In [None]:
df['Total_words']=[len(x.split())for  x in df['review']]

In [None]:
df

#### Using lambda function

In [None]:
df['word_counts']= df['review'].apply(lambda x: len(x.split(' ') ))

In [None]:
df.head()

#### Character count including spaces

In [None]:
df['character_count'] = [len(x) for x in df['review']]

In [None]:
df.head()

#### character count excluding spaces

In [None]:
df['character_count2'] = [len(''.join(x.split())) for x in df['review']]

In [None]:
#Using Lambda function

In [None]:
df['character_count3']= df['review'].apply(lambda x: len(''.join(x.split()) ))

In [None]:
df.head()

#### Conting the Upper case words

In [None]:
df['Upper_Case'] = df['review'].apply( lambda x : len([x for x in x.split() if x.isupper()]))

In [None]:
df.head()

#### Conting the Upper case words

In [None]:
df['Lower_Case'] = df['review'].apply( lambda x : len([x for x in x.split() if x.islower()]))

In [None]:
df.head()

#### Number of numerics:

In [None]:
df['Digit_count'] = df['review'].apply( lambda x : len([x for x in x.split() if x.isdigit()]))

In [None]:
df.head()

#### Average word count

In [None]:
df['average_word'] = df['character_count2']/df['word_counts']

In [None]:
df.head()

#### Number of stop Words

In [None]:
from nltk.corpus import stopwords
stop=stopwords.words('english')
df['stopwords'] = df['review'].apply(lambda x: len([x for x in x.split() if x in stop]))

In [None]:
df.head()

#### Basic Pre-Processing

#### Lower Case conversion

In [None]:
df['review'] = df['review'].apply(lambda x: " ".join(x.lower() for x in x.split()))

In [None]:
df.head()

#### Define function for removing special characters

In [None]:
def remove_special_characters(text, remove_digits=True):
    pattern=r'[^a-zA-Z0-9]'
    text=re.sub(pattern,' ',text)
    return text
#Apply function on review column
df['review']=df['review'].apply(remove_special_characters)

#### Removal of StopWords

In [None]:
df['review'] = df['review'].apply(lambda x: ' '.join(x for x in x.split() if x not in stop))

#### Stemming the text

In [None]:
from nltk.stem import WordNetLemmatizer

lmtzr = WordNetLemmatizer()

df['review'] = df['review'].apply(lambda x: " ".join(lmtzr.lemmatize(x) for x in x.split()))

df['review'].head()

In [None]:
df.head()

#### Bag of Words

In [None]:
all_Words=[x for x in pd.Series(' '.join(df['review']).split())] 

In [None]:
nltk.FreqDist(all_Words).most_common(10)

In [None]:
freq = pd.Series(' '.join(df['review']).split()).value_counts()[:50]
freq

In [None]:
freq =['br','movie','film','one','get','would','make','see','much','first','way','could','go','know','two','like','even','say','ever','little','go','way','know','also','seem']

In [None]:
df['review'] = df['review'].apply(lambda x: " ".join(x for x in x.split() if x not in freq))
df['review'].head()

### Word CLoud

In [None]:
from wordcloud import WordCloud,STOPWORDS
new_df=df[df['sentiment']=='negative']
words = ' '.join(new_df['review'])

wordcloud = WordCloud(stopwords=stop,
                      background_color='black',
                      width=3000,
                      height=2500
                     ).generate(words)
plt.figure(1,figsize=(12, 12))
plt.imshow(wordcloud)
plt.axis('off')
plt.show()

In [None]:
from wordcloud import WordCloud,STOPWORDS
new_df=df[df['sentiment']=='positive']
words = ' '.join(new_df['review'])

wordcloud = WordCloud(stopwords=stop,
                      background_color='black',
                      width=3000,
                      height=2500
                     ).generate(words)
plt.figure(1,figsize=(12, 12))
plt.imshow(wordcloud)
plt.axis('off')
plt.show()

In [None]:
processed_features = df.iloc[:,0].values
labels = df.iloc[:, 1].values

### TF-IDF

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer (max_features=2500, min_df = 100, max_df = 1000)#(min_df = 0.2,max_df = 0.8)
processed_features = vectorizer.fit_transform(processed_features).toarray() 

In [None]:
processed_features

In [None]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(processed_features, labels, test_size=0.2, random_state=0)

### Decision Tree Classifier

In [None]:
from sklearn.tree import DecisionTreeClassifier
DT_model = DecisionTreeClassifier()
DT_model.fit(X_train, y_train)

In [None]:
## Performance Matrix on train data set
from sklearn import metrics
y_train_predict = DT_model.predict(X_train)
model_score = DT_model.score(X_train, y_train)
print(model_score)
print(metrics.confusion_matrix(y_train, y_train_predict))
print(metrics.classification_report(y_train, y_train_predict))

In [None]:
## Performance Matrix on test data set
y_test_predict = DT_model.predict(X_test)
model_score = DT_model.score(X_test, y_test)
print(model_score)
print(metrics.confusion_matrix(y_test, y_test_predict))
print(metrics.classification_report(y_test, y_test_predict))

### Linear DIscriminant Analysis

In [None]:
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
LDA_model= LinearDiscriminantAnalysis()
LDA_model.fit(X_train, y_train)

In [None]:
## Performance Matrix on train data set
y_train_predict = LDA_model.predict(X_train)
model_score = LDA_model.score(X_train, y_train)
print(model_score)
print(metrics.confusion_matrix(y_train, y_train_predict))
print(metrics.classification_report(y_train, y_train_predict))

In [None]:
## Performance Matrix on test data set
y_test_predict = LDA_model.predict(X_test)
model_score = LDA_model.score(X_test, y_test)
print(model_score)
print(metrics.confusion_matrix(y_test, y_test_predict))
print(metrics.classification_report(y_test, y_test_predict))