In [None]:
import numpy as np 
import pandas as pd 
import os
# Print the full path of every file found under the Kaggle input directory.
for root, _, names in os.walk('/kaggle/input'):
    for name in names:
        full_path = os.path.join(root, name)
        print(full_path)

In [None]:
# Load the IMDB movie-review CSV into a DataFrame; later cells use its
# 'review' and 'sentiment' columns.
df=pd.read_csv('/kaggle/input/imdb-dataset-of-50k-movie-reviews/IMDB Dataset.csv')

# **1. Data Preprocessing**

In [None]:
# Encode the sentiment labels as integers: positive -> 1, negative -> 0.
# The result is assigned back to the column rather than calling
# replace(..., inplace=True) on the column selection: with pandas Copy-on-Write
# that call modifies a temporary object and leaves df unchanged (pandas 2.x
# already emits a FutureWarning for it).
# The identity entries (1 -> 1, 0 -> 0) keep the cell safe to re-run once the
# column has already been encoded.
df['sentiment']=df['sentiment'].map({'positive':1,'negative':0,1:1,0:0})

In [None]:
# Preview the first rows of the DataFrame.
df.head()

> # Removing HTML Tags

In [None]:
import re
def clean_html(text):
    """Strip HTML-style tags from ``text``.

    Every non-greedy ``<...>`` match is deleted (replaced with the empty
    string); all other characters are returned unchanged.
    """
    tag_pattern = re.compile('<.*?>')
    return tag_pattern.sub('', text)
# Overwrite the review column with the tag-stripped text of each review.
df['review']=df['review'].apply(clean_html)

> # Converting everything to lowercase

In [None]:
def convert_lower(text):
    """Return ``text`` converted to lowercase (the result of ``text.lower()``)."""
    return text.lower()
# Overwrite the review column with the lowercased text of each review.
df['review']=df['review'].apply(convert_lower)

> # Function to remove special characters

In [None]:
def remove_special(text):
    """Replace every non-alphanumeric character in ``text`` with a space.

    Characters for which ``str.isalnum()`` is true are kept as-is; every other
    character (punctuation, whitespace, symbols) becomes a single space, so the
    returned string has the same length as the input.

    Args:
        text: The string to clean.

    Returns:
        The cleaned string.
    """
    # A single join avoids rebuilding the string once per character.
    return ''.join(ch if ch.isalnum() else ' ' for ch in text)
# Overwrite the review column with the special-character-free text.
df['review']=df['review'].apply(remove_special)

> # Remove the stop words

In [None]:
import nltk
from nltk.corpus import stopwords
# Lazily-built cache of the NLTK English stop-word list (filled on first use).
_ENGLISH_STOPWORDS = None

def remove_stopwords(text, stop_words=None):
    """Split ``text`` on whitespace and drop the stop words.

    Args:
        text: The string to tokenize.
        stop_words: Optional collection of words to remove. When omitted, the
            NLTK English stop-word list (``stopwords.words('english')``) is
            used.

    Returns:
        A list of the remaining tokens, in their original order.
    """
    global _ENGLISH_STOPWORDS
    if stop_words is None:
        # Load the corpus list once and keep it as a frozenset. Calling
        # stopwords.words('english') for every token, as before, re-read the
        # list and scanned it linearly on each membership test.
        if _ENGLISH_STOPWORDS is None:
            _ENGLISH_STOPWORDS = frozenset(stopwords.words('english'))
        stop_words = _ENGLISH_STOPWORDS
    return [token for token in text.split() if token not in stop_words]
# Replace each review with its stop-word-free tokens; after this step the
# review column holds lists of words rather than strings.
df['review']=df['review'].apply(remove_stopwords)

> # Perform **Stemming**

In [None]:
from nltk.stem.porter import PorterStemmer
# Single PorterStemmer instance reused by stem_words for every token.
ps=PorterStemmer()
def stem_words(text, stemmer=None):
    """Stem every token in ``text``.

    Args:
        text: An iterable of word tokens (the review column holds token lists
            at this stage of the pipeline).
        stemmer: Optional object exposing ``stem(word)``. When omitted, the
            notebook-level PorterStemmer instance ``ps`` is used.

    Returns:
        A new list with the stemmed form of each token, in the same order.
    """
    active_stemmer = ps if stemmer is None else stemmer
    # Build the result in a local list. The previous version appended to a
    # module-level list named ``y``, which a later cell rebinds to the label
    # array, so any call made after that cell ran would fail.
    return [active_stemmer.stem(token) for token in text]
# Replace each review's token list with the list of stemmed tokens.
df['review']=df['review'].apply(stem_words)

In [None]:
def join_back(list_input):
    """Join the strings in ``list_input`` into one string separated by single spaces."""
    return " ".join(list_input)
# Turn each review's token list back into a single space-separated string.
df['review']=df['review'].apply(join_back)

In [None]:
# Display the preprocessed DataFrame.
df

# **2. Count Vectorization**

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
# Bag-of-words vectorizer with the vocabulary capped at 1000 features.
cv=CountVectorizer(max_features=1000)
# Learn the vocabulary from the reviews, build the count matrix, and convert
# it to a dense array with .toarray().
X=cv.fit_transform(df['review']).toarray()

In [None]:
# Dimensions of the feature matrix.
X.shape

In [None]:
# Extract the labels: the last column of df (the sentiment column) as an array.

y=df.iloc[:,-1].values

# **3. Building and Applying the Model**

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the samples as the test set. random_state is fixed so the
# split, and therefore every score reported below, is the same on each run;
# without it the partition is reshuffled differently every time.
X_train, X_test, y_train, y_test=train_test_split(X,y,test_size=0.2,random_state=42)

In [None]:
# Dimensions of the training feature matrix.
X_train.shape

In [None]:
# Dimensions of the test feature matrix.
X_test.shape

In [None]:
# Dimensions of the test label array.
y_test.shape

In [None]:
# Dimensions of the training label array.
y_train.shape

In [None]:
from sklearn.naive_bayes import GaussianNB, MultinomialNB, BernoulliNB


In [None]:
# Instantiate the three Naive Bayes variants to compare, each with default settings.
clf1=GaussianNB()
clf2=MultinomialNB()
clf3=BernoulliNB()

In [None]:
# Train every classifier on the same training split.
clf1.fit(X_train,y_train)
clf2.fit(X_train,y_train)
clf3.fit(X_train,y_train)


In [None]:
# Predict labels for the held-out test set with each trained classifier.
y_pred1=clf1.predict(X_test)
y_pred2=clf2.predict(X_test)
y_pred3=clf3.predict(X_test)

# **4. Model Evaluation and Confusion Matrix**

In [None]:
from sklearn.metrics import f1_score
from sklearn.metrics import confusion_matrix

In [None]:
# Report the F1 score of each classifier's predictions on the test labels.
for label, predictions in (
    ("Gaussian", y_pred1),
    ("Multinomial", y_pred2),
    ("Bernoulli", y_pred3),
):
    print(label, f1_score(y_test, predictions))

In [None]:
# Print the confusion matrix of each classifier's predictions on the test labels.
for label, predictions in (
    ("Gaussian", y_pred1),
    ("Multinomial", y_pred2),
    ("Bernoulli", y_pred3),
):
    print(label, confusion_matrix(y_test, predictions))

In [None]:
from wordcloud import WordCloud
import matplotlib.pyplot as plt
# Word cloud over the whole preprocessed corpus.
wc=WordCloud(background_color="white", contour_color="steelblue")
# Join every review with a space. The previous loop started at index 1 (so it
# skipped the first review), hard-coded the row count, and concatenated reviews
# with no separator, fusing the last word of one review with the first word of
# the next.
s=" ".join(df['review'])
wc.generate(s)
plt.imshow(wc,interpolation="bilinear")
plt.axis("off")
plt.show()