In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory tree and print the full path of every file found.
input_file_paths = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for file_path in input_file_paths:
    print(file_path)

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

Loading the training data set 

In [None]:
# Read the labeled training reviews from the zipped tab-separated file.
df = pd.read_csv(
    '/kaggle/input/word2vec-nlp-tutorial/labeledTrainData.tsv.zip',
    sep="\t",
)

In [None]:
# The 'id' column is not used as a model input, so remove it.
# errors='ignore' keeps this cell re-runnable: on a second run 'id' is already
# gone and a plain drop would raise KeyError.
df = df.drop(columns=['id'], errors='ignore')
df.head()

In [None]:
# Print column dtypes and non-null counts for the training frame.
df.info()

In [None]:
# Number of reviews per sentiment label (class-balance check).
df['sentiment'].value_counts()

Loading the test data set 

In [None]:
# Read the test reviews from the zipped tab-separated file and preview the first rows.
df1 = pd.read_csv(
    "/kaggle/input/word2vec-nlp-tutorial/testData.tsv.zip",
    sep="\t",
)
df1.head()

In [None]:
# Length of every training review, measured in characters.
train_reviews = df['review']
train_len = train_reviews.map(len)
train_len.describe()


In [None]:
# Length of every test review, measured in characters.
# Must be computed from df1 (the test frame); using df here would simply
# repeat the training-set statistics.
test_len = df1['review'].apply(len)
test_len.describe()

* We can see that the train and test data have similar statistical features
* The mean review length is about 1327 characters, with a standard deviation of about 1005 characters
* The character count seems to show a similar distribution to the word count

Visualizing the train and test data set 

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Side-by-side distributions of review length (in characters).
# sns.distplot is deprecated; histplot with kde=True and stat="density" draws
# the equivalent histogram + KDE overlay. Explicit axes make it clear which
# panel each series is drawn on.
fig, (ax_train, ax_test) = plt.subplots(1, 2, figsize=(14, 8))

sns.histplot(train_len, kde=True, stat="density", color='red', ax=ax_train)
ax_train.set_title("Train review length (characters)")

sns.histplot(test_len, kde=True, stat="density", color='blue', ax=ax_test)
ax_test.set_title("Test review length (characters)")

plt.show()

Splitting the reviews into words 

In [None]:
# Word count per review, obtained by splitting the raw text on single spaces.
def _count_space_separated(text):
    return len(text.split(' '))

df['word_n'] = df['review'].apply(_count_space_separated)
df1["word_n"] = df1["review"].apply(_count_space_separated)

In [None]:
# Side-by-side distributions of words per review for train (left) and test (right).
# sns.distplot is deprecated; histplot with kde=True and stat="density" draws
# the equivalent histogram + KDE overlay.
fig, (ax_train, ax_test) = plt.subplots(1, 2, figsize=(14, 6))

sns.histplot(df['word_n'], kde=True, stat="density", color='red', ax=ax_train)
ax_train.set_title("Train words per review")

sns.histplot(df1['word_n'], kde=True, stat="density", color='blue', ax=ax_test)
ax_test.set_title("Test words per review")

plt.show()

In [None]:
# Bar chart of the number of reviews per sentiment label.
# The column is passed by keyword: a positional Series is interpreted as
# `data` (not `x`) by current seaborn versions and does not produce the
# intended per-class counts.
sns.countplot(x='sentiment', data=df)

Creating a word cloud to see which words appear most often

In [None]:
from wordcloud import WordCloud

# WordCloud.generate takes a single string, so merge all reviews into one
# text with a space as the separator between them.
all_reviews_text = " ".join(df['review'])
cloud = WordCloud(width=800, height=600).generate(all_reviews_text)

plt.figure(figsize=(16, 10))
plt.imshow(cloud)
plt.axis('off')

Removing unwanted HTML tags such as **br**, which appears most often

In [None]:
import re
import json


Using the regular expression (`re`) library, we can easily remove the **html** tags from the reviews

In [None]:
# Matches one HTML tag: '<', then one or more characters that are not '>', then '>'.
HTML_TAG_PATTERN = r'<[^>]+>'
TAG_RE = re.compile(HTML_TAG_PATTERN)

In [None]:
# Strip HTML tags from both data sets.
# Each tag is replaced with a space rather than the empty string so that the
# words on either side of a tag are not fused together
# (e.g. "good<br />movie" -> "good movie", not "goodmovie").
df['review'] = df['review'].apply(lambda x: TAG_RE.sub(' ', x))
df1['review'] = df1['review'].apply(lambda x: TAG_RE.sub(' ', x))

In [None]:
from wordcloud import WordCloud

# Rebuild the word cloud from the current contents of df['review'].
# WordCloud.generate takes a single string, so all reviews are merged into
# one text with a space as the separator.
merged_reviews = " ".join(df['review'])
cloud = WordCloud(width=800, height=600).generate(merged_reviews)

plt.figure(figsize=(16, 10))
plt.imshow(cloud)
plt.axis('off')

From the word cloud, we can see that all the **html** tags have been removed from the reviews 

Keeping only alphabetic characters in the reviews 

In [None]:
# Replace every character that is not an ASCII letter with a space.
def _letters_only(text):
    return re.sub("[^a-zA-Z]", " ", text)

df['review'] = df['review'].apply(_letters_only)
df1['review'] = df1['review'].apply(_letters_only)

In [None]:
# Show four randomly chosen test rows (no seed is set, so the rows differ between runs).
df1.sample(4)

In [None]:
# Position of the first "?" in each review (-1 when absent), tallied by value.
first_question_mark = df["review"].str.find("?")
first_question_mark.value_counts()

In [None]:
# Word count per review after cleaning.
# split() with no argument splits on any run of whitespace. split(' ') would
# count an empty string for every extra space left behind where punctuation
# and digits were replaced, inflating the word count.
df['word_n_2'] = df['review'].apply(lambda x: len(x.split()))
df1['word_n_2'] = df1['review'].apply(lambda x: len(x.split()))

# Distribution of cleaned word counts for each sentiment class, drawn on the
# axes created here.
fig, axe = plt.subplots(1, 1, figsize=(7, 5))
sns.boxenplot(x='sentiment', y='word_n_2', data=df, ax=axe)

In [None]:
from nltk.corpus import stopwords

# English stop words from NLTK, stored as a set for fast membership tests.
# Lemmatization with WordNetLemmatizer is currently disabled.
english_stop_word_list = stopwords.words("english")
stop_words = set(english_stop_word_list)

In [None]:
# Remove stop words from the training reviews.
# The comparison uses word.lower() because the NLTK stop-word list is all
# lowercase while the reviews still contain capitalised words ("The", "I",
# "This"), which would otherwise slip through the filter.
df["review"] = df['review'].apply(
    lambda x: ' '.join(word for word in x.split() if word.lower() not in stop_words)
)

In [None]:
# Remove stop words from the test reviews.
# The comparison uses word.lower() because the NLTK stop-word list is all
# lowercase while the reviews still contain capitalised words ("The", "I",
# "This"), which would otherwise slip through the filter.
df1["review"] = df1['review'].apply(
    lambda x: ' '.join(word for word in x.split() if word.lower() not in stop_words)
)

In [None]:
# Test frame without the helper word-count columns and the id column.
test_columns_to_drop = ["word_n", "word_n_2", "id"]
test = df1.drop(columns=test_columns_to_drop)

In [None]:
# Feature frame: the training data without the word-count helpers and the label.
feature_columns_to_drop = ["word_n", "word_n_2", "sentiment"]
X = df.drop(columns=feature_columns_to_drop)

In [None]:
# Preview the first three rows of the feature frame.
X.head(3)

In [None]:
# Label frame: the training data without the word-count helpers and the review text.
# Y stays a DataFrame; model cells flatten it with .values.ravel() where needed.
label_columns_to_drop = ["word_n", "word_n_2", "review"]
Y = df.drop(columns=label_columns_to_drop)

In [None]:
# Preview the first three rows of the label frame.
Y.head(3)

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
from nltk.tokenize import RegexpTokenizer

# Bag-of-words counts over unigrams. Tokens are runs of ASCII letters/digits,
# so symbols never become features; text is lowercased and scikit-learn's
# built-in English stop-word list is applied.
token = RegexpTokenizer(r'[a-zA-Z0-9]+')
cv = CountVectorizer(
    tokenizer=token.tokenize,
    lowercase=True,
    stop_words='english',
    ngram_range=(1, 1),
)
text_counts = cv.fit_transform(df["review"])

In [None]:
# Display the repr of the count matrix produced by CountVectorizer.
text_counts

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF features for the training reviews, using the vectorizer's defaults.
# NOTE(review): the vectorizer is fitted on every row of df['review'], and the
# hold-out split is made from text_tf afterwards, so IDF weights also reflect
# the held-out rows. Consider splitting first and fitting on the train part only.
training_reviews = df['review']
tf = TfidfVectorizer()
text_tf = tf.fit_transform(training_reviews)

In [None]:
# Display the repr of the TF-IDF matrix.
text_tf

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable.
split_parts = train_test_split(text_tf, Y, random_state=42, test_size=0.2)
X_train, X_test, y_train, y_test = split_parts

In [None]:
# Show the shapes of the four split parts as a sanity check.
X_train.shape, X_test.shape, y_train.shape, y_test.shape

In [None]:
from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics

# Fit a Multinomial Naive Bayes classifier on the training features and
# report its accuracy (in percent) on the held-out rows.
clf = MultinomialNB()
clf.fit(X_train, y_train.values.ravel())  # ravel: flatten the one-column label frame to 1-D
predicted = clf.predict(X_test)
nb_accuracy = metrics.accuracy_score(y_test, predicted)
print("MultinomialNB Accuracy:", nb_accuracy*100)



In [None]:
from sklearn.metrics import confusion_matrix

# Confusion matrix for the `predicted` labels: rows are true labels, columns are predictions.
cm = confusion_matrix(y_true=y_test, y_pred=predicted)
cm

In [None]:
from sklearn.preprocessing import StandardScaler

# Scale each feature to unit variance. with_mean=False skips centering, which
# StandardScaler requires when the input is a sparse matrix.
scaler = StandardScaler(with_mean=False)

# Fit on the training rows only so no statistics from the held-out rows leak in.
scaler.fit(X_train)

# Apply the same fitted transformation to both parts.
# NOTE: X_train / X_test are overwritten, so re-running this cell without
# re-running the split would scale already-scaled data.
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

In [None]:
from sklearn.tree import DecisionTreeClassifier

# Fit a decision tree and report its accuracy (in percent) on the held-out rows.
# Labels are flattened with .values.ravel() to a 1-D array, matching how the
# other models in this notebook are fitted; random_state pins the tree's
# internal randomness so the score is repeatable.
dt = DecisionTreeClassifier(random_state=0).fit(X_train, y_train.values.ravel())
predict = dt.predict(X_test)
print("Decision Tree Accuracy:", metrics.accuracy_score(y_test, predict)*100)

In [None]:
from sklearn.ensemble import GradientBoostingClassifier

# Fit a 200-stage gradient boosting classifier with a fixed seed and report
# its accuracy (in percent) on the held-out rows.
bg = GradientBoostingClassifier(n_estimators=200, random_state=0)
train_labels = y_train.values.ravel()  # flatten the one-column label frame to 1-D
bg.fit(X_train, train_labels)
y_pred = bg.predict(X_test)
boosting_accuracy = metrics.accuracy_score(y_test, y_pred)
print("Boosting Accuracy:", boosting_accuracy*100)

In [None]:
# Confusion matrix for the `y_pred` labels: rows are true labels, columns are predictions.
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
cm