In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the input directory tree and print the full path of every file found.
for root, _, files in os.walk('/kaggle/input'):
    for fname in files:
        print(os.path.join(root, fname))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Read the datasets: both files are zipped, tab-separated tables.
train, test = (
    pd.read_csv(path, sep='\t')
    for path in (
        '/kaggle/input/sentiment-analysis-on-movie-reviews/train.tsv.zip',
        '/kaggle/input/sentiment-analysis-on-movie-reviews/test.tsv.zip',
    )
)

In [None]:
# Explore the data: preview the first five rows of the training table.
train.head(n=5)

In [None]:
# List the distinct values found in the Sentiment column.
train['Sentiment'].unique()

In [None]:
# Preview the first five rows of the test table.
test.head(n=5)

In [None]:
# Build TF-IDF features from the training phrases.
from sklearn.feature_extraction.text import TfidfVectorizer

# Vectorizer settings:
#   analyzer      - whether features are built from words or from characters
#   token_pattern - regular expression that defines what counts as a token
#   ngram_range   - an n-gram is a sequence of n words; (1, 3) uses unigrams through trigrams
#   stop_words    - drops common words of the given language ('english' is the built-in list)
#   strip_accents - removes accents / normalizes characters during preprocessing
#   min_df        - terms that appear in fewer than min_df documents are ignored
#   max_features  - upper bound on the number of vocabulary terms kept
tfv = TfidfVectorizer(
    analyzer='word',
    token_pattern=r'\w{1,}',
    ngram_range=(1, 3),
    stop_words='english',
    strip_accents='unicode',
    min_df=3,
    max_features=49748,
)

# Replace missing phrases with empty strings so the vectorizer only receives text.
train['Phrase'] = train['Phrase'].fillna('')

# Learn the vocabulary and IDF weights from the training phrases and encode them.
tfv_matrix = tfv.fit_transform(train['Phrase'])
tfv_matrix.shape

In [None]:
# Display the TF-IDF matrix produced from the training phrases.
tfv_matrix

In [None]:
# Split the data: hold out 25% of the TF-IDF rows (and their labels) for evaluation.
from sklearn.model_selection import train_test_split

X_train, X_test, Y_train, Y_test = train_test_split(
    tfv_matrix,
    train['Sentiment'],
    test_size=0.25,
    random_state=5,  # fixed seed keeps the split reproducible
)

In [None]:
# Display the training portion of the split TF-IDF matrix.
X_train

In [None]:
# Look at the first five rows of the test table before modelling.
test.head(n=5)

In [None]:
# Use a Naive Bayes classifier for prediction
from sklearn.naive_bayes import MultinomialNB

In [None]:
# Create the multinomial Naive Bayes model, then train it on the training split.
MNB = MultinomialNB()
MNB.fit(X=X_train, y=Y_train)

In [None]:
# Evaluate the fitted classifier on the held-out split.
from sklearn import metrics

predicted = MNB.predict(X_test)
# metrics.accuracy_score takes (y_true, y_pred): pass the ground-truth labels first.
# The result is stored under the name `accuracy_score` because the next cell prints it.
accuracy_score = metrics.accuracy_score(Y_test, predicted)

In [None]:
# Report the hold-out accuracy as a percentage with two decimal places.
print('{:04.2f}'.format(accuracy_score * 100) + '%')

In [None]:
# Replace missing test phrases with empty strings so the vectorizer only receives text.
test['Phrase'] = test['Phrase'].fillna('')
# Encode the test phrases with the vectorizer that was already fitted on the training data.
# Use transform (not fit_transform): refitting would learn a new vocabulary and new IDF
# weights from the test set, so the columns would no longer line up with the features
# the classifier was trained on.
tfv_test_matrix = tfv.transform(test['Phrase'])
tfv_test_matrix.shape

In [None]:
# Predict a sentiment label for every encoded test phrase.
submission = MNB.predict(X=tfv_test_matrix)

In [None]:
# Display the predicted labels for the test phrases.
submission

In [None]:
# Assemble the submission table (phrase id + predicted sentiment) and save it as CSV.
submission_1 = pd.DataFrame({
    'PhraseId': test['PhraseId'],
    'Sentiment': submission,
})
submission_1.to_csv('Submission.csv', index=False)