# Importing Libraries

In [4]:
import pandas as pd
dataset = pd.read_csv('./train.csv', delimiter=',')

# Playing with Data 

In [5]:
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 156060 entries, 0 to 156059
Data columns (total 4 columns):
 #   Column      Non-Null Count   Dtype 
---  ------      --------------   ----- 
 0   PhraseId    156060 non-null  int64 
 1   SentenceId  156060 non-null  int64 
 2   Phrase      156060 non-null  object
 3   Sentiment   156060 non-null  int64 
dtypes: int64(3), object(1)
memory usage: 4.8+ MB


The sentiment labels are:

0 - negative
1 - somewhat negative
2 - neutral
3 - somewhat positive
4 - positive


In [6]:
dataset.head()

Unnamed: 0,PhraseId,SentenceId,Phrase,Sentiment
0,1,1,A series of escapades demonstrating the adage ...,1
1,2,1,A series of escapades demonstrating the adage ...,2
2,3,1,A series,2
3,4,1,A,2
4,5,1,series,2


In [7]:
# Row count for each distinct Label
dataset.Sentiment.value_counts()

2    79582
3    32927
1    27273
4     9206
0     7072
Name: Sentiment, dtype: int64

# Processing Text data 

In [8]:
# Taking dataset["Phrase"] and removing special characters,stopwords,

from sklearn.feature_extraction.text import CountVectorizer
from nltk.tokenize import RegexpTokenizer
token = RegexpTokenizer(r'[a-zA-Z0-9]+')
cv = CountVectorizer(stop_words='english',ngram_range = (1,1),tokenizer = token.tokenize)
text_counts = cv.fit_transform(dataset['Phrase'])

In [9]:
from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(text_counts, dataset['Sentiment'], test_size=0.25, random_state=5)

# Generating our model

## Multionomial Naive Bayes

In [10]:
from sklearn.naive_bayes import MultinomialNB

In [11]:
MNB = MultinomialNB()
MNB.fit(X_train, Y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [12]:
from sklearn import metrics
predicted = MNB.predict(X_test)
accuracy_score = metrics.accuracy_score(predicted, Y_test)

In [13]:
predicted

array([2, 2, 2, ..., 4, 0, 3])

<39015x14991 sparse matrix of type '<class 'numpy.int64'>'
	with 149694 stored elements in Compressed Sparse Row format>

In [26]:
df2=pd.DataFrame(predicted,columns=['Label'])

In [35]:
df3=pd.concat([dataset['Phrase'],df2],axis=1,ignore_index=True)
df3.drop_duplicates()
df3.dropna()

Unnamed: 0,0,1
0,A series of escapades demonstrating the adage ...,2.0
1,A series of escapades demonstrating the adage ...,2.0
2,A series,2.0
3,A,3.0
4,series,2.0
...,...,...
39010,you do n't laugh,2.0
39011,do n't laugh,2.0
39012,", flee .",4.0
39013,flee .,0.0


In [14]:
print(str('{:04.2f}'.format(accuracy_score*100))+'%')

60.25%


### Using Bigram


In [15]:
# using bigram

cv = CountVectorizer(stop_words='english', ngram_range = (2,2), tokenizer = token.tokenize)
text_counts = cv.fit_transform(dataset['Phrase'])

#from sklearn.model_selection import train_test_split()
X_train, X_test, Y_train, Y_test = train_test_split(text_counts, dataset['Sentiment'],test_size=0.25, random_state=5)


#Fitting the model
MNB = MultinomialNB()
MNB.fit(X_train, Y_train)

#Evaulating the model
#form sklearn import metrics
accuracy_score = metrics.accuracy_score(MNB.predict(X_test), Y_test)
print(str('{:04.2f}'.format(accuracy_score*100))+'%')

60.37%


### Using trigram

In [16]:

cv = CountVectorizer(stop_words='english', ngram_range = (3,3), tokenizer = token.tokenize)
text_counts = cv.fit_transform(dataset['Phrase'])
X_train, X_test, Y_train, Y_test = train_test_split(text_counts, dataset['Sentiment'],test_size=0.25, random_state=5)
MNB = MultinomialNB()
MNB.fit(X_train, Y_train)
accuracy_score = metrics.accuracy_score(MNB.predict(X_test), Y_test)
print(str('{:04.2f}'.format(accuracy_score*100))+'%')

58.86%


## Complement Naive Bayes

In [17]:

#With this particular MNB model we are gaining success which is close to 60%, nomatter what n-gram vectorization we opt for.
#Let's try to change the model to ComplementNB. 

#let's write the complete code assuming we have our data imported to dataset.

#from sklearn.feature_extraction import CountVectorizer
#from nlkt.tokenize import RegexpTokenizer
#token = RegexpTokenixer(r'[A-Za-z0-9]+')
cv = CountVectorizer(stop_words='english', ngram_range=(1,1), tokenizer=token.tokenize)
text_count = cv.fit_transform(dataset['Phrase'])

#split the dataset in train test 
#form sklearn.model_selection() import train_test_split()
X_train, X_test, Y_train, Y_test = train_test_split(text_count, dataset['Sentiment'], test_size=0.25, random_state=2)

#Defining and compiling the model -> we will use ComplementNB
from sklearn.naive_bayes import ComplementNB

#Fitting the model
CNB = ComplementNB()
CNB.fit(X_train, Y_train)

#evaluating the model
#from sklearn import metrics
accuracy_score = metrics.accuracy_score(CNB.predict(X_test),Y_test)

print(str('{:4.2f}'.format(accuracy_score*100))+'%')

47.53%


In [18]:
# from sklearn.naive_bayes import GaussianNB
# GNB = GaussianNB()
# GNB.fit(X_train.todense(), Y_train)
# accuracy_score = metrics.accuracy_score(CNB.predict(X_test),Y_test)

# print('GNB accuracy = ' + str('{:4.2f}'.format(accuracy_score*100))+'%')

## Bernouli Naive Bayes

In [19]:
from sklearn.naive_bayes import BernoulliNB
BNB = BernoulliNB()
BNB.fit(X_train, Y_train)
accuracy_score_bnb = metrics.accuracy_score(BNB.predict(X_test),Y_test)
print('BNB accuracy = ' + str('{:4.2f}'.format(accuracy_score_bnb*100))+'%')

BNB accuracy = 60.61%


In [20]:
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf = TfidfVectorizer()
text_count_2 = tfidf.fit_transform(dataset['Phrase'])

#splitting the data in test and training
#from sklearn.model_selection() import train_test_split()
x_train, x_test, y_train, y_test = train_test_split(text_count_2, dataset['Sentiment'],test_size=0.25,random_state=5)

#defining the model
#compilimg the model -> we are going to use already used models GNB, MNB, CNB, BNB
#fitting the model
MNB.fit(x_train, y_train)
accuracy_score_mnb = metrics.accuracy_score(MNB.predict(x_test), y_test)
print('accuracy_score_mnb = '+str('{:4.2f}'.format(accuracy_score_mnb*100))+'%')

BNB.fit(x_train, y_train)
accuracy_score_bnb = metrics.accuracy_score(BNB.predict(x_test), y_test)
print('accuracy_score_bnb = '+str('{:4.2f}'.format(accuracy_score_bnb*100))+'%')

CNB.fit(x_train, y_train)
accuracy_score_cnb = metrics.accuracy_score(CNB.predict(x_test), y_test)
print('accuracy_score_cnb = '+str('{:4.2f}'.format(accuracy_score_cnb*100))+'%')

# GNB.fit(x_train.todense(), y_train)
# accuracy_score_gnb = metrics.accuracy_score(GNB.predict(x_test.todense()), y_test)
# print('accuracy_score_gnb = '+str('{:4.2f}'.format(accuracy_score_gnb*100))+'%')

accuracy_score_mnb = 58.50%
accuracy_score_bnb = 59.33%
accuracy_score_cnb = 51.42%


# Using Non Naive Bayes classifiers

In [21]:

from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.svm import LinearSVC
SGDC = SGDClassifier()
LSVC = LinearSVC()

#on TF-IDF data
LSVC.fit(x_train, y_train)
accuracy_score_lsvc = metrics.accuracy_score(LSVC.predict(x_test), y_test)
print('accuracy_score_lsvc = '+str('{:4.2f}'.format(accuracy_score_lsvc*100))+'%')

SGDC.fit(x_train, y_train)
accuracy_score_sgdc = metrics.accuracy_score(SGDC.predict(x_test), y_test)
print('accuracy_score_sgdc = '+str('{:4.2f}'.format(accuracy_score_sgdc*100))+'%')

#on CountVectorize data
LSVC.fit(X_train, Y_train)
accuracy_score_lsvc_CV = metrics.accuracy_score(LSVC.predict(X_test), Y_test)
print('accuracy_score_lsvc_cv = '+str('{:4.2f}'.format(accuracy_score_lsvc_CV*100))+'%')

SGDC.fit(X_train, Y_train)
accuracy_score_sgdc_CV = metrics.accuracy_score(SGDC.predict(X_test), Y_test)
print('accuracy_score_sgdc_cv = '+str('{:4.2f}'.format(accuracy_score_sgdc_CV*100))+'%')


accuracy_score_lsvc = 63.88%
accuracy_score_sgdc = 56.41%




accuracy_score_lsvc_cv = 63.05%
accuracy_score_sgdc_cv = 60.31%
