In [39]:
import pandas as pd

In [40]:
data = pd.read_csv(r'train.csv')

In [41]:
data.isna().sum()

text         0
sentiment    0
dtype: int64

In [42]:
data.head()

Unnamed: 0,text,sentiment
0,"Now, I won't deny that when I purchased this o...",neg
1,"The saddest thing about this ""tribute"" is that...",neg
2,Last night I decided to watch the prequel or s...,neg
3,I have to admit that i liked the first half of...,neg
4,I was not impressed about this film especially...,neg


In [43]:
data.sentiment.unique()

array(['neg', 'pos'], dtype=object)

In [44]:
enc_senti={'pos':1,'neg':0}

In [45]:
data['sentiment']=data['sentiment'].map(enc_senti)

In [46]:
data.duplicated().sum()

96

In [47]:
data.drop_duplicates(inplace=True)

In [48]:
data['text'][10]

"I'm guessing the writers have never read a book of any kind, much less a Dickens novel, and certainly not David Copperfield, and that they based their screenplay on another poorly written screenplay, possibly an adaptation of Copperfield, though just as likely anything else, from which they randomly discarded about a third of the pages and then shuffled the rest, along with some random pages from a screenplay that someone's eighth grade nephew had written for an English class, and for which he had received a failing grade. <br /><br />If the casting was a bad joke - e.g., Richards as Kramer playing Micawber - which it was, then the direction and acting were the poorly- delivered punch lines. Getting beyond Kramer as Micawber, if possible, Ham was such a complete ogre, hunch-back and all, that I was half expecting at some point to see him being pursued by an angry pitch-fork and torch wielding mob of villagers. Uriah was almost as much of a clown figure as Micawber. Mr. Murdstone evoke

In [49]:
import re

In [50]:
def clean_text(text):
    text=re.sub(r'\W', ' ',text)
    text=re.sub(r'\d+', ' ', text)
    text=re.sub(r'\s',' ', text)
    text=text.lower()
    return text

In [51]:
data['text']=data['text'].apply(clean_text)

In [52]:
data

Unnamed: 0,text,sentiment
0,now i won t deny that when i purchased this o...,0
1,the saddest thing about this tribute is that...,0
2,last night i decided to watch the prequel or s...,0
3,i have to admit that i liked the first half of...,0
4,i was not impressed about this film especially...,0
...,...,...
24995,this film is fun if your a person who likes a...,1
24996,after seeing this film i feel like i know just...,1
24997,first this deserves about stars due to actin...,0
24998,if you like films that ramble with little plot...,0


In [53]:
#step 1: removing stop words
#step 2: lemmatization
#step 3: vectorization

In [54]:
import nltk
from nltk.corpus import stopwords

In [55]:
stop_words = set(stopwords.words('english'))

In [57]:
data['text'] = data['text'].str.split()

In [58]:
data

Unnamed: 0,text,sentiment
0,"[now, i, won, t, deny, that, when, i, purchase...",0
1,"[the, saddest, thing, about, this, tribute, is...",0
2,"[last, night, i, decided, to, watch, the, preq...",0
3,"[i, have, to, admit, that, i, liked, the, firs...",0
4,"[i, was, not, impressed, about, this, film, es...",0
...,...,...
24995,"[this, film, is, fun, if, your, a, person, who...",1
24996,"[after, seeing, this, film, i, feel, like, i, ...",1
24997,"[first, this, deserves, about, stars, due, to,...",0
24998,"[if, you, like, films, that, ramble, with, lit...",0


In [59]:
from nltk.stem import PorterStemmer

In [60]:
stem = PorterStemmer()

In [61]:
data['text stemmed'] = data.text.apply(lambda x:[stem.stem(words) for words in x])

In [62]:
data

Unnamed: 0,text,sentiment,text stemmed
0,"[now, i, won, t, deny, that, when, i, purchase...",0,"[now, i, won, t, deni, that, when, i, purchas,..."
1,"[the, saddest, thing, about, this, tribute, is...",0,"[the, saddest, thing, about, thi, tribut, is, ..."
2,"[last, night, i, decided, to, watch, the, preq...",0,"[last, night, i, decid, to, watch, the, preque..."
3,"[i, have, to, admit, that, i, liked, the, firs...",0,"[i, have, to, admit, that, i, like, the, first..."
4,"[i, was, not, impressed, about, this, film, es...",0,"[i, wa, not, impress, about, thi, film, especi..."
...,...,...,...
24995,"[this, film, is, fun, if, your, a, person, who...",1,"[thi, film, is, fun, if, your, a, person, who,..."
24996,"[after, seeing, this, film, i, feel, like, i, ...",1,"[after, see, thi, film, i, feel, like, i, know..."
24997,"[first, this, deserves, about, stars, due, to,...",0,"[first, thi, deserv, about, star, due, to, act..."
24998,"[if, you, like, films, that, ramble, with, lit...",0,"[if, you, like, film, that, rambl, with, littl..."


In [70]:
data['text stemmed'] = data['text stemmed'].apply(lambda x: ' '.join(x))

In [67]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [71]:
cv = TfidfVectorizer(max_features=3000)

In [72]:
x = cv.fit_transform(data['text stemmed']).toarray()

In [73]:
y = data['sentiment']

In [74]:
from sklearn.model_selection import train_test_split

In [76]:
X_train, X_test, y_train, y_test = train_test_split(x,y, test_size=0.2, random_state=42)

In [79]:
from sklearn.naive_bayes import MultinomialNB

In [80]:
model=MultinomialNB()

In [81]:
model.fit(X_train, y_train)

In [87]:
y_pred=model.predict(X_test)

In [88]:
y_pred

array([1, 1, 0, ..., 0, 1, 1], dtype=int64)

In [94]:
from sklearn.metrics import mean_squared_error, r2_score

In [92]:
mse = mean_squared_error(y_pred,y_test)
mse

0.16201565950612326

In [96]:
r2 = r2_score(y_pred, y_test)
r2

0.3510421479765852