In [1]:
import pandas as pd
import nltk
import string
pd.options.mode.chained_assignment = None

from bs4 import BeautifulSoup
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [2]:
test = pd. read_csv('test.csv',header=None, delimiter=',', skiprows=1, names=['text','label'])
test.shape

(5000, 2)

In [3]:
td = test[["text"]]
td["text"] = td["text"].astype(str)
td.head()

Unnamed: 0,text
0,I always wrote this series off as being a comp...
1,1st watched 12/7/2002 - 3 out of 10(Dir-Steve ...
2,This movie was so poorly written and directed ...
3,The most interesting thing about Miryang (Secr...
4,"when i first read about ""berlin am meer"" i did..."


In [4]:
td["low"] = td["text"].str.lower()
print(td["low"])

0       i always wrote this series off as being a comp...
1       1st watched 12/7/2002 - 3 out of 10(dir-steve ...
2       this movie was so poorly written and directed ...
3       the most interesting thing about miryang (secr...
4       when i first read about "berlin am meer" i did...
5       i saw this film on september 1st, 2005 in indi...
6       i saw a screening of this movie last night. i ...
7       william hurt may not be an american matinee id...
8       it is a piece of crap! not funny at all. durin...
9       i'm bout it(1997)<br /><br />developed & publi...
10      i had a recent spectator experience with the p...
11      i really enjoyed the detail that went into the...
12      didn't the writer for this movie see the other...
13      this movie was really bad. first they didn't e...
14      i think i watched a highly edited version beca...
15      uwe boll has done the impossible: create a gam...
16      i felt asleep, watching it!!! (and i had ticke...
17      brass 

In [5]:
def remove_html(text):
    return BeautifulSoup(text, "lxml").text

td["text_no_html"] = td["low"].apply(lambda text: remove_html(text))
print(td["text_no_html"])

0       i always wrote this series off as being a comp...
1       1st watched 12/7/2002 - 3 out of 10(dir-steve ...
2       this movie was so poorly written and directed ...
3       the most interesting thing about miryang (secr...
4       when i first read about "berlin am meer" i did...
5       i saw this film on september 1st, 2005 in indi...
6       i saw a screening of this movie last night. i ...
7       william hurt may not be an american matinee id...
8       it is a piece of crap! not funny at all. durin...
9       i'm bout it(1997)developed & published by no l...
10      i had a recent spectator experience with the p...
11      i really enjoyed the detail that went into the...
12      didn't the writer for this movie see the other...
13      this movie was really bad. first they didn't e...
14      i think i watched a highly edited version beca...
15      uwe boll has done the impossible: create a gam...
16      i felt asleep, watching it!!! (and i had ticke...
17      brass 

In [6]:
PUNCT_TO_REMOVE = string.punctuation

def remove_punct(text):
    return text.translate(str.maketrans(' ', ' ', PUNCT_TO_REMOVE))

td["wo_punct"] = td["text_no_html"].apply(lambda text: remove_punct(text))
print(td["wo_punct"])

0       i always wrote this series off as being a comp...
1       1st watched 1272002  3 out of 10dirsteve purce...
2       this movie was so poorly written and directed ...
3       the most interesting thing about miryang secre...
4       when i first read about berlin am meer i didnt...
5       i saw this film on september 1st 2005 in india...
6       i saw a screening of this movie last night i h...
7       william hurt may not be an american matinee id...
8       it is a piece of crap not funny at all during ...
9       im bout it1997developed  published by no limit...
10      i had a recent spectator experience with the p...
11      i really enjoyed the detail that went into the...
12      didnt the writer for this movie see the other ...
13      this movie was really bad first they didnt eve...
14      i think i watched a highly edited version beca...
15      uwe boll has done the impossible create a game...
16      i felt asleep watching it and i had tickets fo...
17      brass 

In [7]:
def remove_numbers(text):
    
    return " ".join([line for line in str(text).split() if not line.isdigit()])

td["text_wo_numbers"] = td["wo_punct"].apply(lambda text: remove_numbers(text))
print(td["text_wo_numbers"])

0       i always wrote this series off as being a comp...
1       1st watched out of 10dirsteve purcell typical ...
2       this movie was so poorly written and directed ...
3       the most interesting thing about miryang secre...
4       when i first read about berlin am meer i didnt...
5       i saw this film on september 1st in indianapol...
6       i saw a screening of this movie last night i h...
7       william hurt may not be an american matinee id...
8       it is a piece of crap not funny at all during ...
9       im bout it1997developed published by no limit ...
10      i had a recent spectator experience with the p...
11      i really enjoyed the detail that went into the...
12      didnt the writer for this movie see the other ...
13      this movie was really bad first they didnt eve...
14      i think i watched a highly edited version beca...
15      uwe boll has done the impossible create a game...
16      i felt asleep watching it and i had tickets fo...
17      brass 

In [9]:
from nltk.corpus import stopwords
", ".join(stopwords.words('english'))

"i, me, my, myself, we, our, ours, ourselves, you, you're, you've, you'll, you'd, your, yours, yourself, yourselves, he, him, his, himself, she, she's, her, hers, herself, it, it's, its, itself, they, them, their, theirs, themselves, what, which, who, whom, this, that, that'll, these, those, am, is, are, was, were, be, been, being, have, has, had, having, do, does, did, doing, a, an, the, and, but, if, or, because, as, until, while, of, at, by, for, with, about, against, between, into, through, during, before, after, above, below, to, from, up, down, in, out, on, off, over, under, again, further, then, once, here, there, when, where, why, how, all, any, both, each, few, more, most, other, some, such, no, nor, not, only, own, same, so, than, too, very, s, t, can, will, just, don, don't, should, should've, now, d, ll, m, o, re, ve, y, ain, aren, aren't, couldn, couldn't, didn, didn't, doesn, doesn't, hadn, hadn't, hasn, hasn't, haven, haven't, isn, isn't, ma, mightn, mightn't, mustn, mus

In [10]:
STOPWORDS = set(stopwords.words('english'))
def remove_stopwords(text):
    
    return " ".join([word for word in str(text).split() if word not in STOPWORDS])

td["text_wo_stop"] = td["text_wo_numbers"].apply(lambda text: remove_stopwords(text))
print(td["text_wo_stop"])

0       always wrote series complete stinkfest jim bel...
1       1st watched 10dirsteve purcell typical mary ka...
2       movie poorly written directed fell asleep minu...
3       interesting thing miryang secret sunshine acto...
4       first read berlin meer didnt expect much thoug...
5       saw film september 1st indianapolis one judges...
6       saw screening movie last night high expectatio...
7       william hurt may american matinee idol anymore...
8       piece crap funny whole movie nothing ever happ...
9       im bout it1997developed published limit filmsp...
10      recent spectator experience perfect witness ne...
11      really enjoyed detail went scriptjonathan rhys...
12      didnt writer movie see three loved original th...
13      movie really bad first didnt even follow facts...
14      think watched highly edited version wasnt near...
15      uwe boll done impossible create game adaptatio...
16      felt asleep watching tickets midnight premiere...
17      brass 

In [11]:
lemmatizerr = WordNetLemmatizer()
wordnet_mapp = {"N":wordnet.NOUN, "V":wordnet.VERB, "J":wordnet.ADJ, "R":wordnet.ADV}

def lemmatize_words(text):
    pos_tagged_text = nltk.pos_tag(text.split())
    return " ".join([lemmatizerr.lemmatize(word, wordnet_mapp.get(pos[0], wordnet.NOUN)) for word, pos in pos_tagged_text])

td["text_lemmatized"] = td["text_wo_stop"].apply(lambda text: lemmatize_words(text))
print(td["text_lemmatized"])

0       always write series complete stinkfest jim bel...
1       1st watch 10dirsteve purcell typical mary kate...
2       movie poorly write direct fell asleep minute m...
3       interesting thing miryang secret sunshine acto...
4       first read berlin meer didnt expect much think...
5       saw film september 1st indianapolis one judge ...
6       saw screen movie last night high expectation g...
7       william hurt may american matinee idol anymore...
8       piece crap funny whole movie nothing ever happ...
9       im bout it1997developed publish limit filmspro...
10      recent spectator experience perfect witness ne...
11      really enjoy detail go scriptjonathan rhys mye...
12      didnt writer movie see three love original tho...
13      movie really bad first didnt even follow fact ...
14      think watch highly edit version wasnt nearly g...
15      uwe boll do impossible create game adaptation ...
16      felt asleep watch ticket midnight premiere que...
17      brass 

In [12]:
td.head()

Unnamed: 0,text,low,text_no_html,wo_punct,text_wo_numbers,text_wo_stop,text_lemmatized
0,I always wrote this series off as being a comp...,i always wrote this series off as being a comp...,i always wrote this series off as being a comp...,i always wrote this series off as being a comp...,i always wrote this series off as being a comp...,always wrote series complete stinkfest jim bel...,always write series complete stinkfest jim bel...
1,1st watched 12/7/2002 - 3 out of 10(Dir-Steve ...,1st watched 12/7/2002 - 3 out of 10(dir-steve ...,1st watched 12/7/2002 - 3 out of 10(dir-steve ...,1st watched 1272002 3 out of 10dirsteve purce...,1st watched out of 10dirsteve purcell typical ...,1st watched 10dirsteve purcell typical mary ka...,1st watch 10dirsteve purcell typical mary kate...
2,This movie was so poorly written and directed ...,this movie was so poorly written and directed ...,this movie was so poorly written and directed ...,this movie was so poorly written and directed ...,this movie was so poorly written and directed ...,movie poorly written directed fell asleep minu...,movie poorly write direct fell asleep minute m...
3,The most interesting thing about Miryang (Secr...,the most interesting thing about miryang (secr...,the most interesting thing about miryang (secr...,the most interesting thing about miryang secre...,the most interesting thing about miryang secre...,interesting thing miryang secret sunshine acto...,interesting thing miryang secret sunshine acto...
4,"when i first read about ""berlin am meer"" i did...","when i first read about ""berlin am meer"" i did...","when i first read about ""berlin am meer"" i did...",when i first read about berlin am meer i didnt...,when i first read about berlin am meer i didnt...,first read berlin meer didnt expect much thoug...,first read berlin meer didnt expect much think...


In [13]:
td.count()

text               5000
low                5000
text_no_html       5000
wo_punct           5000
text_wo_numbers    5000
text_wo_stop       5000
text_lemmatized    5000
dtype: int64

In [36]:
vectorizer1 = TfidfVectorizer()
fitted_vectorizer1 = vectorizer1.fit(td["text_lemmatized"])

In [37]:
test_transform1 = fitted_vectorizer1.transform(td['text_lemmatized'])
y1 = test['label']
print(test_transform1)
y1

  (0, 48010)	0.06043756944066193
  (0, 47687)	0.030738587650194354
  (0, 47495)	0.03598358606990154
  (0, 47431)	0.024972414432856284
  (0, 47192)	0.04413376212687471
  (0, 47157)	0.037058139271328276
  (0, 47087)	0.03656822570360571
  (0, 46949)	0.051425956223090714
  (0, 46746)	0.04754585992231799
  (0, 46654)	0.06248870871295027
  (0, 46637)	0.055427892460792064
  (0, 46444)	0.023872172447025542
  (0, 46278)	0.027629342304935713
  (0, 46175)	0.02383212786195967
  (0, 46103)	0.09426298165079507
  (0, 45971)	0.09167610170089428
  (0, 45968)	0.049690450422914534
  (0, 45596)	0.04353528623542807
  (0, 45189)	0.09759804995453245
  (0, 44011)	0.12003116548513927
  (0, 43972)	0.03533680264842042
  (0, 43881)	0.031219894600455215
  (0, 43485)	0.060281957763964035
  (0, 43344)	0.055633081814687804
  (0, 42865)	0.07052429178564197
  :	:
  (4999, 25022)	0.10075017052152242
  (4999, 24588)	0.037809405780131485
  (4999, 24104)	0.12983614116150416
  (4999, 23875)	0.13114998260087973
  (4999, 2198

0       0
1       0
2       0
3       1
4       0
5       1
6       0
7       1
8       0
9       0
10      0
11      1
12      0
13      0
14      0
15      0
16      0
17      0
18      1
19      1
20      1
21      0
22      1
23      1
24      0
25      1
26      1
27      0
28      0
29      0
       ..
4970    0
4971    0
4972    0
4973    1
4974    1
4975    1
4976    1
4977    1
4978    0
4979    0
4980    1
4981    0
4982    1
4983    1
4984    1
4985    0
4986    0
4987    1
4988    0
4989    1
4990    1
4991    1
4992    0
4993    1
4994    1
4995    1
4996    1
4997    0
4998    0
4999    0
Name: label, Length: 5000, dtype: int64

In [38]:
X_test1 = test_transform1
y_test1 = y1
scikit_log_reg1 = LogisticRegression(solver='lbfgs')

In [39]:
model1 = scikit_log_reg1.fit(X_test1, y_test1)

In [40]:
predictions1 = model1.predict(X_test1)

In [41]:
print(accuracy_score(y_test1, predictions1))

0.9468


In [53]:
vectorizer2 = TfidfVectorizer(max_df=0.8, min_df=0.2, ngram_range=(1,3))
fitted_vectorizer2 = vectorizer2.fit(td["text_lemmatized"])

In [54]:
test_transform2 = fitted_vectorizer2.transform(td['text_lemmatized'])
y2 = test['label']
print(test_transform2)
y2

  (0, 42)	0.12812610688017342
  (0, 41)	0.12248108914872294
  (0, 40)	0.14175802162459103
  (0, 39)	0.1222756321713883
  (0, 35)	0.14252181684945595
  (0, 33)	0.439639847511629
  (0, 30)	0.13834771475881846
  (0, 29)	0.13142168907426552
  (0, 27)	0.2950629759888938
  (0, 25)	0.09385176125172605
  (0, 21)	0.2994676559847528
  (0, 19)	0.41477461268718996
  (0, 18)	0.10377075332120153
  (0, 16)	0.13745861396190892
  (0, 15)	0.3360973154079523
  (0, 12)	0.33344102845517426
  (0, 8)	0.12464676796162438
  (0, 7)	0.13961521523488898
  (0, 5)	0.1460857287940705
  (0, 3)	0.125542833956407
  (1, 41)	0.2012984905828157
  (1, 40)	0.46596051650694936
  (1, 39)	0.20096082066409912
  (1, 33)	0.24085034492328924
  (1, 32)	0.34175553057322927
  :	:
  (4997, 23)	0.19691056124547815
  (4997, 21)	0.22791948816458893
  (4997, 12)	0.2537760156056614
  (4997, 8)	0.2845992913167574
  (4997, 5)	0.3335497226778047
  (4998, 42)	0.5263615507401117
  (4998, 25)	0.38555732157185785
  (4998, 23)	0.5314397120977368
 

0       0
1       0
2       0
3       1
4       0
5       1
6       0
7       1
8       0
9       0
10      0
11      1
12      0
13      0
14      0
15      0
16      0
17      0
18      1
19      1
20      1
21      0
22      1
23      1
24      0
25      1
26      1
27      0
28      0
29      0
       ..
4970    0
4971    0
4972    0
4973    1
4974    1
4975    1
4976    1
4977    1
4978    0
4979    0
4980    1
4981    0
4982    1
4983    1
4984    1
4985    0
4986    0
4987    1
4988    0
4989    1
4990    1
4991    1
4992    0
4993    1
4994    1
4995    1
4996    1
4997    0
4998    0
4999    0
Name: label, Length: 5000, dtype: int64

In [55]:
X_test2 = test_transform2
y_test2 = y2
scikit_log_reg2 = LogisticRegression(solver='lbfgs')

In [56]:
model2 = scikit_log_reg2.fit(X_test2, y_test2)

In [57]:
predictions2 = model2.predict(X_test2)

In [58]:
print(accuracy_score(y_test2, predictions2))

0.7242


In [59]:
X_train1, X_test3, y_train1, y_test3 = train_test_split(test_transform1, y1, test_size=0.3)
scikit_log_reg3 = LogisticRegression(solver='lbfgs')

In [60]:
model3 = scikit_log_reg3.fit(X_train1, y_train1)

In [61]:
predictions3 = model3.predict(X_test3)

In [62]:
print(accuracy_score(y_test3, predictions3))

0.862


In [63]:
X_train2, X_test4, y_train2, y_test4 = train_test_split(test_transform2, y2, test_size=0.3)
scikit_log_reg4 = LogisticRegression(solver='lbfgs')

In [64]:
model4 = scikit_log_reg4.fit(X_train2, y_train2)

In [65]:
predictions4 = model4.predict(X_test4)

In [66]:
print(accuracy_score(y_test4, predictions4))

0.7166666666666667
