In [2]:
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from string import punctuation
import contractions
from unidecode import unidecode

import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from autocorrect import Speller

In [44]:
# Read the labelled reviews from disk and preview the first two rows.
data = pd.read_csv(filepath_or_buffer="Train.csv")
data.head(n=2)

Unnamed: 0,text,label
0,I grew up (b. 1965) watching and loving the Th...,0
1,"When I put this movie in my DVD player, and sa...",0


In [32]:
# Show the full text of the review stored under index label 0.
data.loc[0, "text"]

'I grew up (b. 1965) watching and loving the Thunderbirds. All my mates at school watched. We played "Thunderbirds" before school, during lunch and after school. We all wanted to be Virgil or Scott. No one wanted to be Alan. Counting down from 5 became an art form. I took my children to see the movie hoping they would get a glimpse of what I loved as a child. How bitterly disappointing. The only high point was the snappy theme tune. Not that it could compare with the original score of the Thunderbirds. Thankfully early Saturday mornings one television channel still plays reruns of the series Gerry Anderson and his wife created. Jonatha Frakes should hand in his directors chair, his version was completely hopeless. A waste of film. Utter rubbish. A CGI remake may be acceptable but replacing marionettes with Homo sapiens subsp. sapiens was a huge error of judgment.'

In [45]:
# preprocessing
# 1 remove spaces, newlines
def remove_spaces(data):
    """Replace line breaks, tabs and backslashes in ``data`` with single spaces.

    Handles both the two-character escape sequence ``\\n`` (a backslash
    followed by ``n``) and real newline / carriage-return characters, so the
    function matches what its name and accompanying comment promise.

    Parameters
    ----------
    data : str
        Raw text.

    Returns
    -------
    str
        ``data`` with every matched character replaced by a space. Runs of
        spaces are not collapsed.
    """
    # Literal "\n" text first, so its backslash is consumed together with the "n".
    clean_text = data.replace('\\n', ' ')
    # Real control characters; "\r\n" goes first so a Windows line ending
    # becomes one space rather than two.
    for whitespace in ('\r\n', '\n', '\r', '\t'):
        clean_text = clean_text.replace(whitespace, ' ')
    # Any remaining stray backslashes.
    return clean_text.replace('\\', ' ')

# 2 contraction mapping
def expand_text(data):
    """Return ``data`` after passing it through ``contractions.fix``."""
    return contractions.fix(data)

# 3 handling accented characters
def handling_accented(data):
    """Return ``data`` after passing it through ``unidecode``."""
    return unidecode(data)

# 4 cleaning
# English stopwords, minus the negations "no", "nor" and "not", which are
# removed from the list so that clean_data keeps them.
stopword_list = stopwords.words("english")
for negation in ("no", "nor", "not"):
    stopword_list.remove(negation)

def clean_data(data):
    """Tokenize ``data`` and return the lower-cased tokens that pass filtering.

    A token is kept only when it is not found in ``string.punctuation``, its
    lower-cased form is not in ``stopword_list``, it is longer than two
    characters, and it is purely alphabetic.
    """
    kept_tokens = []
    for token in word_tokenize(data):
        if token in punctuation:
            continue
        lowered = token.lower()
        if lowered in stopword_list:
            continue
        if len(token) <= 2 or not token.isalpha():
            continue
        kept_tokens.append(lowered)
    return kept_tokens

# 5 autocorrection
def autocorrection(data):
    """Return ``data`` with its spelling corrected by an English ``Speller``.

    The ``Speller`` is built on the first call and cached on the function
    object, so applying this function row by row reuses one instance instead
    of constructing a new one for every piece of text.

    Parameters
    ----------
    data : str
        Text to correct.

    Returns
    -------
    The value returned by calling the speller on ``data``.
    """
    spell = getattr(autocorrection, "_speller", None)
    if spell is None:
        spell = Speller(lang = 'en')
        autocorrection._speller = spell
    return spell(data)

# 6 Lemmatization
def lemmatization(data):
    """Lemmatize each token in ``data`` and join the results with single spaces.

    ``data`` is iterated token by token; each token is passed to
    ``WordNetLemmatizer.lemmatize`` and the outputs are returned as one string.
    """
    lemmatize = WordNetLemmatizer().lemmatize
    return " ".join([lemmatize(token) for token in data])

In [39]:
# data splitting

In [40]:
# Hold out 25% of the rows for testing; the fixed seed keeps the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    data["text"],
    data["label"],
    test_size=0.25,
    random_state=42,
)

In [46]:
# Run every preprocessing step, in order, over the train and test texts.
preprocessing_steps = (
    remove_spaces,
    expand_text,
    handling_accented,
    clean_data,
    lemmatization,
)

clean_text_train, clean_text_test = x_train, x_test
for step in preprocessing_steps:
    clean_text_train = clean_text_train.apply(step)
    clean_text_test = clean_text_test.apply(step)

In [47]:
# Inspect the preprocessed training reviews.
clean_text_train

27434    firstly huge fan crap film grade always good l...
13895    not much film big budget child television far ...
38835    delightful film accompanied composer rachel po...
30654    like keep review short simple pretty much sum ...
12278    remember hearing movie played nearly every the...
                               ...                        
31962    film grab opening scene never let watch indulg...
23452    not masterpiece like godfather not purpose mov...
23775    jefferey dahmer one sick guy not much say not ...
37135    unfortunately showing star movie thailand last...
27098    johnny dangerously fall completely hit miss ca...
Name: text, Length: 30000, dtype: object

In [64]:
# Bag-of-words counts: the vocabulary is fitted on the training text only and
# then reused to transform the test text.
count = CountVectorizer(max_df=0.95, max_features=1000)
count_val_train = count.fit_transform(clean_text_train)
count_val_test = count.transform(clean_text_test)

In [65]:
# Summary (shape, dtype, stored elements) of the sparse training count matrix.
count_val_train

<30000x1000 sparse matrix of type '<class 'numpy.int64'>'
	with 1621967 stored elements in Compressed Sparse Row format>

In [66]:
# Summary (shape, dtype, stored elements) of the sparse test count matrix.
count_val_test

<10000x1000 sparse matrix of type '<class 'numpy.int64'>'
	with 542915 stored elements in Compressed Sparse Row format>

In [67]:
# Dense view of the training counts. `.toarray()` replaces the `.A` shorthand,
# which is deprecated and unavailable on sparse matrices in newer SciPy releases.
count_val_train.toarray()

array([[0, 0, 1, ..., 0, 0, 2],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 1, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [68]:
# Dense view of the test counts. `.toarray()` replaces the `.A` shorthand,
# which is deprecated and unavailable on sparse matrices in newer SciPy releases.
count_val_test.toarray()

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 1, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [69]:
# Dense array form of the sparse training count matrix.
count_val_train.toarray()

array([[0, 0, 1, ..., 0, 0, 2],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 1, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [70]:
# Feature names learned by the count vectorizer; used below as DataFrame column labels.
x = count.get_feature_names_out()
x

array(['ability', 'able', 'absolutely', 'accent', 'across', 'act',
       'acted', 'acting', 'action', 'actor', 'actress', 'actual',
       'actually', 'adaptation', 'add', 'admit', 'adult', 'adventure',
       'age', 'ago', 'agree', 'air', 'alien', 'alive', 'almost', 'alone',
       'along', 'already', 'also', 'although', 'always', 'amazing',
       'america', 'american', 'among', 'amount', 'animal', 'animation',
       'annoying', 'another', 'answer', 'anyone', 'anything', 'anyway',
       'apart', 'apparently', 'appear', 'appearance', 'appears',
       'appreciate', 'army', 'around', 'art', 'aside', 'ask', 'aspect',
       'atmosphere', 'attack', 'attempt', 'attention', 'audience',
       'average', 'avoid', 'award', 'away', 'awesome', 'awful', 'baby',
       'back', 'background', 'bad', 'badly', 'band', 'barely', 'based',
       'basic', 'basically', 'battle', 'beautiful', 'beauty', 'became',
       'become', 'becomes', 'bed', 'begin', 'beginning', 'behind',
       'belief', 'belie

In [71]:
# Number of features kept by the count vectorizer.
len(x)

1000

In [72]:
# Build a labelled DataFrame of the training counts for inspection.
# `.toarray()` replaces the `.A` shorthand, which is deprecated and
# unavailable on sparse matrices in newer SciPy releases.
pd.DataFrame(count_val_train.toarray(), columns=x)

Unnamed: 0,ability,able,absolutely,accent,across,act,acted,acting,action,actor,...,wrong,wrote,yeah,year,yes,yet,york,young,younger,zombie
0,0,0,1,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,2
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
29995,0,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,1,0,0
29996,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
29997,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,1,0,0
29998,0,0,0,3,0,1,1,3,0,2,...,0,0,0,1,1,0,0,0,0,0


In [73]:
# Model building: multinomial Naive Bayes on the bag-of-words counts.
# The sparse matrix is passed directly, which avoids materialising a dense
# 30000 x 1000 array and drops the deprecated `.A` attribute.
count_mnb = MultinomialNB()
count_mnb.fit(count_val_train, y_train)

In [74]:
# Predict test labels straight from the sparse counts (no dense copy, no deprecated `.A`).
predict_count_test = count_mnb.predict(count_val_test)

In [75]:
# Accuracy of the count-based model on the held-out test split.
accuracy_count_test = accuracy_score(y_true=y_test, y_pred=predict_count_test)
accuracy_count_test

0.8312

In [76]:
# Predict training labels straight from the sparse counts (no dense copy, no deprecated `.A`).
predict_count_train = count_mnb.predict(count_val_train)

In [78]:
# Accuracy of the count-based model on the training split, for comparison with the test score.
accuracy_count_train = accuracy_score(y_true=y_train, y_pred=predict_count_train)
accuracy_count_train

0.8362333333333334

In [79]:
# TF-IDF features: the vocabulary and weights are fitted on the training text
# only and then reused to transform the test text.
tfidf = TfidfVectorizer(max_df=0.95, max_features=1000)
tfidf_train = tfidf.fit_transform(clean_text_train)
tfidf_test = tfidf.transform(clean_text_test)

In [80]:
# Summary (shape, dtype, stored elements) of the sparse training TF-IDF matrix.
tfidf_train

<30000x1000 sparse matrix of type '<class 'numpy.float64'>'
	with 1621967 stored elements in Compressed Sparse Row format>

In [81]:
# Summary (shape, dtype, stored elements) of the sparse test TF-IDF matrix.
tfidf_test

<10000x1000 sparse matrix of type '<class 'numpy.float64'>'
	with 542915 stored elements in Compressed Sparse Row format>

In [82]:
# Dense view of the training TF-IDF weights. `.toarray()` replaces the `.A`
# shorthand, which is deprecated and unavailable on sparse matrices in newer SciPy releases.
tfidf_train.toarray()

array([[0.        , 0.        , 0.12435322, ..., 0.        , 0.        ,
        0.33884721],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.        , 0.        , 0.        , ..., 0.10010432, 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ]])

In [83]:
# Dense view of the test TF-IDF weights. `.toarray()` replaces the `.A`
# shorthand, which is deprecated and unavailable on sparse matrices in newer SciPy releases.
tfidf_test.toarray()

array([[0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.11644498, 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ]])

In [84]:
# Feature names learned by the TF-IDF vectorizer; used below as DataFrame column labels.
# NOTE(review): `y` holds feature names here, not labels (those are y_train / y_test).
y = tfidf.get_feature_names_out()
y

array(['ability', 'able', 'absolutely', 'accent', 'across', 'act',
       'acted', 'acting', 'action', 'actor', 'actress', 'actual',
       'actually', 'adaptation', 'add', 'admit', 'adult', 'adventure',
       'age', 'ago', 'agree', 'air', 'alien', 'alive', 'almost', 'alone',
       'along', 'already', 'also', 'although', 'always', 'amazing',
       'america', 'american', 'among', 'amount', 'animal', 'animation',
       'annoying', 'another', 'answer', 'anyone', 'anything', 'anyway',
       'apart', 'apparently', 'appear', 'appearance', 'appears',
       'appreciate', 'army', 'around', 'art', 'aside', 'ask', 'aspect',
       'atmosphere', 'attack', 'attempt', 'attention', 'audience',
       'average', 'avoid', 'award', 'away', 'awesome', 'awful', 'baby',
       'back', 'background', 'bad', 'badly', 'band', 'barely', 'based',
       'basic', 'basically', 'battle', 'beautiful', 'beauty', 'became',
       'become', 'becomes', 'bed', 'begin', 'beginning', 'behind',
       'belief', 'belie

In [85]:
# Number of features kept by the TF-IDF vectorizer.
len(y)

1000

In [86]:
# Build a labelled DataFrame of the training TF-IDF weights for inspection.
# `.toarray()` replaces the `.A` shorthand, which is deprecated and
# unavailable on sparse matrices in newer SciPy releases.
pd.DataFrame(tfidf_train.toarray(), columns=y)

Unnamed: 0,ability,able,absolutely,accent,across,act,acted,acting,action,actor,...,wrong,wrote,yeah,year,yes,yet,york,young,younger,zombie
0,0.0,0.0,0.124353,0.00000,0.000000,0.000000,0.000000,0.081714,0.0,0.000000,...,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.000000,0.0,0.338847
1,0.0,0.0,0.000000,0.00000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,...,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.000000,0.0,0.000000
2,0.0,0.0,0.000000,0.00000,0.110929,0.000000,0.000000,0.000000,0.0,0.000000,...,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.000000,0.0,0.000000
3,0.0,0.0,0.000000,0.00000,0.000000,0.000000,0.000000,0.080611,0.0,0.000000,...,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.000000,0.0,0.000000
4,0.0,0.0,0.000000,0.00000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,...,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.000000,0.0,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
29995,0.0,0.0,0.000000,0.00000,0.000000,0.000000,0.000000,0.000000,0.0,0.086966,...,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.108392,0.0,0.000000
29996,0.0,0.0,0.000000,0.00000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,...,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.000000,0.0,0.000000
29997,0.0,0.0,0.000000,0.00000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,...,0.0,0.0,0.0,0.082646,0.000000,0.0,0.0,0.100104,0.0,0.000000
29998,0.0,0.0,0.000000,0.28796,0.000000,0.077717,0.094257,0.151350,0.0,0.103207,...,0.0,0.0,0.0,0.053101,0.078563,0.0,0.0,0.000000,0.0,0.000000


In [87]:
# Build model: multinomial Naive Bayes on the TF-IDF features.
# Fit on the sparse matrix directly, matching how this model's predict calls
# are fed; this avoids a dense copy and the deprecated `.A` attribute.
tfidf_mnb = MultinomialNB()
tfidf_mnb.fit(tfidf_train, y_train)

In [88]:
# Predict test labels from the sparse TF-IDF matrix.
predict_tfidf_test = tfidf_mnb.predict(tfidf_test)

In [89]:
# Accuracy of the TF-IDF model on the held-out test split.
accuracy_tfidf_test = accuracy_score(y_true=y_test, y_pred=predict_tfidf_test)
accuracy_tfidf_test

0.8403

In [90]:
# Predict training labels from the sparse TF-IDF matrix.
predict_tfidf_train = tfidf_mnb.predict(tfidf_train)

In [91]:
# Accuracy of the TF-IDF model on the training split, for comparison with the test score.
accuracy_tfidf_train = accuracy_score(y_true=y_train, y_pred=predict_tfidf_train)
accuracy_tfidf_train

0.8428666666666667

#### ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++

#### ngrams

In [92]:
from nltk.util import ngrams

In [93]:
def splitting_dataframe(data):
    """Split the string ``data`` on whitespace and return the list of tokens."""
    return data.split()
# Token lists for the preprocessed *test* reviews.
# NOTE(review): this rebinds `data`, which earlier held the DataFrame read from
# Train.csv; re-running earlier cells after this one will see the Series instead.
data = clean_text_test.apply(splitting_dataframe)
data

5406     [slasher, well, like, horror, definitely, one,...
29810    [based, elmore, leonard, violent, intelligent,...
9966     [fourth, feature, film, marc, recha, third, ma...
27442    [famous, movie, subject, freudian, analysis, p...
8522     [former, submariner, one, worst, submarine, mo...
                               ...                        
28667    [one, best, film, seen, year, not, gwyneth, pa...
4983     [movie, really, nothing, besides, admittedly, ...
11444    [action, western, james, steart, lead, star, c...
74       [picking, along, rest, marx, brother, box, set...
17915    [film, funny, film, violence, bad, acting, wel...
Name: text, Length: 10000, dtype: object

In [94]:
# Token lists for the preprocessed *training* reviews.
# NOTE(review): `data2` is not referenced by the n-gram cells visible below — confirm it is still needed.
data2 = clean_text_train.apply(splitting_dataframe)
data2

27434    [firstly, huge, fan, crap, film, grade, always...
13895    [not, much, film, big, budget, child, televisi...
38835    [delightful, film, accompanied, composer, rach...
30654    [like, keep, review, short, simple, pretty, mu...
12278    [remember, hearing, movie, played, nearly, eve...
                               ...                        
31962    [film, grab, opening, scene, never, let, watch...
23452    [not, masterpiece, like, godfather, not, purpo...
23775    [jefferey, dahmer, one, sick, guy, not, much, ...
37135    [unfortunately, showing, star, movie, thailand...
27098    [johnny, dangerously, fall, completely, hit, m...
Name: text, Length: 30000, dtype: object

In [95]:
def ngram_list(data,ngram_range):
    """Return the n-grams of ``data`` as space-joined strings.

    ``data`` is a sequence of tokens and ``ngram_range`` is the n-gram size
    handed to ``nltk.util.ngrams``; each resulting tuple of tokens is joined
    with single spaces.
    """
    return [" ".join(gram) for gram in ngrams(data, ngram_range)]


In [96]:
# 1-grams for every tokenized test review.
unigrams = data.apply(ngram_list, args=(1,))
unigrams

5406     [slasher, well, like, horror, definitely, one,...
29810    [based, elmore, leonard, violent, intelligent,...
9966     [fourth, feature, film, marc, recha, third, ma...
27442    [famous, movie, subject, freudian, analysis, p...
8522     [former, submariner, one, worst, submarine, mo...
                               ...                        
28667    [one, best, film, seen, year, not, gwyneth, pa...
4983     [movie, really, nothing, besides, admittedly, ...
11444    [action, western, james, steart, lead, star, c...
74       [picking, along, rest, marx, brother, box, set...
17915    [film, funny, film, violence, bad, acting, wel...
Name: text, Length: 10000, dtype: object

In [97]:
# 2-grams for every tokenized test review.
bigrams = data.apply(ngram_list, args=(2,))
bigrams

5406     [slasher well, well like, like horror, horror ...
29810    [based elmore, elmore leonard, leonard violent...
9966     [fourth feature, feature film, film marc, marc...
27442    [famous movie, movie subject, subject freudian...
8522     [former submariner, submariner one, one worst,...
                               ...                        
28667    [one best, best film, film seen, seen year, ye...
4983     [movie really, really nothing, nothing besides...
11444    [action western, western james, james steart, ...
74       [picking along, along rest, rest marx, marx br...
17915    [film funny, funny film, film violence, violen...
Name: text, Length: 10000, dtype: object

In [98]:
# 3-grams for every tokenized test review.
trigrams = data.apply(ngram_list, args=(3,))
trigrams

5406     [slasher well like, well like horror, like hor...
29810    [based elmore leonard, elmore leonard violent,...
9966     [fourth feature film, feature film marc, film ...
27442    [famous movie subject, movie subject freudian,...
8522     [former submariner one, submariner one worst, ...
                               ...                        
28667    [one best film, best film seen, film seen year...
4983     [movie really nothing, really nothing besides,...
11444    [action western james, western james steart, j...
74       [picking along rest, along rest marx, rest mar...
17915    [film funny film, funny film violence, film vi...
Name: text, Length: 10000, dtype: object

In [100]:
# 4-grams for every tokenized test review.
quadragrams = data.apply(ngram_list, args=(4,))
quadragrams

5406     [slasher well like horror, well like horror de...
29810    [based elmore leonard violent, elmore leonard ...
9966     [fourth feature film marc, feature film marc r...
27442    [famous movie subject freudian, movie subject ...
8522     [former submariner one worst, submariner one w...
                               ...                        
28667    [one best film seen, best film seen year, film...
4983     [movie really nothing besides, really nothing ...
11444    [action western james steart, western james st...
74       [picking along rest marx, along rest marx brot...
17915    [film funny film violence, funny film violence...
Name: text, Length: 10000, dtype: object

In [101]:
# 5-grams for every tokenized test review.
pentaagrams = data.apply(ngram_list, args=(5,))
pentaagrams

5406     [slasher well like horror definitely, well lik...
29810    [based elmore leonard violent intelligent, elm...
9966     [fourth feature film marc recha, feature film ...
27442    [famous movie subject freudian analysis, movie...
8522     [former submariner one worst submarine, submar...
                               ...                        
28667    [one best film seen year, best film seen year ...
4983     [movie really nothing besides admittedly, real...
11444    [action western james steart lead, western jam...
74       [picking along rest marx brother, along rest m...
17915    [film funny film violence bad, funny film viol...
Name: text, Length: 10000, dtype: object

In [103]:
# 6-grams for every tokenized test review.
hexagrams = data.apply(ngram_list, args=(6,))
hexagrams

5406     [slasher well like horror definitely one, well...
29810    [based elmore leonard violent intelligent acti...
9966     [fourth feature film marc recha third, feature...
27442    [famous movie subject freudian analysis posses...
8522     [former submariner one worst submarine movie, ...
                               ...                        
28667    [one best film seen year not, best film seen y...
4983     [movie really nothing besides admittedly serie...
11444    [action western james steart lead star, wester...
74       [picking along rest marx brother box, along re...
17915    [film funny film violence bad acting, funny fi...
Name: text, Length: 10000, dtype: object

In [107]:
# 7-grams for every tokenized test review; show those for the review labelled 5406.
heptagrams = data.apply(ngram_list, args=(7,))
heptagrams.loc[5406]

['slasher well like horror definitely one see',
 'well like horror definitely one see otherwise',
 'like horror definitely one see otherwise not',
 'horror definitely one see otherwise not even',
 'definitely one see otherwise not even completely',
 'one see otherwise not even completely obvious',
 'see otherwise not even completely obvious film',
 'otherwise not even completely obvious film extremely',
 'not even completely obvious film extremely low',
 'even completely obvious film extremely low budget',
 'completely obvious film extremely low budget instance',
 'obvious film extremely low budget instance look',
 'film extremely low budget instance look entire',
 'extremely low budget instance look entire film',
 'low budget instance look entire film shot',
 'budget instance look entire film shot warehouse',
 'instance look entire film shot warehouse somewhere',
 'look entire film shot warehouse somewhere numerous',
 'entire film shot warehouse somewhere numerous occasion',
 'film sh

In [None]:
# NOTE(review): this unexecuted cell repeats the heptagram cell directly above it; it looks safe to delete.
heptagrams = data.apply(lambda x : ngram_list(x,7))
heptagrams[5406]