In [1]:
# Python ≥3.5 is required
import sys
assert sys.version_info >= (3, 5)

# Scikit-Learn ≥0.20 is required
import sklearn
assert sklearn.__version__ >= "0.20"
from sklearn.model_selection import train_test_split

# Common imports
import numpy as np
import pandas as pd
import os

# to make this notebook's output stable across runs
np.random.seed(42)

# To plot pretty figures
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt
mpl.rc('axes', labelsize=14)
mpl.rc('xtick', labelsize=12)
mpl.rc('ytick', labelsize=12)
import nltk
from nltk.corpus import stopwords
from nltk.corpus import words
from nltk.tokenize import word_tokenize
from nltk.corpus.reader.wordnet import *
import re
import contractions
import string
from sklearn.metrics import classification_report

In [2]:
data_train = pd.read_csv("train.csv",encoding='ISO-8859–1')
data_test = pd.read_csv("test.csv",encoding='ISO-8859–1')
chat_expressions = pd.read_csv('chat_expressions.csv', sep=',')
chat_expressions_dict = dict(zip(chat_expressions.Chat_Words, chat_expressions.Chat_Words_Extended))

In [3]:
# analyze outputs
data_train['target'].value_counts()
inactive = len(data_train[data_train['target'] == 0])
active = len(data_train[data_train['target'] == 1])
class_distribution_ratio = inactive/active
print(class_distribution_ratio)
data_train['target'].value_counts()/len(data_train['target'])

1.3274228064811984


target
0    0.57034
1    0.42966
Name: count, dtype: float64

In [4]:
stop_words = set(stopwords.words('english'))
wn = nltk.WordNetLemmatizer()
worddict = set(nltk.corpus.words.words())
english_punctuations = string.punctuation
punctuations_list = english_punctuations
def remove_html(text):
    html = re.compile(r'<.*?>|&([A-Za-z0-9]+|#[0-9]{1,6}|#x[0-9a-fA-F]{1,6});')
    return re.sub(html, '', text)
def remove_url_func(text):
    return re.sub(r'https?://\S+|www\.\S+', '', text)
def remove_tweet_funct(text):
    text = re.sub(r'\@\w+|\#', '', text) #removes @mentions #removes hastag '#' symbol
    text = re.sub('RT[\s]+','',text)
    text = re.sub('\n',' ',text)
    return text
def remove_accented_chars_func(text):
    return ''.join([c for c in text if ord(c) < 128])

def load_dict_emoticons():
    
    return {
        r";(-?)\)":r"wink",
        r":(-?)\){1,}":r"smiley",
        r":(-?)D{1,}":r"smiley",
        r":(-?)3{1,}":r"smiley",
        r":(-?)[oO]{1,}":r"surprise",
        r":(-?)\({1,}":r"sad",
        r":@":r"sad",
        r"((xD)|(XD))":r"laugh",
        r"((XP)|(xp))":r"playful",
        r":[pPÞþb]":r"playful",
        r";-;":r"sad",
        r"<3":r"love"
        }
emoticons=load_dict_emoticons()
def convert_emoticons(text):
    for item in emoticons.keys():
        text=re.sub(r"(\s)|(\b)"+item,(" "+emoticons[item]),text)
    return text
def remove_emoticons(text):
    text=re.sub(r'' + r'|'.join("(\s"+k+")" for k in emoticons),r" ",text)
    return text
def clean_repeating_char(text):
    text=re.sub(r'([a-zA-Z])\1\1+', r'\1\1', text,flags=re.I)#(.)\1{3,}
    text = re.sub(r'\blo+l\b',"lol",text,flags=re.I)
    text = re.sub(r'go+a+l+','goal',text)
    text = re.sub(r'\bso+\b',"so",text,flags=re.I)
    text = re.sub(r'\bno+\b',"no",text,flags=re.I)
    text = re.sub(r'\bo+h+o*\b','oh',text,flags=re.I)
    text = re.sub(r'\ba+\b','a',text,flags=re.I)
    text = re.sub(r'\ba+ll\b','all',text,flags=re.I)
    text = re.sub(r'\ba+n+\b','an',text,flags=re.I)
    text = re.sub(r'\ba+n+d+\b','and',text,flags=re.I)
    text=re.sub(r'\bab+andon','abandon',text,flags=re.I)
    return text
def chat_words_to_norm_words_func(text):
    '''
    Replaces common chat expressions with their spelled out form
    
    Args:
        text (str): String to which the function is to be applied, string
    
    Returns:
        Clean string with replaced chat expressions
    ''' 
    return re.sub(r'\S+', lambda m: chat_expressions_dict.get(m.group().upper(), m.group()) , text)
def slang_spelling(text):
    text = re.sub(r"w/e", "whatever", text)
    text = re.sub(r"w/", "with", text)
    text = re.sub(r"U.S.", "America ", text)
    text = re.sub(r"\bUS\b", "America ", text)
    text = re.sub(r"recentlu", "recently", text)
    text = re.sub(r"Ph0tos", "Photos", text)
    text = re.sub(r"amirite", "am I right", text)
    text = re.sub(r"exp0sed", "exposed", text)
    text = re.sub(r"amageddon", "armageddon", text)
    text = re.sub(r"Trfc", "Traffic", text)
    text = re.sub(r"WindStorm", "Wind Storm", text)   
    text = re.sub(r"TRAUMATISED", "traumatized", text)
    text = re.sub(r"2k([0-9])([0-9])", r"20\1\2", text,flags=re.I)
    text = re.sub(r"\bwut\b", r"what", text,flags=re.I)
    text = re.sub(r"\byday\b", r"yesterday", text,flags=re.I)
    text = re.sub(r"\$hit", r"shit", text,flags=re.I)
    text = re.sub(r"\bint\'l\b", r"international", text)
    text = re.sub(r"\bgon\b", r"going to", text,flags=re.I)
    text = re.sub(r"\byday\b", r"yesterday", text,flags=re.I)
    text = re.sub(r'\baccidently\b',"accidentally",text)
    text = re.sub(r"\bbe4\b",'before',text)
    text = re.sub(r"\bb4federal\b",'before federal',text)
    text = re.sub(r"\bStree\b",'street',text)
    text = re.sub(r"\b(I-)(\d)",r'Intersate \2',text)
    text=re.sub(r"(\d+)(\s?)yr",r"\1 year",text)
    text = chat_words_to_norm_words_func(text)
    text = re.sub(r"\bS3XLEAK",'SEX LEAK',text)
    text = re.sub(r"\blil\b",' little ',text)
    text = re.sub(r"\balil\b",' a little ',text)
    text = re.sub(r"\btxt\b",' text',text,flags=re.I)
    text = re.sub(r"\btyre\b",'tire',text,flags=re.I)
    text = re.sub(r"\bsmh\b",'Shaking my head',text,flags=re.I)
    text = re.sub(r"view and download video",' ',text)
    text = re.sub(r"\bvia\b",' ',text,flags=re.I)
    return text
def clean_contractions(text):
    text=contractions.fix(text)
    text = re.sub(r"\bcant\b",'can not',text)
    text = re.sub(r"\bwont\b",'will not',text)
    text = re.sub(r"\bim\b",' I am ',text)
    text = re.sub(r"\bdidnt\b",' did not ',text)
    text = re.sub(r"\bcouldnt\b",'could not',text)
    text = re.sub(r"\bisnt\b",' is not ',text)
    text = re.sub(r"\bdont\b",' do not ',text)
    return text
def remove_irr_char_func(text):
    text=re.sub(r'\b\d{1,2}:\d{2}(\s?)(AM|PM|am|pm)'," ",text)
    text=re.sub(r'\b\d{1,2}(\s?)(AM|PM|am|pm)'," ",text)
    text=re.sub(r'\b\d{1,}(\s?)(th|TH|Th)'," ",text)
    text=re.sub(r'[wW]--(=-){1,}\['," ",text)
    text=re.sub(r'[^a-zA-Z]', ' ', text)
    return text
def remove_extra_whitespaces_func(text):
    return re.sub(r'^\s*|\s\s*', ' ', text).strip()
def clean_text(text):
    text=remove_html(text)
    text=remove_url_func(text)
    text=remove_tweet_funct(text)
    text=remove_accented_chars_func(text)
    text=remove_emoticons(text)
    text=clean_repeating_char(text)
    text=slang_spelling(text)
    text=clean_contractions(text)
    text=remove_irr_char_func(text)
    text=remove_extra_whitespaces_func(text)
    return text
def preprocessing(text):
    new_text = clean_text(text)
    tokens = word_tokenize(new_text.lower().strip())
#     new_text=" ".join([wn.lemmatize(w) for w in tokens if not w in stop_words])
#     return new_text
    wordset_n = set(wn.lemmatize(w, NOUN) for w in tokens)
    wordset_v = set(wn.lemmatize(w, VERB) for w in wordset_n)
    wordset = set(wn.lemmatize(w, ADJ) for w in wordset_v)
    wordset = wordset & worddict
    return ' '.join(list(wordset))


In [5]:
import emot
text="cool .;"
emot_obj = emot.emot()
emot_obj.emoticons(text)
def check_if_emoticon_present(text):
    emot_obj = emot.emot()
    if emot_obj.emoticons(text)['flag']:
        return 1
    else:
        return 0
check_if_emoticon_present(text)

0

In [6]:
data_train['clean_text']=data_train['text'].apply(lambda x: clean_text(x))
data_test['clean_text']=data_test['text'].apply(lambda x: clean_text(x))
data_train['new_text']=data_train['text'].apply(lambda x: preprocessing(x))
data_test['new_text']=data_test['text'].apply(lambda x: preprocessing(x))
data_train.to_csv('tem_train.csv',encoding='ISO-8859–1',index=False)
data_test.to_csv('tem_test.csv',encoding='ISO-8859–1',index=False)

In [7]:
features='new_text'#['new_text','keyword_target','location_clean_target']
x_train, x_test, y_train, y_test = train_test_split(
    data_train[features],
    data_train['target'],stratify=data_train['target'],
    test_size=0.25,
    random_state=2)

In [8]:
from sklearn.naive_bayes import BernoulliNB
from sklearn import metrics
from sklearn.metrics import confusion_matrix
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf_vect = TfidfVectorizer(analyzer='word',ngram_range=(1, 1))
x_train_tf = tfidf_vect.fit_transform(x_train)
x_pre_tf = tfidf_vect.transform(x_test)
x_test_tf = tfidf_vect.transform(data_test['new_text'])

In [9]:
for item in tfidf_vect.get_feature_names_out():
    print(item)

aba
abandon
ability
abject
ablaze
able
aboard
abomination
abortion
about
abouts
above
absence
absolutely
abstract
absurdly
abuse
accept
access
accident
accidentally
accompany
accord
account
accuse
accustom
ache
achieve
achievement
acid
acne
acoustic
acquiesce
acquire
acre
acronym
across
acrylic
act
actin
action
activate
active
actively
activist
activity
actor
actress
actual
actually
acute
ad
add
addict
addiction
addition
address
adjust
adjustable
adjuster
administration
administrative
admit
adopt
adoption
adoptive
adult
advance
advantage
adventure
advertise
advice
advise
advisory
aeroplane
aesthetic
affair
affect
affiliate
affiliation
affliction
afloat
afraid
after
afterlife
aftermath
afternoon
aftershock
again
against
agalloch
age
agency
agent
aggression
aggressive
aggressively
agnus
ago
agony
agree
agreement
ah
ahead
ai
aid
aim
air
aircraft
airhead
airlift
airplane
airport
aisle
ak
aka
al
ala
alameda
alarm
alarmingly
alba
albeit
album
alchemist
alcohol
alcoholism
alec
alert
algae
ali

In [10]:
print(len(tfidf_vect.get_feature_names_out()))

5603


In [11]:
BNBmodel = BernoulliNB(alpha=1.07,fit_prior=True, class_prior=None)# alpha=1.05
BNBmodel.fit(x_train_tf, y_train) # train the classifier
predicted = BNBmodel.predict(x_pre_tf)
cr5    = classification_report(y_test,predicted)
print(cr5)
metrics.accuracy_score(list(y_test), predicted)

              precision    recall  f1-score   support

           0       0.80      0.87      0.83      1086
           1       0.81      0.71      0.76       818

    accuracy                           0.80      1904
   macro avg       0.80      0.79      0.79      1904
weighted avg       0.80      0.80      0.80      1904



0.8025210084033614

In [12]:
metrics.f1_score(list(y_test), predicted)

0.7552083333333334

In [13]:
cm = confusion_matrix(list(y_test), predicted)
cm

array([[948, 138],
       [238, 580]], dtype=int64)

In [14]:
predicted =BNBmodel.predict(x_test_tf)
pd.DataFrame({'id':data_test['id'],'target':predicted}).to_csv('submission_b_nb.csv',index=False)

In [15]:
data1=pd.read_csv('submission_b_nb.csv')
perfect_score=pd.read_csv('perfect_submission.csv')
metrics.accuracy_score(list(perfect_score['target']), list(data1['target']))

0.7903769537235673

In [16]:
metrics.f1_score(list(perfect_score['target']), list(data1['target']))

0.7359073359073358

In [17]:
cr5 = classification_report(list(perfect_score['target']), list(data1['target']))
print(cr5)
cm = confusion_matrix(list(perfect_score['target']), list(data1['target']))
cm

              precision    recall  f1-score   support

           0       0.78      0.87      0.83      1861
           1       0.80      0.68      0.74      1402

    accuracy                           0.79      3263
   macro avg       0.79      0.78      0.78      3263
weighted avg       0.79      0.79      0.79      3263



array([[1626,  235],
       [ 449,  953]], dtype=int64)

In [18]:
incorrect_rows=data1.loc[perfect_score['target']!=data1['target']]['id'].tolist()
print(len(incorrect_rows))

684


In [19]:
data_test[data_test['id'].isin(incorrect_rows)].to_csv('error_test.csv',encoding='ISO-8859–1',index=False)

In [21]:
test="'Since1970the 2 biggest depreciations in CAD:USD in yr b4federal election coincide w/landslide win for opposition' http://t.co/wgqKXmby3B"
re.sub(r"\bb4federal\b","before federal",test)

"'Since1970the 2 biggest depreciations in CAD:USD in yr before federal election coincide w/landslide win for opposition' http://t.co/wgqKXmby3B"