In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import confusion_matrix, accuracy_score,f1_score,recall_score,precision_score, fbeta_score
import xgboost as xgb
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/simran.tyagi/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [2]:
#!pip install ftfy
#!pip install scrubadub
#!pip install demoji
#!pip install tabulate
#!pip install wordcloud
#pip install xgboost

In [3]:
def read_file():
    df = pd.read_csv('/home/simran.tyagi/Downloads/panasonic_v1_f.csv')
    df = df[['text', 'Complaint']]
    return df


In [4]:
from nltk.corpus import wordnet as wn
from nltk.stem import WordNetLemmatizer
import re
import ftfy
import scrubadub
# import string
from tabulate import tabulate
import demoji
# import emoji
from nltk.corpus import stopwords


stop = stopwords.words('english')
emailDetector = scrubadub.Scrubber(detector_list=[scrubadub.detectors.EmailDetector])
wnl = WordNetLemmatizer()

label_codes = {'No': 0, 'Yes': 1}
t_handle_regex = r'(^|[^@\w])@(\w{1,15})\b'
t_hashtag_regex = r"#(\w+)"
t_url_regex = r"https?://\S+|www\.\S+"
t_markup_regex = r"<(\"[^\"]*\"|'[^']*'|[^'\">])*>"
t_handle_placeholder = ' {{HANDLE}}'
t_hashtag_placeholder = ' {{HASHTAG}}'
t_url_placeholder = '{{URL}}'
t_markup_placeholder = '{{MARKUP}}'
emoji_placeholder = '{{EMOJI}}'
# domain specific stopwords.
stop.extend(['panasonic'])

# table = str.maketrans("", "")


def penn_to_wn(tag):
    def is_noun(tag):
        return tag in ['NN', 'NNS', 'NNP', 'NNPS']

    def is_verb(tag):
        return tag in ['VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ']

    def is_adverb(tag):
        return tag in ['RB', 'RBR', 'RBS']

    def is_adjective(tag):
        return tag in ['JJ', 'JJR', 'JJS']

    # Pos tags to wn tags
    if is_adjective(tag):
        return wn.ADJ
    elif is_noun(tag):
        return wn.NOUN
    elif is_adverb(tag):
        return wn.ADV
    elif is_verb(tag):
        return wn.VERB
    return None

def to_lower_case(text):
    return text.lower()

def fix_unicode(text):
    return ftfy.fix_text(text)

def replace_email(text):
    return emailDetector.clean(text)

def remove_stop_words(text):
    return ' '.join([word for word in text.split() if word not in (stop)])

def convert_emoji_to_text(text):
    return text

def replace_user_name(text):
    return re.sub(t_handle_regex, t_handle_placeholder, text)

def replace_hashtags(text):
    return re.sub(t_hashtag_regex, t_hashtag_placeholder,text)

def replace_url(text):
    return re.sub(t_url_regex, t_url_placeholder, text)

def replace_markup(text):
    return re.sub(t_markup_regex, t_markup_placeholder,text)

def remove_punctuations(text):
    return re.sub(r'[^\w\s]', '', text)

def replace_emoji_with_code(text):
    demoji.replace(text, repl=emoji_placeholder)
    return demoji.replace_with_desc(text)

def get_stats(step, df):
    corpus = " ".join(list(df['text']))
    total_words = len(corpus.split(' '))
    unique_words = len(set(corpus.split(' ')))
    return [step, total_words, unique_words]

def lemmatize(text):
    default_wn_tag = 'n'
    tokens = text.split(' ')
    pos_tags = nltk.pos_tag(tokens)
    wn_tags = [penn_to_wn(tag) for (w, tag) in pos_tags]
    # print(list(zip(pos_tags, wn_tags)))
    lemmas = [wnl.lemmatize(token, tag or default_wn_tag) for (token, tag) in list(zip(tokens, wn_tags))]
    return ' '.join(lemmas)

stats = [['Step', 'Total words', 'Unique words']]
df = read_file()
stats.append(get_stats('Start', df))
df = df.replace(label_codes)

df['text'] = df['text'].apply(lambda text: text.lower())
stats.append(get_stats('Lower', df))

df['text'] = df['text'].apply(lambda text: lemmatize(text))
stats.append(get_stats('Lemmatize', df))

df['text'] = df['text'].apply(fix_unicode)
stats.append(get_stats('Unicode Fix', df))

df['text'] = df['text'].apply(replace_emoji_with_code)
stats.append(get_stats('Replace emoji', df))

df['text'] = df['text'].apply(remove_stop_words)
stats.append(get_stats('Stop words', df))

df['text'] = df['text'].apply(replace_email)
stats.append(get_stats('Email Replace', df))

df['text'] = df['text'].apply(replace_user_name)
stats.append(get_stats('UserName replace', df))

df['text'] = df['text'].apply(replace_hashtags)
stats.append(get_stats('HashTags Replace', df))

df['text'] = df['text'].apply(replace_url)
stats.append(get_stats('URL Replace', df))

df['text'] = df['text'].apply(replace_markup)
stats.append(get_stats('MARKUP Replace', df))

df['text'] = df['text'].apply(remove_punctuations)
stats.append(get_stats('Remove punctuation', df))


print(tabulate(stats))


------------------  -----------  ------------
Step                Total words  Unique words
Start               77684        20298
Lower               77684        18224
Lemmatize           77684        17402
Unicode Fix         77691        17336
Replace emoji       78144        17451
Stop words          59345        16833
Email Replace       59345        16826
UserName replace    60034        15499
HashTags Replace    75185        11377
URL Replace         75185        9179
MARKUP Replace      75185        9178
Remove punctuation  75185        7210
------------------  -----------  ------------


In [5]:

tfidf = TfidfVectorizer(lowercase=False, max_df=.8, min_df=0.01)  
tfidf_wm = tfidf.fit_transform(df['text']).toarray() 
x_train, x_test, y_train, y_test = train_test_split(tfidf_wm, df['Complaint'], random_state = 42) 


In [6]:
model_x = xgb.XGBClassifier(use_label_encoder=False,
 learning_rate = 0.35, ## helps in avoding the overfitting problem and increase in learning rate affects TP and FN                            
 n_estimators=56,## number of trees to be built and increase in n_estimators affects TP and FP
 max_depth=3,##  represents the depth of each tree and increase in max_depth affects TN and FN
 subsample=0.8, # for each tree the % of rows taken to build the tree and increase in subsample affects TN and FN 
 min_child_weight=1, ##Defines the minimum sum of weights of all observations required in a child and increase in min_child_weight affects TN and FN 
 reg_alpha = 0.6,## penalizes the features which increase cost function and high value of reg_alpha affects TN and FN rate 
 reg_lambda = 0.6,## encourages the weights to be small and increase in reg_lambda affects TN and FN
 seed = 42).fit(x_train, y_train)



In [9]:
pred_test_x = model_x.predict(x_test)
print("Accuracy Score:",round(accuracy_score(y_test,pred_test_x),3))
print("Recall-Score:",round(recall_score(y_test,pred_test_x),3))
print("Precision-Score:",round(precision_score(y_test,pred_test_x),3))
print("F1-score:",round(f1_score(y_test,pred_test_x),3))
print("F2-Score:",round(fbeta_score(y_test,pred_test_x,beta = 2),2))

Accuracy Score: 0.957
Recall-Score: 0.817
Precision-Score: 0.873
F1-score: 0.844
F2-Score: 0.83


In [10]:
c_matrix = confusion_matrix(y_test, pred_test_x)
print('Confusion matrix\n\n', c_matrix)

print('\nTrue Positives(TP) = ', c_matrix[0,0])

print('\nTrue Negatives(TN) = ', c_matrix[1,1])

print('\nFalse Positives(FP) = ', c_matrix[0,1])

print('\nFalse Negatives(FN) = ', c_matrix[1,0])

Confusion matrix

 [[640  13]
 [ 20  89]]

True Positives(TP) =  640

True Negatives(TN) =  89

False Positives(FP) =  13

False Negatives(FN) =  20
