# TF-IDF

In [185]:
import math
import nltk

from nltk import sent_tokenize, word_tokenize, PorterStemmer, pos_tag
from nltk.corpus import stopwords

nltk.download('punkt');
nltk.download('stopwords');
nltk.download('averaged_perceptron_tagger');

[nltk_data] Downloading package punkt to /Users/qa/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /Users/qa/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/qa/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


### 1. Tokenize the sentences

In [2]:
f = open("original_text.txt", "r")
text = f.read()
sentences = sent_tokenize(text)
total_documents = len(sentences)

### 2. Create the Frequency matrix of the words in each sentence

In [3]:
def _create_frequency_matrix(sentences):
    frequency_matrix = {}
    stopWords = set(stopwords.words("english"))
    ps = PorterStemmer()

    for sent in sentences:
        freq_table = {}
        words = word_tokenize(sent)
        for word in words:
            word = word.lower()
            word = ps.stem(word)
            if word in stopWords:
                continue

            if word in freq_table:
                freq_table[word] += 1
            else:
                freq_table[word] = 1

        frequency_matrix[sent[:15]] = freq_table

    return frequency_matrix

freq_matrix = _create_frequency_matrix(sentences);
# print(freq_matrix);

### 3. Calculate TermFrequency and generate a matrix

In [4]:
def _create_tf_matrix(freq_matrix):
    tf_matrix = {}

    for sent, f_table in freq_matrix.items():
        tf_table = {}

        count_words_in_sentence = len(f_table)
        for word, count in f_table.items():
            tf_table[word] = count / count_words_in_sentence

        tf_matrix[sent] = tf_table

    return tf_matrix

tf_matrix = _create_tf_matrix(freq_matrix)

### 4. Creating table for documents per words

In [5]:
def _create_documents_per_words(freq_matrix):
    word_per_doc_table = {}

    for sent, f_table in freq_matrix.items():
        for word, count in f_table.items():
            if word in word_per_doc_table:
                word_per_doc_table[word] += 1
            else:
                word_per_doc_table[word] = 1

    return word_per_doc_table

count_doc_per_words = _create_documents_per_words(freq_matrix)

### 5. Calculate IDF and generate a matrix

In [6]:
def _create_idf_matrix(freq_matrix, count_doc_per_words, total_documents):
    idf_matrix = {}

    for sent, f_table in freq_matrix.items():
        idf_table = {}

        for word in f_table.keys():
            idf_table[word] = math.log10(total_documents / float(count_doc_per_words[word]))

        idf_matrix[sent] = idf_table

    return idf_matrix

idf_matrix = _create_idf_matrix(freq_matrix, count_doc_per_words, total_documents)

### 6. Calculate TF-IDF and generate a matrix

In [7]:
def _create_tf_idf_matrix(tf_matrix, idf_matrix):
    tf_idf_matrix = {}

    for (sent1, f_table1), (sent2, f_table2) in zip(tf_matrix.items(), idf_matrix.items()):

        tf_idf_table = {}

        for (word1, value1), (word2, value2) in zip(f_table1.items(),
                                                    f_table2.items()):  # here, keys are the same in both the table
            tf_idf_table[word1] = float(value1 * value2)

        tf_idf_matrix[sent1] = tf_idf_table

    return tf_idf_matrix

tf_idf_matrix = _create_tf_idf_matrix(tf_matrix, idf_matrix)

### 7. Important Algorithm: score the sentences

In [8]:
def _score_sentences(tf_idf_matrix) -> dict:
    """
    score a sentence by its word's TF
    Basic algorithm: adding the TF frequency of every non-stop word in a sentence divided by total no of words in a sentence.
    :rtype: dict
    """

    sentenceValue = {}

    for sent, f_table in tf_idf_matrix.items():
        total_score_per_sentence = 0

        count_words_in_sentence = len(f_table)
        for word, score in f_table.items():
            total_score_per_sentence += score

        sentenceValue[sent] = total_score_per_sentence / count_words_in_sentence

    return sentenceValue

sentence_scores = _score_sentences(tf_idf_matrix)

### 8. Find the threshold

In [9]:
def _find_average_score(sentenceValue) -> int:
    """
    Find the average score from the sentence value dictionary
    :rtype: int
    """
    sumValues = 0
    for entry in sentenceValue:
        sumValues += sentenceValue[entry]

    # Average value of a sentence from original summary_text
    average = (sumValues / len(sentenceValue))

    return average

threshold = _find_average_score(sentence_scores)

### 9. Important Algorithm: Generate the summary

In [10]:
def _generate_summary(sentences, sentenceValue, threshold):
    sentence_count = 0
    summary = ''

    for sentence in sentences:
        if sentence[:15] in sentenceValue and sentenceValue[sentence[:15]] >= (threshold):
            summary += " " + sentence
            sentence_count += 1

    return summary

summary = _generate_summary(sentences, sentence_scores, 1.3 * threshold)
print(summary);

 Have you experienced this before? Who is right and who is wrong? Neither. It was at that point their biggest breakthrough came. Perhaps all those years of perseverance finally paid off. It must come from within you. Where are you settling in your life right now? Could you be you playing for bigger stakes than you are? So become intentional on what you want out of life. Commit to it. Nurture your dreams.


# Tweet Analysis

## Import

In [213]:
%matplotlib inline
import matplotlib.pyplot as plt

import pandas as pd
import numpy as np
import re

from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold, cross_val_score, learning_curve, KFold
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC, LinearSVC
from sklearn.metrics import classification_report, f1_score, accuracy_score, confusion_matrix, roc_curve
from sklearn.pipeline import Pipeline
from sklearn.tree import DecisionTreeClassifier

from joblib import dump, load

In [192]:
# df = pd.read_csv('df_20.csv')
df = pd.read_csv('df_9.csv')
print(df.head());

# 0 -> hate speech
# 1 -> offensive language
# 2 -> neither

   Unnamed: 0  count  hate_speech  offensive_language  neither  class  \
0           0      3            0                   0        3      2   
1           1      3            0                   3        0      1   
2           2      3            0                   3        0      1   
3           3      3            0                   2        1      1   
4           4      6            0                   6        0      1   

                                               tweet  
0  !!! RT @mayasolovely: As a woman you shouldn't...  
1  !!!!! RT @mleew17: boy dats cold...tyga dwn ba...  
2  !!!!!!! RT @UrKindOfBrand Dawg!!!! RT @80sbaby...  
3  !!!!!!!!! RT @C_G_Anderson: @viva_based she lo...  
4  !!!!!!!!!!!!! RT @ShenikaRoberts: The shit you...  


## Data Preprocessing
1. Transform class column to binary classification: hate speech or not hate speech
2. Remove stopwords, tweet handles/mentions and punctuation
    - excludes [ 'rt' , '&#57361;' ]
3. Lowercase and lemmatize words
4. Use unigrams and bigrams
5. Part-of-speech tags

In [195]:
df['isHateSpeech'] = df['class'].map(lambda x: 0 if x==0 else 1)

In [196]:
df.head()

Unnamed: 0.1,Unnamed: 0,count,hate_speech,offensive_language,neither,class,tweet,isHateSpeech
0,0,3,0,0,3,2,!!! RT @mayasolovely: As a woman you shouldn't...,1
1,1,3,0,3,0,1,!!!!! RT @mleew17: boy dats cold...tyga dwn ba...,1
2,2,3,0,3,0,1,!!!!!!! RT @UrKindOfBrand Dawg!!!! RT @80sbaby...,1
3,3,3,0,2,1,1,!!!!!!!!! RT @C_G_Anderson: @viva_based she lo...,1
4,4,6,0,6,0,1,!!!!!!!!!!!!! RT @ShenikaRoberts: The shit you...,1


In [226]:
all_stopwords = stopwords.words('english')
excludes = ['rt', '&#57361;']
all_stopwords.extend(excludes)

def preprocess(tweet: str):
    space_pattern = '\s+'
    url_regex = ('http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|'
        '[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+')
#     mention_regex = '@[\w\-]+'
    mention_regex = '@[^\s]+'
    symbol_regex = '&#[^\s]+'
    
    parsed_tweet = tweet.lower()
    parsed_tweet = re.sub(space_pattern, ' ', parsed_tweet)
    parsed_tweet = re.sub(url_regex, 'URLHERE', parsed_tweet)
    parsed_tweet = re.sub(symbol_regex, ' ', parsed_tweet)
    parsed_tweet = re.sub(mention_regex, 'MENTIONHERE', parsed_tweet)

#     words = word_tokenize(parsed_tweet)
    
#     filtered_words = [word for word in words if not word in all_stopwords and word.isalnum()]
#     porter = PorterStemmer()
#     stemmed = [porter.stem(word) for word in filtered_words if word not in ['URLHERE', 'MENTIONHERE']]
    
#     pos = pos_tag(filtered_words)
    
    return parsed_tweet


def stem_words(tweet: str):
    words = word_tokenize(tweet)
    filtered_words = [word for word in words if not word in all_stopwords and word.isalnum()]
    porter = PorterStemmer()
    return [porter.stem(word) for word in filtered_words if word not in ['URLHERE', 'MENTIONHERE']]

In [228]:
df.iloc[4]['tweet']

'!!!!!!!!!!!!! RT @ShenikaRoberts: The shit you hear about me might be true or it might be faker than the bitch who told it to ya &#57361;'

In [229]:
pd.set_option('display.max_colwidth', 100)
df.tweet.head().apply(preprocess).apply(stem_words)

0      [woman, complain, clean, hous, amp, man, alway, take, trash]
1    [boy, dat, cold, tyga, dwn, bad, cuffin, dat, hoe, 1st, place]
2               [dawg, ever, fuck, bitch, start, cri, confus, shit]
3                                              [look, like, tranni]
4          [shit, hear, might, true, might, faker, bitch, told, ya]
Name: tweet, dtype: object

## Vectorize data
1. Bag of words model
2. Term frequency/ Inverse document frequence (tf-idf)
3. Normalize vectors to unit length

Vectorizer
- CountVectorizer
- TfdifVectorizer

In [232]:
X_train, X_test, y_train, y_test = train_test_split(df['tweet'], df['isHateSpeech'], test_size=0.2, random_state=42)

In [239]:
# Define a count vectorizer
count_vectorizer = CountVectorizer(
    preprocessor=preprocess,
    stop_words=all_stopwords,
    tokenizer=stem_words)
counts = count_vectorizer.fit_transform(X_train)

# Define a tfidf vectorizer
tfidf_vectorizer = TfidfVectorizer(preprocessor=preprocess,
                                   tokenizer=stem_words, 
                                   ngram_range=(1,3), 
                                   stop_words=all_stopwords,
                                   use_idf=True,
                                   smooth_idf=False,
                                   norm=None, #Applies l2 norm smoothing
                                   decode_error='replace',
                                   max_features=10000,
                                   min_df=5,
                                   max_df=0.501)
tfidfs = tfidf_vectorizer.fit_transform(X_train)



In [157]:
count_vectorizer.vocabulary_

{'well': 13698,
 'els': 4358,
 'white': 13792,
 'ppl': 9914,
 'get': 5415,
 'us': 13271,
 'forget': 5021,
 'horrif': 6313,
 'past': 9476,
 'paint': 9394,
 'pretti': 9967,
 'pictur': 9674,
 'ho': 6192,
 '8230': 738,
 'funni': 5244,
 'thing': 12555,
 'peopl': 9566,
 'see': 11071,
 'pic': 9662,
 'judg': 7019,
 'bird': 1879,
 'wrong': 14019,
 'nigga': 8875,
 'mess': 8241,
 'bitch': 1898,
 '128557': 340,
 '128514': 298,
 'ass': 1326,
 'nigggaaa': 8889,
 'real': 10343,
 'speak': 11762,
 'time': 12668,
 'tryna': 12958,
 'make': 7964,
 'look': 7754,
 'bad': 1506,
 'like': 7599,
 'eat': 4241,
 'pussi': 10148,
 'nah': 8680,
 'mimi': 8338,
 'basebal': 1624,
 'season': 11053,
 'win': 13874,
 'yanke': 14107,
 'love': 7808,
 'start': 11910,
 'alway': 1080,
 'hatin': 5967,
 'lmao': 7676,
 'nude': 9039,
 'call': 2479,
 'would': 13998,
 'rather': 10315,
 'hey': 6107,
 'girl': 5474,
 'send': 11098,
 'tit': 12687,
 'hoe': 6198,
 'babi': 1476,
 'cook': 3217,
 'bae': 1520,
 'dinner': 3854,
 'text': 12469,


In [247]:
X_train.values

array(['RT @FunSizedYogi: @TheBlackVoice well how else will white ppl get us to forget our horrific past other than to paint a pretty picture of ho&#8230;',
       "Funny thing is....it's not just the people doing it. It's the people who seeing these pics and judging the birds. Just as wrong.",
       'RT @winkSOSA: "@AintShitSweet__: "@Rakwon_OGOD: Nigga messed with the wrong bitch &#128557;&#128514;https://t.co/5mNXKVAYot" &#128557;&#128557;&#128557;&#128557;&#128514;&#128514;&#128557;&#128557;&#128514;&#128514;&#128514;&#128514;&#128514;&#128514;&#128514;"@Th_Real_Esco',
       ...,
       '#porn,#android,#iphone,#ipad,#sex,#xxx, | #Anal | Hardcore british queer anal pounding http://t.co/lRuEixMy21',
       "RT @JennyJohnsonHi5: Just when I thought Justin Bieber couldn't be anymore of a pussy, he gets arrested in Canada for fighting a person who&#8230;",
       'bitches ain&#8217;t shit, and they ain&#8217;t saying nothin&#8217;'],
      dtype=object)

In [259]:
pipeline = Pipeline([
#     ('vectorizer', count_vectorizer),
    ('tfidf', tfidf_vectorizer),
    ('classifier', MultinomialNB())
])

pipeline.fit(X_train.values, y_train.values);
dump(pipeline, 'model.joblib')

['model.joblib']

In [116]:
res = pipeline.predict(X_test)

In [250]:
pipeline.predict(["hi"])[0]

1

## Cross Validation

In [251]:
k_fold = KFold(n_splits=5)
scores = []
y_true = []
y_pred = []
probas = []
confusion = np.array([[0, 0], [0, 0]])

In [260]:
for train_indices, test_indices in k_fold.split(counts):
    train_text = df.iloc[train_indices]['tweet']
    train_y = df.iloc[train_indices]['isHateSpeech']

    test_text = df.iloc[test_indices]['tweet']
    test_y = df.iloc[test_indices]['isHateSpeech']

    pipeline.fit(train_text, train_y)
    predictions = pipeline.predict(test_text)
    prediction_probas = pipeline.predict_proba(test_text)

    confusion += confusion_matrix(test_y, predictions)
    score = accuracy_score(test_y, predictions)
    scores.append(score)
    
    y_true.extend(test_y)
    y_pred.extend(predictions.tolist())
    probas.extend(prediction_probas[:, 1].tolist())

In [261]:
dump(pipeline, 'model.joblib')

['model.joblib']

In [253]:
print('Total tweets classified:', len(df))
print('Accuracy:', sum(scores)/len(scores))
print('Confusion matrix:')
print(confusion)

Total tweets classified: 24783
Accuracy: 0.8526180478582452
Confusion matrix:
[[  692   504]
 [ 2418 16212]]


### Problems
- Model is predicting hate speech based on keyword only -> context is not considered
- Lack of hate speech data -> resampling, StratifiedKFold, SMOTE
- Synthesisis of new minority class instances
- Over-sampling of minority class
- Under-sampling of majority class
- Tweak the cost function to make misclassification of minority instances more important than misclassification of majority instances


### Metrics
- False positive should be minimized
- Macro average in classification_report

In [257]:
pipeline.predict(["gay"])[0]

0

In [211]:
print(classification_report(y_true, y_pred))

              precision    recall  f1-score   support

           0       0.00      0.00      0.00      1196
           1       0.94      1.00      0.97     18630

    accuracy                           0.94     19826
   macro avg       0.47      0.50      0.48     19826
weighted avg       0.88      0.94      0.91     19826



  _warn_prf(average, modifier, msg_start, len(result))


## ROC Curve

In [214]:
# Compute ROC curve and ROC area for each class
fpr, tpr, _ = roc_curve(y_true, probas, pos_label='spam')
roc_auc = auc(fpr, tpr)

plt.figure()
lw = 2
plt.plot(fpr, tpr, color='darkorange',
         lw=lw, label='ROC curve (area = %0.2f)' % roc_auc)
plt.plot([0, 1], [0, 1], color='navy', lw=lw, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve')
plt.legend(loc="lower right")
plt.show()

  y_true = (y_true == pos_label)


TypeError: 'bool' object is not subscriptable