# TF-IDF

In [32]:
import math
import nltk

from nltk import sent_tokenize, word_tokenize, PorterStemmer, pos_tag
from nltk.corpus import stopwords

nltk.download('punkt');
nltk.download('stopwords');
nltk.download('averaged_perceptron_tagger');

[nltk_data] Downloading package punkt to /Users/qa/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /Users/qa/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/qa/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


### 1. Tokenize the sentences

In [2]:
f = open("original_text.txt", "r")
text = f.read()
sentences = sent_tokenize(text)
total_documents = len(sentences)

### 2. Create the Frequency matrix of the words in each sentence

In [3]:
def _create_frequency_matrix(sentences):
    frequency_matrix = {}
    stopWords = set(stopwords.words("english"))
    ps = PorterStemmer()

    for sent in sentences:
        freq_table = {}
        words = word_tokenize(sent)
        for word in words:
            word = word.lower()
            word = ps.stem(word)
            if word in stopWords:
                continue

            if word in freq_table:
                freq_table[word] += 1
            else:
                freq_table[word] = 1

        frequency_matrix[sent[:15]] = freq_table

    return frequency_matrix

freq_matrix = _create_frequency_matrix(sentences);
# print(freq_matrix);

### 3. Calculate TermFrequency and generate a matrix

In [4]:
def _create_tf_matrix(freq_matrix):
    tf_matrix = {}

    for sent, f_table in freq_matrix.items():
        tf_table = {}

        count_words_in_sentence = len(f_table)
        for word, count in f_table.items():
            tf_table[word] = count / count_words_in_sentence

        tf_matrix[sent] = tf_table

    return tf_matrix

tf_matrix = _create_tf_matrix(freq_matrix)

### 4. Creating table for documents per words

In [5]:
def _create_documents_per_words(freq_matrix):
    word_per_doc_table = {}

    for sent, f_table in freq_matrix.items():
        for word, count in f_table.items():
            if word in word_per_doc_table:
                word_per_doc_table[word] += 1
            else:
                word_per_doc_table[word] = 1

    return word_per_doc_table

count_doc_per_words = _create_documents_per_words(freq_matrix)

### 5. Calculate IDF and generate a matrix

In [6]:
def _create_idf_matrix(freq_matrix, count_doc_per_words, total_documents):
    idf_matrix = {}

    for sent, f_table in freq_matrix.items():
        idf_table = {}

        for word in f_table.keys():
            idf_table[word] = math.log10(total_documents / float(count_doc_per_words[word]))

        idf_matrix[sent] = idf_table

    return idf_matrix

idf_matrix = _create_idf_matrix(freq_matrix, count_doc_per_words, total_documents)

### 6. Calculate TF-IDF and generate a matrix

In [7]:
def _create_tf_idf_matrix(tf_matrix, idf_matrix):
    tf_idf_matrix = {}

    for (sent1, f_table1), (sent2, f_table2) in zip(tf_matrix.items(), idf_matrix.items()):

        tf_idf_table = {}

        for (word1, value1), (word2, value2) in zip(f_table1.items(),
                                                    f_table2.items()):  # here, keys are the same in both the table
            tf_idf_table[word1] = float(value1 * value2)

        tf_idf_matrix[sent1] = tf_idf_table

    return tf_idf_matrix

tf_idf_matrix = _create_tf_idf_matrix(tf_matrix, idf_matrix)

### 7. Important Algorithm: score the sentences

In [8]:
def _score_sentences(tf_idf_matrix) -> dict:
    """
    score a sentence by its word's TF
    Basic algorithm: adding the TF frequency of every non-stop word in a sentence divided by total no of words in a sentence.
    :rtype: dict
    """

    sentenceValue = {}

    for sent, f_table in tf_idf_matrix.items():
        total_score_per_sentence = 0

        count_words_in_sentence = len(f_table)
        for word, score in f_table.items():
            total_score_per_sentence += score

        sentenceValue[sent] = total_score_per_sentence / count_words_in_sentence

    return sentenceValue

sentence_scores = _score_sentences(tf_idf_matrix)

### 8. Find the threshold

In [9]:
def _find_average_score(sentenceValue) -> int:
    """
    Find the average score from the sentence value dictionary
    :rtype: int
    """
    sumValues = 0
    for entry in sentenceValue:
        sumValues += sentenceValue[entry]

    # Average value of a sentence from original summary_text
    average = (sumValues / len(sentenceValue))

    return average

threshold = _find_average_score(sentence_scores)

### 9. Important Algorithm: Generate the summary

In [10]:
def _generate_summary(sentences, sentenceValue, threshold):
    sentence_count = 0
    summary = ''

    for sentence in sentences:
        if sentence[:15] in sentenceValue and sentenceValue[sentence[:15]] >= (threshold):
            summary += " " + sentence
            sentence_count += 1

    return summary

summary = _generate_summary(sentences, sentence_scores, 1.3 * threshold)
print(summary);

 Have you experienced this before? Who is right and who is wrong? Neither. It was at that point their biggest breakthrough came. Perhaps all those years of perseverance finally paid off. It must come from within you. Where are you settling in your life right now? Could you be you playing for bigger stakes than you are? So become intentional on what you want out of life. Commit to it. Nurture your dreams.


# Tweet Analysis

In [95]:
%matplotlib inline
import matplotlib.pyplot as plt

import pandas as pd
import numpy as np
import re

from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold, cross_val_score, learning_curve
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC, LinearSVC
from sklearn.metrics import classification_report, f1_score, accuracy_score, confusion_matrix
from sklearn.pipeline import Pipeline
from sklearn.tree import DecisionTreeClassifier

from joblib import dump, load

In [134]:
# df = pd.read_csv('df_20.csv')
df = pd.read_csv('df_9.csv')
print(df.head());

   Unnamed: 0  count  hate_speech  offensive_language  neither  class  \
0           0      3            0                   0        3      2   
1           1      3            0                   3        0      1   
2           2      3            0                   3        0      1   
3           3      3            0                   2        1      1   
4           4      6            0                   6        0      1   

                                               tweet  
0  !!! RT @mayasolovely: As a woman you shouldn't...  
1  !!!!! RT @mleew17: boy dats cold...tyga dwn ba...  
2  !!!!!!! RT @UrKindOfBrand Dawg!!!! RT @80sbaby...  
3  !!!!!!!!! RT @C_G_Anderson: @viva_based she lo...  
4  !!!!!!!!!!!!! RT @ShenikaRoberts: The shit you...  


In [137]:
all_stopwords = stopwords.words('english')
excludes = ['@user', '@', '!', 'RT']
all_stopwords.extend(excludes)

def preprocess(tweet: str):
    space_pattern = '\s+'
    url_regex = ('http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|'
        '[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+')
#     mention_regex = '@[\w\-]+'
    mention_regex = '@[^\s]+'
    
    parsed_tweet = tweet.lower()
    parsed_tweet = re.sub(space_pattern, ' ', parsed_tweet)
    parsed_tweet = re.sub(url_regex, 'URLHERE', parsed_tweet)
    parsed_tweet = re.sub(mention_regex, 'MENTIONHERE', parsed_tweet)

#     words = word_tokenize(parsed_tweet)
    
#     filtered_words = [word for word in words if not word in all_stopwords and word.isalnum()]
#     porter = PorterStemmer()
#     stemmed = [porter.stem(word) for word in filtered_words if word not in ['URLHERE', 'MENTIONHERE']]
    
#     pos = pos_tag(filtered_words)
    
    return parsed_tweet

def stem_words(tweet: str):
    words = word_tokenize(tweet)
    filtered_words = [word for word in words if not word in all_stopwords and word.isalnum()]
    porter = PorterStemmer()
    return [porter.stem(word) for word in filtered_words if word not in ['URLHERE', 'MENTIONHERE']]

In [138]:
df.tweet.head().apply(preprocess)

0    !!! rt MENTIONHERE as a woman you shouldn't co...
1    !!!!! rt MENTIONHERE boy dats cold...tyga dwn ...
2    !!!!!!! rt MENTIONHERE dawg!!!! rt MENTIONHERE...
3    !!!!!!!!! rt MENTIONHERE MENTIONHERE she look ...
4    !!!!!!!!!!!!! rt MENTIONHERE the shit you hear...
Name: tweet, dtype: object

In [139]:
X_train, X_test, y_train, y_test = train_test_split(df['tweet'], df['class'], test_size=0.2, random_state=42)

In [140]:
# Define a count vectorizer
count_vectorizer = CountVectorizer(preprocessor=preprocess, analyzer=stem_words)
counts = count_vectorizer.fit_transform(X_train)

# Define a tfidf vectorizer
tfidf_vectorizer = TfidfVectorizer(preprocessor=preprocess, analyzer=stem_words, ngram_range=(1,3))
tfidfs = tfidf_vectorizer.fit_transform(X_train)

In [141]:
count_vectorizer.vocabulary_

{'funsizedyogi': 7860,
 'theblackvoic': 19031,
 'well': 20978,
 'els': 6703,
 'white': 21121,
 'ppl': 15280,
 'get': 8087,
 'us': 20347,
 'forget': 7561,
 'horrif': 9305,
 'past': 14694,
 'paint': 14593,
 'pretti': 15343,
 'pictur': 14971,
 'ho': 9148,
 '8230': 866,
 'funni': 7848,
 'thing': 19274,
 'peopl': 14828,
 'It': 1085,
 'see': 16981,
 'pic': 14958,
 'judg': 10693,
 'bird': 3013,
 'just': 10746,
 'wrong': 21438,
 'winksosa': 21266,
 'nigga': 13834,
 'mess': 12793,
 'bitch': 3035,
 '128557': 363,
 '128514': 321,
 'http': 9368,
 'jbrendaro30': 10316,
 'zgabrail': 21905,
 'ramsin1995': 15767,
 'gabeeli8': 7914,
 'jacob2tim': 10162,
 'ass': 2156,
 'nigggaaa': 13851,
 'real': 15877,
 'speak': 17946,
 'time': 19443,
 'tryna': 19913,
 'make': 12351,
 'look': 12021,
 'bad': 2421,
 'like': 11777,
 'eat': 6541,
 'pussi': 15589,
 'nah': 13532,
 'mimi': 12975,
 'domworldpeac': 6156,
 'basebal': 2589,
 'season': 16958,
 'win': 21250,
 'yanke': 21604,
 'thi': 19262,
 'love': 12086,
 'start':

In [133]:
df[df['tweet'].str.contains('@funsizedyogi')]

Unnamed: 0.1,Unnamed: 0,count,hate_speech,offensive_language,neither,class,tweet


In [149]:
pipeline = Pipeline([
    ('vectorizer', CountVectorizer(preprocessor=preprocess, analyzer=stem_words)),
    ('classifier', MultinomialNB())
])

pipeline.fit(X_train, y_train);
dump(pipeline, 'model.joblib')

['model.joblib']

In [116]:
res = pipeline.predict(X_test)

In [148]:
pipeline.predict(["hi"])

array([1])