In [None]:
import re
import nltk
import numpy as np
import pandas as pd
import seaborn as sns
from itertools import chain 
import matplotlib.pyplot as plt
from nltk.collocations import *
from nltk.corpus import stopwords
from sklearn.pipeline import Pipeline
from nltk.stem import WordNetLemmatizer
import nltk.corpus.reader.wordnet as wordnet
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import classification_report, accuracy_score

In [None]:
# Read the Medium post-titles CSV and preview its first rows.
csv_path = '../input/medium-post-titles/medium_post_titles.csv'
data = pd.read_csv(csv_path)
data.head()

In [None]:
# Show the dataset dimensions as (rows, columns).
data.shape

In [None]:
# Replace every missing value with an empty string so the text columns can be
# concatenated without NaN handling.
data = data.fillna('')

In [None]:
def combine(columns):
    """Join a row's title and subtitle into one space-separated string.

    Parameters
    ----------
    columns : sequence or pandas.Series
        Holds the title first and the subtitle second.

    Returns
    -------
    str
        ``title + ' ' + subtitle``.
    """
    # Unpack by iteration order rather than with integer keys: on a Series
    # indexed by column labels, columns[0] relies on a deprecated positional
    # fallback in pandas.
    title, subtitle = list(columns)[:2]
    return title + ' ' + subtitle

# Build the 'text' column: one combined title + subtitle string per row.
data['text'] = data[['title', 'subtitle']].apply(combine, axis='columns')

In [None]:
# Drop the raw title/subtitle columns (now merged into 'text') and the
# subtitle truncation flag.
data = data.drop(columns=['title', 'subtitle', 'subtitle_truncated_flag'])

In [None]:
def horizontal_bar(plot_data, size, title):
    """Draw a horizontal bar chart from a mapping of label -> value.

    plot_data: dict whose keys become bar labels and whose values become bar lengths.
    size: width and height of the (square) figure.
    title: text shown above the chart.
    """
    labels = list(plot_data.keys())
    values = list(plot_data.values())

    plt.figure(figsize=(size, size))
    plt.barh(labels, values)
    plt.title(title)
    
def tokenizer(sentences):
    """Split each sentence into lowercase alphabetic tokens.

    sentences: iterable of strings.
    Returns a list with one list of tokens per input sentence.
    """
    # Lowercasing first lets 'Car' and 'car' share a token; the pattern keeps
    # only runs of a-z, so digits and punctuation act as separators.
    return [re.findall('[a-z]+', sentence.lower()) for sentence in sentences]

def most_frequent_words(words_lists, size=5):
    """Return the `size` most common words with their counts.

    words_lists: iterable of token lists.
    size: how many of the most common words to keep.
    Returns a dict mapping word -> count, in the order given by most_common().
    """
    flat_words = list(chain.from_iterable(words_lists))  # flatten to a 1-d list
    counts = nltk.FreqDist(flat_words)
    return dict(counts.most_common(size))

def remove_stopwords(words_lists):
    """Filter English stop words out of every token list.

    words_lists: iterable of token lists.
    Returns a new list of lists; the inputs are not modified.
    """
    # A set gives constant-time membership tests; the corpus reader returns a
    # list, which would be scanned linearly for every single token.
    stop_words = set(stopwords.words('english'))
    return [
        [word for word in words if word not in stop_words]
        for words in words_lists
    ]

def get_longest_words_frequency(words_lists, length=5, freq=250):
    """Return the long, frequently used words found in `words_lists`.

    words_lists: iterable of token lists.
    length: minimum number of characters a word must have.
    freq: minimum number of occurrences a word must have.
    Returns a dict mapping each qualifying word to its count, ordered by first
    appearance.
    """
    # Count from the argument itself rather than from a notebook-level variable,
    # so the result depends only on what the caller passes in.
    frequency = nltk.FreqDist(chain.from_iterable(words_lists))
    return {
        word: count
        for word, count in frequency.items()
        if len(word) >= length and count >= freq
    }

def detokenize(words_lists):
    """Turn each token list back into a single space-separated sentence.

    words_lists: iterable of lists of strings.
    Returns a list of strings, one per token list.
    """
    return [' '.join(words) for words in words_lists]

def replace_collocations(sentences, scores, limit=10):
    """Fuse collocations into single tokens, e.g. 'machine learning' -> 'machinelearning'.

    sentences: list of strings; updated in place and also returned.
    scores: list of ((word1, word2), score) entries; only the leading `limit`
        entries are used.
    limit: maximum number of collocations to apply. Values larger than
        len(scores) are clamped.
    """
    # Build each pattern once, not once per sentence. The slice clamps `limit`
    # to the number of scored collocations. Word boundaries keep 'art work'
    # from matching inside 'smart worker'.
    replacements = []
    for entry in scores[:max(limit, 0)]:
        first, second = entry[0][0], entry[0][1]
        pattern = re.compile(r'\b' + re.escape(first) + ' ' + re.escape(second) + r'\b')
        replacements.append((pattern, first + second))

    for i, sentence in enumerate(sentences):
        for pattern, merged in replacements:
            # A function replacement inserts `merged` literally.
            sentence = pattern.sub(lambda _match: merged, sentence)
        sentences[i] = sentence
    return sentences

def lemmatize(words_lists):
    """Lemmatize every token in place, using its part-of-speech tag when known.

    words_lists: list of mutable token lists; each list is updated in place.
    Returns the same `words_lists` object.
    """
    lemmatizer = WordNetLemmatizer()

    # First two letters of the tag returned by nltk.pos_tag -> WordNet POS constant.
    pos = {
        'NN' : wordnet.NOUN,
        'VB' : wordnet.VERB,
        'JJ' : wordnet.ADJ,
        'RB' : wordnet.ADV
    }

    for words in words_lists:
        # Use the tags positionally. Looking tags up by word would give every
        # repeat of a word the tag of its last occurrence in the sentence.
        for j, (word, tag) in enumerate(nltk.pos_tag(words)):
            wordnet_pos = pos.get(tag[:2])
            if wordnet_pos is not None:
                words[j] = lemmatizer.lemmatize(word, wordnet_pos)
            else:
                # No mapped POS: fall back to the lemmatizer's default.
                words[j] = lemmatizer.lemmatize(word)

    return words_lists

In [None]:
class ContitionalFrequencyHelper:
    """Count how often chosen words occur in sentences of each category.

    words: the words to look for.
    categories: category label of each tokenised sentence, aligned by index.
    words_lists: the tokenised sentences.
    """

    def __init__(self, words, categories, words_lists):
        self.words, self.categories, self.tokens = words, categories, words_lists

    def get_processed_list(self):
        """Return one (category, word) pair per sentence that contains a tracked word."""
        return [
            (self.categories[position], word)
            for position, sentence_tokens in enumerate(self.tokens)
            for word in self.words
            if word in sentence_tokens
        ]

    def conditional_frequency(self):
        """Return an nltk.ConditionalFreqDist built from the (category, word) pairs."""
        return nltk.ConditionalFreqDist(self.get_processed_list())

In [None]:
# Count the rows in each category and plot the distribution.
counts = dict(data['category'].value_counts())
horizontal_bar(plot_data=counts, size=20, title='Category Counts')

93 categories.

In [None]:
# Pull the combined text out of the frame and tokenise every sentence.
sentences = data['text'].tolist()
tokens = tokenizer(sentences)

In [None]:
# Plot the ten most common tokens (stop words have not been removed yet).
freq_words = most_frequent_words(tokens, size=10)
horizontal_bar(plot_data=freq_words, size=5, title='Most Frequent Words')

These most frequent words contribute nothing towards classifying a sentence, as they do not have a domain-specific meaning and are used in almost all sentences. These words are called stop words and have to be removed.

In [None]:
# Remove English stop words from every tokenised sentence.
new_tokens = remove_stopwords(tokens)

In [None]:
# Words with at least 10 letters that occur at least 500 times.
high_frequency = get_longest_words_frequency(new_tokens, length=10, freq=500)

# Words to analyse per category: 'blockchain' and 'cryptocurrency' first,
# then the pair 'artificial' / 'intelligence'.
words = ['blockchain', 'cryptocurrency', 'artificial', 'intelligence']
categories = data['category'].tolist()

cfd_helper = ContitionalFrequencyHelper(words=words, categories=categories, words_lists=new_tokens)
cfd = cfd_helper.conditional_frequency()

Conditional Frequency Distribution: https://www.kaggle.com/thecobbler/conditional-frequency-distribution-basics by Arun

In [None]:
# Print the conditional frequency table: one row per category, one column per word.
cfd.tabulate()

From the table above we can say that:
1. Long words usually carry some meaning and help in classification.
2. The word "blockchain" is used frequently in sentences of the Blockchain category, which makes sense. However, "blockchain" is also used more often than "cryptocurrency" in sentences of the Cryptocurrency category. This is because blockchain is the technology behind cryptocurrency, so the word is used there as well.
3. Some words only make sense when they appear together, like "artificial" and "intelligence" in this case. Such word pairs are called collocations.

In [None]:
bigram_measures = nltk.collocations.BigramAssocMeasures()

# Flatten every sentence's tokens into one stream. Doing this with a loop
# variable named `tokens` would overwrite the notebook-level `tokens` list and
# break earlier cells when they are re-run.
merge = list(chain.from_iterable(new_tokens))

collocation_finder = nltk.collocations.BigramCollocationFinder.from_words(merge)

# Score each adjacent word pair by raw frequency; entries are ((word1, word2), score).
collocation_scored = collocation_finder.score_ngrams(bigram_measures.raw_freq)

In [None]:
# Rebuild sentence strings, then fuse the leading collocations into single
# tokens by removing the space between the two words.
sentences = detokenize(new_tokens)
limit = 30  # number of leading entries of collocation_scored to merge
new_sentences = replace_collocations(sentences, scores=collocation_scored, limit=limit)

In [None]:
# Tokenise again so that each fused collocation becomes a single token.
final_tokens = tokenizer(new_sentences)

In [None]:
#final data to train.
final_sentences = detokenize(final_tokens)

# Build the training frame directly. Rebinding `data` to a plain dict here
# would replace the source DataFrame and break earlier cells on re-run.
final_data = pd.DataFrame({
    'Category' : categories,
    'Text' : final_sentences
})
final_data.head()

In [None]:
text = list(final_data['Text'])
labels = list(final_data['Category'])

# Stratify so both splits keep the category proportions, and fix the seed so
# the split (and the accuracies computed from it) is the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    text, labels, stratify=labels, test_size=0.20, random_state=42
)

In [None]:
# Bag-of-words counts fed into a multinomial Naive Bayes classifier.
steps = [('vectorize', CountVectorizer()), ('NB', MultinomialNB())]
pipeline = Pipeline(steps)
pipeline.fit(X_train, y_train)
y_pred = pipeline.predict(X_test)

# scikit-learn metrics take (y_true, y_pred), in that order.
print('accuracy: ', accuracy_score(y_test, y_pred))

In [None]:
# Ground truth goes first: passing the predictions as y_true would swap the
# precision and recall columns of the report.
print(classification_report(y_test, y_pred))

In [None]:
# Bag-of-words counts -> TF-IDF weighting -> linear classifier trained with
# SGD on the hinge loss.
model = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', SGDClassifier(loss='hinge', penalty='l2', alpha=1e-3, random_state=42, max_iter=50, tol=0.001))
])


model.fit(X_train, y_train)
y_pred = model.predict(X_test)

# scikit-learn metrics take (y_true, y_pred), in that order.
print('accuracy: ', accuracy_score(y_test, y_pred))

In [None]:
# Ground truth goes first: passing the predictions as y_true would swap the
# precision and recall columns of the report.
print(classification_report(y_test, y_pred))

The SGD classifier (46%) performed better than Naive Bayes (39%).

## Consider upvoting the notebook if you have learnt something from this. Thank You.