In [1]:
import pandas as pd
import numpy as np



In [2]:
medium = pd.read_csv('../data/processed/medium.csv')

In [3]:
# Sort by claps
medium.sort_values(by='claps', ascending=False, ignore_index=True, inplace=True)

In [4]:
medium.head()

Unnamed: 0,title,subtitle,image,author,publication,year,month,day,tag,reading_time,claps,comment,url,author_url,fake_title,fake_author,date
0,Design better data tables,The ingredients of a successful data table UI,1,Andrew Coyle,UX Collective,2017,5,7,data-science,4,53000,0,https://uxdesign.cc/design-better-data-tables-...,https://uxdesign.cc/@CoyleAndrew,0,0,2017-05-07
1,Artificial IntelligenceThe Revolution Hasnt Ha...,,1,Michael Jordan,,2018,4,19,data-science,16,50000,0,https://medium.com/@mijordan3/artificial-intel...,https://medium.com/@mijordan3,0,0,2018-04-19
2,Why so many data scientists are leaving their ...,Frustrations of the data scientist!,1,Jonny Brooks-Bartlett,Towards Data Science,2018,3,28,data-science,8,47000,0,https://towardsdatascience.com/why-so-many-dat...,https://towardsdatascience.com/@jonnybrooks04,0,0,2018-03-28
3,What exactly can you do with Python? Here are ...,,1,YK Sugi,Towards Data Science,2018,6,15,data-science,10,42000,0,https://towardsdatascience.com/what-can-you-do...,https://towardsdatascience.com/@ykdojo,0,0,2018-06-15
4,How to build your own Neural Network from scra...,A beginners guide to understanding the,1,James Loy,Towards Data Science,2018,5,14,data-science,7,41000,0,https://towardsdatascience.com/how-to-build-yo...,https://towardsdatascience.com/@jamesloyys,0,0,2018-05-14


### Classical Methods for Classification

In [91]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline

Let's put claps in bins based on logarithmic ranges. So 0 - 100 would have label `low`, 100 - 1000 would have label `medium` etc.

In [48]:
bins = [0, 100, 1000, 10000, 100000]
labels = ['low', 'medium', 'high', 'xhigh']
medium['claps_category'] = pd.cut(medium['claps'], bins, labels=labels, include_lowest=True)

In [61]:
medium.columns

Index(['title', 'subtitle', 'image', 'author', 'publication', 'year', 'month',
       'day', 'tag', 'reading_time', 'claps', 'comment', 'url', 'author_url',
       'fake_title', 'fake_author', 'date', 'claps_category'],
      dtype='object')

In [63]:
data = medium.loc[:, ['title', 'claps_category']]

In [65]:
data.head()

Unnamed: 0,title,claps_category
0,Design better data tables,xhigh
1,Artificial IntelligenceThe Revolution Hasnt Happened Yet,xhigh
2,Why so many data scientists are leaving their jobs,xhigh
3,What exactly can you do with Python? Here are Pythons 3 main applications.,xhigh
4,How to build your own Neural Network from scratch in Python,xhigh


In [87]:
train, test = train_test_split(data)

In [88]:
train.shape

(55939, 2)

In [89]:
test.shape

(18647, 2)

In [92]:
text_clf = Pipeline([
    ('vect', CountVectorizer()), 
    ('tfidf', TfidfTransformer()), 
    ('clf', RandomForestClassifier())
])

In [93]:
text_clf.fit(train.title, train.claps_category)

Pipeline(memory=None,
         steps=[('vect',
                 CountVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.int64'>, encoding='utf-8',
                                 input='content', lowercase=True, max_df=1.0,
                                 max_features=None, min_df=1,
                                 ngram_range=(1, 1), preprocessor=None,
                                 stop_words=None, strip_accents=None,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, vocabulary=Non...
                 RandomForestClassifier(bootstrap=True, ccp_alpha=0.0,
                                        class_weight=None, criterion='gini',
                                        max_depth=None, max_features='auto',
                                        max_leaf_nodes=None, max_samples=None,
                                

In [94]:
predicted = text_clf.predict(test.title)
test['predicted'] = predicted

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [95]:
freeze_header(test, num_rows=10, step_rows=10)

interactive(children=(IntSlider(value=10, description='rows', max=18647, min=10, readout=False, step=10), IntS…

In [96]:
np.mean(predicted == test.claps_category)

0.7646270177508446

In [82]:
medium['claps_category'].value_counts()

low       57364
medium    15489
high       1677
xhigh        56
Name: claps_category, dtype: int64

In [99]:
test.loc[test['predicted'] == 'medium']

Unnamed: 0,title,claps_category,predicted
50327,Feature Selection,low,medium
1419,Variational AutoEncoders for new fruits with Keras and Pytorch.,high,medium
63926,Introduction to Naive Bayes Classifier.,low,medium
27643,A Minimalist End-to-End Scrapy Tutorial (Part IV),low,medium
174,Data Science Interview Guide,high,medium
...,...,...,...
3170,Optimizing Hyperparameters in Random Forest Classification,medium,medium
9080,How PyTorch lets you build and experiment with a neural net,medium,medium
14480,Cmo ser Data Scientist y NO Morir en el IntentoParte 2,medium,medium
33774,Analyzing dynamic strategies,low,medium


In [105]:
text_clf.predict(np.array(['Why you are not a good data scientist']))

array(['low'], dtype=object)

1. Improve classification by breaking down data based on months since publication. This way you can predict how many claps an article would get within certain time frame
1. Maybe do regression instead of classification to avoid dealing with rare classes (i.e high and xhigh)
1. Generate high clap title using gpt-2

### Classification

Let's put claps in bins based on logarithmic ranges. So 0 - 100 would have label `low`, 100 - 1000 would have label `medium` etc.

In [48]:
bins = [0, 100, 1000, 10000, 100000]
labels = ['low', 'medium', 'high', 'xhigh']
medium['claps_category'] = pd.cut(medium['claps'], bins, labels=labels, include_lowest=True)

In [61]:
medium.columns

Index(['title', 'subtitle', 'image', 'author', 'publication', 'year', 'month',
       'day', 'tag', 'reading_time', 'claps', 'comment', 'url', 'author_url',
       'fake_title', 'fake_author', 'date', 'claps_category'],
      dtype='object')

In [63]:
data = medium.loc[:, ['title', 'claps_category']]

In [65]:
data.head()

Unnamed: 0,title,claps_category
0,Design better data tables,xhigh
1,Artificial IntelligenceThe Revolution Hasnt Happened Yet,xhigh
2,Why so many data scientists are leaving their jobs,xhigh
3,What exactly can you do with Python? Here are Pythons 3 main applications.,xhigh
4,How to build your own Neural Network from scratch in Python,xhigh


In [91]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline

In [87]:
train, test = train_test_split(data)

In [88]:
train.shape

(55939, 2)

In [89]:
test.shape

(18647, 2)

In [92]:
text_clf = Pipeline([
    ('vect', CountVectorizer()), 
    ('tfidf', TfidfTransformer()), 
    ('clf', RandomForestClassifier())
])

In [93]:
text_clf.fit(train.title, train.claps_category)

Pipeline(memory=None,
         steps=[('vect',
                 CountVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.int64'>, encoding='utf-8',
                                 input='content', lowercase=True, max_df=1.0,
                                 max_features=None, min_df=1,
                                 ngram_range=(1, 1), preprocessor=None,
                                 stop_words=None, strip_accents=None,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, vocabulary=Non...
                 RandomForestClassifier(bootstrap=True, ccp_alpha=0.0,
                                        class_weight=None, criterion='gini',
                                        max_depth=None, max_features='auto',
                                        max_leaf_nodes=None, max_samples=None,
                                

In [94]:
predicted = text_clf.predict(test.title)
test['predicted'] = predicted

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [95]:
freeze_header(test, num_rows=10, step_rows=10)

interactive(children=(IntSlider(value=10, description='rows', max=18647, min=10, readout=False, step=10), IntS…

In [96]:
np.mean(predicted == test.claps_category)

0.7646270177508446

In [82]:
medium['claps_category'].value_counts()

low       57364
medium    15489
high       1677
xhigh        56
Name: claps_category, dtype: int64

In [99]:
test.loc[test['predicted'] == 'medium']

Unnamed: 0,title,claps_category,predicted
50327,Feature Selection,low,medium
1419,Variational AutoEncoders for new fruits with Keras and Pytorch.,high,medium
63926,Introduction to Naive Bayes Classifier.,low,medium
27643,A Minimalist End-to-End Scrapy Tutorial (Part IV),low,medium
174,Data Science Interview Guide,high,medium
...,...,...,...
3170,Optimizing Hyperparameters in Random Forest Classification,medium,medium
9080,How PyTorch lets you build and experiment with a neural net,medium,medium
14480,Cmo ser Data Scientist y NO Morir en el IntentoParte 2,medium,medium
33774,Analyzing dynamic strategies,low,medium


In [105]:
text_clf.predict(np.array(['Why you are not a good data scientist']))

array(['low'], dtype=object)

### Sentence Transfomer

In [9]:
from sentence_transformers import SentenceTransformer

In [10]:
model = SentenceTransformer('bert-base-nli-mean-tokens')

100%|██████████| 405M/405M [01:17<00:00, 5.20MB/s]   


In [11]:
# Run this cell on Google Colab
title_embeddings = model.encode(medium['title'].values.tolist())

KeyboardInterrupt: 

1. Improve classification by breaking down data based on months since publication. This way you can predict how many claps an article would get within certain time frame
1. Maybe do regression instead of classification to avoid dealing with rare classes (i.e high and xhigh)
1. Generate high clap title using gpt-2