In [290]:
import pandas as pd
import ast
from collections import Counter 
from collections import deque
import re
import nltk
from nltk.corpus import stopwords

In [483]:
medium_articles = pd.read_csv('medium_articles.csv')
medium_articles.head()

def clean(lst):
    """Fixes error in my tag_list code (for more info read the README.md file)"""
    lst = ast.literal_eval(lst) # convert string list into actual list
    if '★' in lst:
        return False
    else:
        return True

medium_articles['temp'] = medium_articles['tag_list'].apply(clean)
medium_articles = medium_articles.loc[medium_articles.temp].reset_index(inplace=False)
medium_articles = medium_articles.drop(['temp'], axis = 1)
copy = medium_articles.copy()
medium_articles

Unnamed: 0,index,title,article_url,claps,reading_time,date,tag_list
0,0,Top 10 Technology Trends for 2020,https://towardsdatascience.com/top-10-technolo...,3000,10,2020-01-03,"['Technology', 'Trends', 'Artificial Intellige..."
1,1,Top 10 Skills for a Data Scientist,https://towardsdatascience.com/top-10-skills-f...,2200,9,2020-01-03,"['Data Science', 'Technology', 'Business', 'Ma..."
2,2,ML Ops: Machine Learning as an Engineering Dis...,https://towardsdatascience.com/ml-ops-machine-...,1300,10,2020-01-03,"['Data Science', 'Machine Learning', 'Data Eng..."
3,3,Organizing your Python Code,https://medium.com/@k3no/organizing-your-pytho...,1200,13,2020-01-03,"['Python', 'Programming', 'Data Science', 'Cod..."
4,4,How to be fancy with OOP in Python,https://towardsdatascience.com/how-to-be-fancy...,928,3,2020-01-03,"['Programming', 'Python', 'Data Science', 'Cod..."
...,...,...,...,...,...,...,...
11730,16044,3 สิ่งน่าสนใจจาก Big data จากโครงการ “ชิมช้อปใช้”,https://medium.com/achieve-space/3-%E0%B8%AA%E...,0,2,2020-08-19,['Data Science']
11731,16045,Top Master Data Science Institute in Delhi — C...,https://medium.com/@sunnynsa2019/top-master-da...,1,3,2020-08-19,"['Master Data Science', 'Data Science']"
11732,16046,"Data Science and Untapped Possibilities, How M...",https://medium.com/@MaxEd_Blog/data-science-an...,0,3,2020-08-19,"['Big Data', 'Data Science', 'Data Analytics',..."
11733,16047,Data Scientist เก่งคิด เก่งพูด นักวิทยาศาสตร์ข...,https://medium.com/achieve-space/data-scientis...,0,2,2020-08-19,"['Persuasive', 'Presentations', 'Data Science']"


In [484]:
tag_dict = {} # represents the amount of times a tag has shown up amongst all the articles

def tags(lst):
    """Gets the number of different times a tag was mentioned"""
    lst = ast.literal_eval(lst) # convert string list into actual list
    for tag in lst: 
        if tag in tag_dict.keys():
            tag_dict[tag] += 1
        else:
            tag_dict[tag] = 1
            
medium_articles['tag_list'].apply(tags)

tag_dict2

{'Technology': 453,
 'Trends': 23,
 'Artificial Intelligence': 1714,
 'Data Science': 11735,
 'Future': 21,
 'Business': 134,
 'Machine Learning': 4040,
 'Data': 1077,
 'Data Engineering': 172,
 'DevOps': 39,
 'Towards Data Science': 173,
 'Python': 1748,
 'Programming': 421,
 'Coding': 82,
 'Software Development': 117,
 'Statistics': 540,
 'Analytics': 571,
 'Experiment': 10,
 'Gojek': 2,
 'GIS': 12,
 'Geography': 8,
 'Data Analysis': 630,
 'Project Management': 20,
 'Construction': 2,
 'Project Controls': 1,
 'Megaprojects': 1,
 'Kaggle': 90,
 'Hackathons': 13,
 'Ensemble': 3,
 'Performance Marketing': 1,
 'Martech': 2,
 'Media Mix Modeling': 1,
 'NFL': 9,
 'Super Bowl': 4,
 'Predictions': 32,
 'NLP': 343,
 'Product Science': 1,
 'Autocorrelation': 2,
 'Partial Autocorrelation': 1,
 'Covariance Matrix': 1,
 'Time Series Analysis': 51,
 'Web Scraping': 105,
 'R Shiny': 4,
 'Rstudio': 37,
 'Data Enthusiast': 3,
 'K Means Clustering': 25,
 'Cluster Analysis': 5,
 'JavaScript': 38,
 'Nua

In [486]:
# sort the amount of time an article shows up in descending order
sort_orders = sorted(tag_dict.items(), key=lambda x: x[1], reverse=True)
sort_orders = sort_orders[1:] # remove the 'Data Science' tag since it shows up in every article
sort_orders

[('Machine Learning', 4040),
 ('Python', 1748),
 ('Artificial Intelligence', 1714),
 ('Data Visualization', 1184),
 ('Data', 1077),
 ('Deep Learning', 812),
 ('Covid 19', 637),
 ('Data Analysis', 630),
 ('Analytics', 571),
 ('AI', 570),
 ('Statistics', 540),
 ('Big Data', 464),
 ('Technology', 453),
 ('Programming', 421),
 ('NLP', 343),
 ('Coronavirus', 320),
 ('Data Analytics', 301),
 ('Astrology', 241),
 ('Dailies', 228),
 ('Pandas', 224),
 ('Data Scientist', 206),
 ('Art', 201),
 ('Towards Data Science', 173),
 ('Data Engineering', 172),
 ('Neural Networks', 160),
 ('Data Science Training', 139),
 ('R', 138),
 ('Business', 134),
 ('Advertising', 134),
 ('Airbnb', 131),
 ('Dating', 130),
 ('Business Intelligence', 128),
 ('Computer Vision', 125),
 ('Mathematics', 119),
 ('Education', 118),
 ('Software Development', 117),
 ('Computer Science', 117),
 ('Python Programming', 115),
 ('Naturallanguageprocessing', 114),
 ('Visualization', 113),
 ('Algorithms', 113),
 ('Database', 109),
 ('

In [487]:
# get top 100 tags 
counter = Counter(sort_orders)
most_common = counter.most_common(100)
most_common = [x[0] for x in most_common]

most_common_dict = {}

for tag in most_common:
    most_common_dict[tag[0]] = tag[1]

most_common_dict       

{'Machine Learning': 4040,
 'Python': 1748,
 'Artificial Intelligence': 1714,
 'Data Visualization': 1184,
 'Data': 1077,
 'Deep Learning': 812,
 'Covid 19': 637,
 'Data Analysis': 630,
 'Analytics': 571,
 'AI': 570,
 'Statistics': 540,
 'Big Data': 464,
 'Technology': 453,
 'Programming': 421,
 'NLP': 343,
 'Coronavirus': 320,
 'Data Analytics': 301,
 'Astrology': 241,
 'Dailies': 228,
 'Pandas': 224,
 'Data Scientist': 206,
 'Art': 201,
 'Towards Data Science': 173,
 'Data Engineering': 172,
 'Neural Networks': 160,
 'Data Science Training': 139,
 'R': 138,
 'Business': 134,
 'Advertising': 134,
 'Airbnb': 131,
 'Dating': 130,
 'Business Intelligence': 128,
 'Computer Vision': 125,
 'Mathematics': 119,
 'Education': 118,
 'Software Development': 117,
 'Computer Science': 117,
 'Python Programming': 115,
 'Naturallanguageprocessing': 114,
 'Visualization': 113,
 'Algorithms': 113,
 'Database': 109,
 'Web Scraping': 105,
 'Jupyter Notebook': 103,
 'Linear Regression': 103,
 'Travel': 1

In [488]:
# check if top 100 tags in article

def top_100(lst):
    """Checks if the top 100 tags mentioned are mentioned in an article"""
    lst = ast.literal_eval(lst)
    for i in lst:
        if i in most_common_dict.keys():
            return True
    return False

medium_articles['is_top100'] = medium_articles['tag_list'].apply(top_100)  
medium_articles = medium_articles.loc[medium_articles.is_top100]
copy_info = medium_articles.copy()


# replaces True with the tags that are in the top 100

def specific_tags(lst):
    lst = ast.literal_eval(lst)
    tags = []
    for i in lst:
        if i in most_common_dict.keys():
            tags.append(i)
    return tags
        
medium_articles['is_top100'] = medium_articles['tag_list'].apply(specific_tags)
medium_articles.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  medium_articles['is_top100'] = medium_articles['tag_list'].apply(specific_tags)


Unnamed: 0,index,title,article_url,claps,reading_time,date,tag_list,is_top100
0,0,Top 10 Technology Trends for 2020,https://towardsdatascience.com/top-10-technolo...,3000,10,2020-01-03,"['Technology', 'Trends', 'Artificial Intellige...","[Technology, Artificial Intelligence]"
1,1,Top 10 Skills for a Data Scientist,https://towardsdatascience.com/top-10-skills-f...,2200,9,2020-01-03,"['Data Science', 'Technology', 'Business', 'Ma...","[Technology, Business, Machine Learning, Data]"
2,2,ML Ops: Machine Learning as an Engineering Dis...,https://towardsdatascience.com/ml-ops-machine-...,1300,10,2020-01-03,"['Data Science', 'Machine Learning', 'Data Eng...","[Machine Learning, Data Engineering, Towards D..."
3,3,Organizing your Python Code,https://medium.com/@k3no/organizing-your-pytho...,1200,13,2020-01-03,"['Python', 'Programming', 'Data Science', 'Cod...","[Python, Programming, Coding, Software Develop..."
4,4,How to be fancy with OOP in Python,https://towardsdatascience.com/how-to-be-fancy...,928,3,2020-01-03,"['Programming', 'Python', 'Data Science', 'Cod...","[Programming, Python, Coding]"


In [489]:
final_df = pd.DataFrame()
final_df['tag'] = list(most_common_dict.keys())
final_df['mentions'] = list(most_common_dict.values())

final_claps = {}

for i in most_common_dict.keys():
    final_claps[i] = 0

final_readtime = {}

for i in most_common_dict.keys():
    final_readtime[i] = 0
    
def claps(x):
    for i in x[7]:
        final_claps[i] += (x[3])

def read_time(x):
    for i in x[7]:
        final_readtime[i] += (x[4])        
    
medium_articles.apply(claps, axis=1)
medium_articles.apply(read_time, axis=1)

final_df['avg_claps'] = list(final_claps.values())
final_df['avg_readtime'] = list(final_readtime.values())
final_df['avg_claps'] = final_df['avg_claps']/final_df['mentions']
final_df['avg_readtime'] = final_df['avg_readtime']/final_df['mentions']

final_df.head()

Unnamed: 0,tag,mentions,avg_claps,avg_readtime
0,Machine Learning,4040,71.564604,5.478713
1,Python,1748,63.665332,5.215675
2,Artificial Intelligence,1714,103.471412,4.92182
3,Data Visualization,1184,37.119932,5.107264
4,Data,1077,43.841226,4.55896


In [297]:
final_df.to_csv('article_stats.csv', header=True, index = False)

In [490]:
# grabs the articles that contain top 100 tags
copy_info.head()

Unnamed: 0,index,title,article_url,claps,reading_time,date,tag_list,is_top100
0,0,Top 10 Technology Trends for 2020,https://towardsdatascience.com/top-10-technolo...,3000,10,2020-01-03,"['Technology', 'Trends', 'Artificial Intellige...",True
1,1,Top 10 Skills for a Data Scientist,https://towardsdatascience.com/top-10-skills-f...,2200,9,2020-01-03,"['Data Science', 'Technology', 'Business', 'Ma...",True
2,2,ML Ops: Machine Learning as an Engineering Dis...,https://towardsdatascience.com/ml-ops-machine-...,1300,10,2020-01-03,"['Data Science', 'Machine Learning', 'Data Eng...",True
3,3,Organizing your Python Code,https://medium.com/@k3no/organizing-your-pytho...,1200,13,2020-01-03,"['Python', 'Programming', 'Data Science', 'Cod...",True
4,4,How to be fancy with OOP in Python,https://towardsdatascience.com/how-to-be-fancy...,928,3,2020-01-03,"['Programming', 'Python', 'Data Science', 'Cod...",True


In [491]:
# get rid of english stop words
stop = stopwords.words('english')
copy_info['title'] = copy_info['title'].str.lower().str.split()
copy_info['title'] = copy_info['title'].apply(lambda x: [item for item in x if item not in stop])

# get rid of punctuation
def remove_punc(lst):
    return list(filter(lambda x:x, map(lambda x:re.sub(r'[^A-Za-z]', '', x), lst)))
copy_info['title'] = copy_info['title'].apply(remove_punc)

# combine the list of strings again
def combine(lst):
    return " ".join(lst)
copy_info['title'] = copy_info['title'].apply(combine)

words = set(nltk.corpus.words.words())
# remove non-english words
def remove_non_english(string):
    return " ".join(w for w in nltk.wordpunct_tokenize(string) if w.lower() in words or not w.isalpha())
copy_info['title'] = copy_info['title'].apply(remove_non_english)

# remove words that are less than or equal to 2 letters
def remove_two_letter(string):
    shortword = re.compile(r'\W*\b\w{1,3}\b')
    return shortword.sub('', string)
copy_info['title'] = copy_info['title'].apply(remove_two_letter)

def manual_removal(string):
    """Removed strings affiliated with spam articles"""
    shortword = re.compile(r'baba|amil|kala|bibi|black|magic')
    return shortword.sub('', string)
copy_info['title'] = copy_info['title'].apply(manual_removal)

copy_info['is_top100'] = medium_articles['is_top100']

copy_info.head()

Unnamed: 0,index,title,article_url,claps,reading_time,date,tag_list,is_top100
0,0,technology,https://towardsdatascience.com/top-10-technolo...,3000,10,2020-01-03,"['Technology', 'Trends', 'Artificial Intellige...","[Technology, Artificial Intelligence]"
1,1,data scientist,https://towardsdatascience.com/top-10-skills-f...,2200,9,2020-01-03,"['Data Science', 'Technology', 'Business', 'Ma...","[Technology, Business, Machine Learning, Data]"
2,2,machine learning engineering discipline,https://towardsdatascience.com/ml-ops-machine-...,1300,10,2020-01-03,"['Data Science', 'Machine Learning', 'Data Eng...","[Machine Learning, Data Engineering, Towards D..."
3,3,python code,https://medium.com/@k3no/organizing-your-pytho...,1200,13,2020-01-03,"['Python', 'Programming', 'Data Science', 'Cod...","[Python, Programming, Coding, Software Develop..."
4,4,fancy python,https://towardsdatascience.com/how-to-be-fancy...,928,3,2020-01-03,"['Programming', 'Python', 'Data Science', 'Cod...","[Programming, Python, Coding]"


In [492]:
# grabs top 100 siginificant words in the titles of articles that contain top 100 tags
top_100_titles = Counter(" ".join(copy_info["title"]).split()).most_common(100)
top_100_titles

[('data', 2966),
 ('science', 1198),
 ('learning', 1007),
 ('machine', 785),
 ('python', 723),
 ('strong', 706),
 ('covid', 566),
 ('analysis', 403),
 ('part', 392),
 ('analytics', 246),
 ('model', 206),
 ('scientist', 202),
 ('regression', 174),
 ('specialist', 160),
 ('deep', 156),
 ('introduction', 154),
 ('best', 147),
 ('guide', 137),
 ('world', 136),
 ('intelligence', 136),
 ('neural', 127),
 ('artificial', 124),
 ('classification', 119),
 ('linear', 118),
 ('business', 117),
 ('visualization', 114),
 ('project', 112),
 ('learn', 111),
 ('know', 103),
 ('real', 103),
 ('understanding', 100),
 ('expert', 96),
 ('prediction', 95),
 ('time', 90),
 ('need', 90),
 ('building', 88),
 ('detection', 83),
 ('language', 82),
 ('series', 81),
 ('first', 80),
 ('customer', 75),
 ('canada', 75),
 ('famous', 75),
 ('career', 74),
 ('feature', 73),
 ('image', 73),
 ('engineering', 72),
 ('clustering', 71),
 ('algorithm', 71),
 ('network', 70),
 ('statistics', 70),
 ('exploratory', 69),
 ('scrap

In [493]:
# convert the list above into a dataframe to enter into Tableau
top_100_titles_df_main = pd.DataFrame(top_100_titles, columns = ['word', 'occurrence'])
top_100_titles_df_main.head()

Unnamed: 0,word,occurrence
0,data,2966
1,science,1198
2,learning,1007
3,machine,785
4,python,723


In [494]:
count = 0

def top_tag(lst, tag):
    """Checks if the top 100 tags mentioned are mentioned in an article"""
    lst = ast.literal_eval(lst)
    for i in lst:
        if i == tag:
            return True
    return False

for tag in list(most_common_dict.keys()):
    medium_articles['is_top_tag'] = medium_articles.apply(lambda x: top_tag(x['tag_list'], tag), axis=1)  
    medium_articles_copy = medium_articles.loc[medium_articles.is_top_tag]
    
    #clean the titles
    medium_articles_copy['title'] = medium_articles_copy['title'].str.lower().str.split()
    medium_articles_copy['title'] = medium_articles_copy['title'].apply(lambda x: [item for item in x if item not in stop])
    
    medium_articles_copy['title'] = medium_articles_copy['title'].apply(remove_punc)

    medium_articles_copy['title'] = medium_articles_copy['title'].apply(combine)

    medium_articles_copy['title'] = medium_articles_copy['title'].apply(remove_non_english)

    medium_articles_copy['title'] = medium_articles_copy['title'].apply(remove_two_letter)

    medium_articles_copy['title'] = medium_articles_copy['title'].apply(manual_removal)
    
    top_100_titles = Counter(" ".join(medium_articles_copy["title"]).split()).most_common(20)
    
    if count == 0:
        top_100_titles_df = pd.DataFrame(top_100_titles, columns = ['word', 'occurrence'])
        top_100_titles_df['tag'] = tag
        top_100_titles_df['rate'] = top_100_titles_df['occurrence']/list(most_common_dict.values())[count]
        top_100_titles_df['rate'] = top_100_titles_df['rate'] * 100
    else:
        top_100_titles_df_2 = pd.DataFrame(top_100_titles, columns = ['word', 'occurrence'])
        top_100_titles_df_2['tag'] = tag
        top_100_titles_df_2['rate'] = top_100_titles_df_2['occurrence']/list(most_common_dict.values())[count]
        top_100_titles_df_2['rate'] = top_100_titles_df_2['rate'] * 100
        top_100_titles_df = pd.concat([top_100_titles_df, top_100_titles_df_2], ignore_index=True, sort=False)
    count+=1
        
top_100_titles_df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  medium_articles['is_top_tag'] = medium_articles.apply(lambda x: top_tag(x['tag_list'], tag), axis=1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  medium_articles_copy['title'] = medium_articles_copy['title'].str.lower().str.split()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  medium_articles_co

Unnamed: 0,word,occurrence,tag,rate
0,data,957,Machine Learning,23.688119
1,learning,861,Machine Learning,21.311881
2,machine,732,Machine Learning,18.118812
3,science,471,Machine Learning,11.658416
4,strong,250,Machine Learning,6.188119
...,...,...,...,...
1995,best,1,Career Advice,2.000000
1996,soft,1,Career Advice,2.000000
1997,skill,1,Career Advice,2.000000
1998,without,1,Career Advice,2.000000


In [497]:
len(top_100_titles_df.groupby('word', as_index=False).sum()[['word', 'occurrence']])

484

In [496]:
top_100_titles_df.to_csv('top_title_words_w_tag_4.csv')