In [1]:
# Preparation
from acquire_msc import get_all_readme_files_and_languages
from acquire_msc import get_alt
import unicodedata
import re
import json

import nltk
from nltk.tokenize.toktok import ToktokTokenizer
from nltk.corpus import stopwords

import pandas as pd

In [2]:
# GitHub repositories to process. Every entry is written as 'owner/repo'.
# NOTE(review): 'react-native-windows' was previously listed without an owner,
# unlike every other entry; 'microsoft' is assumed to be the intended owner -- confirm.
url_list = [
    'microsoft/Terminal',
    'jackfrued/Python-100-Days',
    'kkuchta/css-only-chat',
    'microsoft/PowerToys',
    'jolaleye/cssfx',
    'MisterBooo/LeetCodeAnimation',
    'flutter/flutter_web',
    'TheAlgorithms/Python',
    'hiroppy/fusuma',
    'CyC2018/CS-Notes',
    'jaywcjlove/linux-command',
    'flutter/flutter',
    '996icu/996.ICU',
    'STVIR/pysot',
    'minamarkham/formation',
    'azl397985856/leetcode',
    'qianguyihao/Web',
    'microsoft/react-native-windows',
    'sql-machine-learning/sqlflow',
    'sabakkps/backslide',
    'dgryski/go-perfbook',
    'Snailclimb/JavaGuide',
    'microsoft/vscode',
    'markphelps/flipt',
    'teoga/awesome-product-design',
]

In [3]:
def basic_clean(article):
    '''
    take in a string (article) and return a simplified, ASCII-only version of it:
        - lowercase everything
        - turn every whitespace character into a plain space
        - NFKD-normalize, then drop any character that cannot be encoded as ASCII
        - replace every character that is neither a word character
          (letter, digit, underscore) nor whitespace with a space
          (this includes single quotes, so "it's" becomes "it s")
        - collapse runs of whitespace into one space and trim both ends
    '''
    lowered = re.sub(r'\s', ' ', article.lower())

    # decompose accented characters, then discard whatever is not ASCII
    ascii_bytes = unicodedata.normalize('NFKD', lowered).encode('ascii', 'ignore')
    ascii_text = ascii_bytes.decode('utf-8')

    spaced_out = re.sub(r'[^\w\s]', ' ', ascii_text)

    # split/join squeezes repeated spaces and strips the ends
    return ' '.join(spaced_out.split())

In [4]:
def tokenize(article):
    '''run the string, article, through NLTK's ToktokTokenizer and
    return the tokenized text as a single string (return_str=True)'''
    return nltk.tokenize.ToktokTokenizer().tokenize(article, return_str=True)

In [5]:
def print_stop_words(article):
    '''accept some text, split it on whitespace, stem every word with a
    Porter stemmer, and print the value counts of the resulting stems.
    Nothing is returned.'''
    stemmer = nltk.porter.PorterStemmer()
    stemmed_words = [stemmer.stem(token) for token in article.split()]
    stem_counts = pd.Series(stemmed_words).value_counts()
    print(stem_counts)

In [6]:
def stem(article):
    '''accept a string and return it after applying stemming to all the words.

    article: text whose words are separated by whitespace.
    returns: the stemmed words joined back together with single spaces
             (an empty or whitespace-only string yields '').
    '''
    ps = nltk.stem.PorterStemmer()
    # Split into words first: iterating over the string itself would hand the
    # stemmer one character at a time, so no word would ever be stemmed.
    article_stemmed = ' '.join(ps.stem(word) for word in article.split())
    return article_stemmed

In [7]:
def lemmatize(article):
    '''accept a string and return it after applying lemmatization to each word.

    article: text whose words are separated by whitespace.
    returns: the lemmatized words joined back together with single spaces
             (an empty or whitespace-only string yields '').

    Each word is passed to WordNetLemmatizer.lemmatize on its own with no
    part-of-speech argument, so the lemmatizer's default applies.
    '''
    wnl = nltk.stem.WordNetLemmatizer()
    # Split into words first: iterating over the string itself would lemmatize
    # single characters and then glue them back together unchanged.
    lemmatized_words = [wnl.lemmatize(word) for word in article.split()]
    article_lemmatized = ' '.join(lemmatized_words)
    return article_lemmatized

In [8]:
def remove_stopwords(article, extra_words = None, exclude_words = None):
    '''remove all the stopwords from the string, article.

    article:       text whose words are separated by single spaces.
    extra_words:   optional list of additional words to treat as stopwords.
    exclude_words: optional list of words that must NOT be treated as
                   stopwords; exclusions are applied after extra_words.
    returns:       the remaining words joined with single spaces.
    '''
    # A set makes the per-word membership test constant-time instead of
    # scanning the whole stopword list for every word of the article.
    stopword_set = set(stopwords.words('english'))

    # add extra words
    if extra_words is not None:
        stopword_set.update(extra_words)
    # remove excluded words
    if exclude_words is not None:
        stopword_set.difference_update(exclude_words)

    # split(' ') is kept on purpose so the spacing of the input is preserved
    # exactly as before for callers that rely on it.
    without_stopwords = [word for word in article.split(' ') if word not in stopword_set]
    return ' '.join(without_stopwords)

In [9]:
def prep_repo_html(this_dict, extra_words = None, exclude_words = None):
    '''
    takes in a dictionary representing an article (it must have 'content' and
    'title' keys; 'category' is optional) and returns a new dictionary that
    looks like this:
            {
             'title': the title from this_dict, unchanged,
             'category': the category from this_dict, or 'repo_readme' if it has none,
             'original': the content string exactly as it came in,
             'clean': the content after the cleaning pipeline below
            }
    extra_words and exclude_words are handed straight to remove_stopwords.
    '''
    # strings are immutable, so keeping a reference is enough to preserve the original
    original = this_dict['content']

    # cleaning pipeline: basic cleanup -> tokenize -> lemmatize -> drop stopwords
    cleaned = basic_clean(original)
    cleaned = tokenize(cleaned)
    cleaned = lemmatize(cleaned)
    cleaned = remove_stopwords(cleaned, extra_words, exclude_words)

    # fall back to a default label when the article carries no category
    if 'category' in this_dict:
        category = this_dict['category']
    else:
        category = 'repo_readme'

    return {
        'title': this_dict['title'],
        'category': category,
        'original': original,
        'clean': cleaned,
    }

In [10]:
def prepare_repo_html_data(articles, extra_words = None, exclude_words = None):
    '''
    takes in the list of article dictionaries, applies prep_repo_html to each
    one, and returns the transformed data in two forms.

    articles:      iterable of article dictionaries accepted by prep_repo_html.
    extra_words:   passed through to prep_repo_html.
    exclude_words: passed through to prep_repo_html.
    returns:       (transformed_articles, df) where transformed_articles is the
                   list of prepared dictionaries and df is a DataFrame with one
                   row per prepared dictionary. An empty input yields an empty
                   list and an empty DataFrame.
    '''
    transformed_articles = [
        prep_repo_html(article, extra_words, exclude_words) for article in articles
    ]

    # Build the frame once, after all records exist. Building it inside the
    # loop redid the work on every pass and left df undefined for empty input.
    df = pd.DataFrame(transformed_articles)

    return transformed_articles, df

In [11]:
if __name__ == "__main__":
    # words to add to the stoplist, and words to keep out of it
    extra_words = ['codeup']
    exclude_words = ['']

    # fetch the articles for every repository in url_list, then prepare them
    articles = get_alt(url_list)
    transformed_data, df = prepare_repo_html_data(
        articles, extra_words, exclude_words
    )

In [15]:
# Preview only the first prepared record. The records have the keys
# 'title', 'category', 'original' and 'clean', so indexing one with ''
# raises KeyError.
transformed_data[:1]

[{'title': 'microsoft/Terminal',
  'category': 'repo_readme',
  'original': 'Welcome! This repository contains the source code for:\n\nWindows Terminal\nThe Windows console host (conhost.exe)\nComponents shared between the two projects\nColorTool\nSample projects that show how to consume the Windows Console APIs\n\nBuild Status\n\n\n\nProject\nBuild Status\n\n\n\n\nTerminal\n\n\n\nColorTool\n\n\n\n\nTerminal & Console Overview\nPlease take a few minutes to review the overview below before diving into the code:\nWindows Terminal\nWindows Terminal is a new, modern, feature-rich, productive terminal application for command-line users. It includes many of the features most frequently requested by the Windows command-line community including support for tabs, rich text, globalization, configurability, theming & styling, and more.\nThe Terminal will also need to meet our goals and measures to ensure it remains fast, and efficient, and doesn\'t consume vast amounts of memory or power.\nThe Wi

In [12]:
# if __name__ == "__main__":
#     print(df)

In [13]:
# if __name__ == "__main__":
#     print(transformed_data)

In [14]:
# if __name__ == "__main__":
#     for index in range(len(transformed_data)):
#         print ('index:', index)
#         for key in transformed_data[index]:
#             print('key:', key)
#             print(transformed_data[index][key])
#             print('*******************')
#         print ('$$$$$$$$$$$$$$$$$$$$$$$$$$$$$')