In [None]:
import re
import unicodedata
import pandas as pd
import numpy as np
import nltk
import os
import nltk.corpus
from nltk.tokenize import word_tokenize
from nltk.probability import FreqDist
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics.pairwise import linear_kernel, cosine_similarity

In [None]:
# Load the package metadata table. NOTE(review): the absolute "/content" path
# presumably targets a Colab runtime -- confirm before running elsewhere.
df = pd.read_csv("/content/data.csv")

In [None]:
# Inspect the column names of the loaded table.
df.columns

Index(['index', 'fullname', 'name', 'description', 'gh_readme', 'author',
       'stargazers_count', 'watchers_count', 'forks_count', 'topics'],
      dtype='object')

In [None]:
# Combine the package name and description into one text field.
# A missing value is NaN, and `str + NaN` evaluates to NaN, which would discard
# the other half as well; treat missing parts as empty strings instead.
df["name_description"] = (
    df['name'].fillna('') + ' ' + df['description'].fillna('')
).str.strip()

In [None]:
# Preview the combined name + description column.
df["name_description"]

0                               np A better `npm publish`
1       awesome-micro-npm-packages A curated list of s...
2              awesome-npm Awesome npm resources and tips
3       npm-home Open the npm page, Yarn page, or GitH...
4                   sinopia Private npm repository server
                              ...                        
1059    dotnet-version-cli dotnet version cli (similar...
1060    cut-release A tool that helps you make faster ...
1061        npm-utils node security project npm utilities
1062    07-node-npm Let's learn how to integrate Node,...
1063    web3-auth NPM for signing into Express apps us...
Name: name_description, Length: 1064, dtype: object

In [None]:
def _parse_topics(raw):
    """Turn the stringified topic list stored in the CSV into a list of str.

    The raw cell looks like ``"['cli', 'npm']"``: the outer brackets and the
    outermost quotes are stripped and the remainder is split on ``"', '"``.

    - ``"[]"`` yields an empty list (not ``['']``).
    - A value that is already a list is returned unchanged, so re-running the
      cell does not corrupt the column.
    - Any other non-string value (e.g. NaN for a missing cell) yields ``[]``.
    """
    if isinstance(raw, list):
        return raw
    if not isinstance(raw, str):
        return []
    inner = raw[1:-1][1:-1]
    return inner.split("', '") if inner else []


df['topics'] = df['topics'].apply(_parse_topics)

In [None]:
# Display the DataFrame after the name_description and topics columns were set.
df

Unnamed: 0,index,fullname,name,description,gh_readme,author,stargazers_count,watchers_count,forks_count,topics,name_description
0,5,sindresorhus/np,np,A better `npm publish`,,sindresorhus,6922,6922,326,"[cli, cli-app, javascript, nodejs, npm, npm-pa...",np A better `npm publish`
1,12,parro-it/awesome-micro-npm-packages,awesome-micro-npm-packages,"A curated list of small, focused npm packages.",,parro-it,4388,4388,458,[],awesome-micro-npm-packages A curated list of s...
2,13,sindresorhus/awesome-npm,awesome-npm,Awesome npm resources and tips,,sindresorhus,4149,4149,261,"[awesome, awesome-list, list, node, nodejs, np...",awesome-npm Awesome npm resources and tips
3,343,sindresorhus/npm-home,npm-home,"Open the npm page, Yarn page, or GitHub repo o...",,sindresorhus,180,180,9,[],"npm-home Open the npm page, Yarn page, or GitH..."
4,4,rlidwka/sinopia,sinopia,Private npm repository server,`sinopia` - a private/caching npm repository s...,rlidwka,5482,5482,690,[],sinopia Private npm repository server
...,...,...,...,...,...,...,...,...,...,...,...
1059,2992,skarpdev/dotnet-version-cli,dotnet-version-cli,dotnet version cli (similar to npm version cli),[![Build status](https://ci.appveyor.com/api/p...,skarpdev,34,34,10,"[cli, dotnet, dotnet-core, patch, tool, versio...",dotnet-version-cli dotnet version cli (similar...
1060,2994,bjoerge/cut-release,cut-release,A tool that helps you make faster npm releases,# cut-release\n\nA command line tool that help...,bjoerge,108,108,6,[],cut-release A tool that helps you make faster ...
1061,2995,nodesecurity/npm-utils,npm-utils,node security project npm utilities,# node security project npm utilities\n\n## Me...,nodesecurity,11,11,7,[],npm-utils node security project npm utilities
1062,2998,amphib24/07-node-npm,07-node-npm,"Let's learn how to integrate Node, npm, and an...",## ![CF](https://i.imgur.com/7v5ASc8.png) Lab ...,amphib24,0,0,21,[],"07-node-npm Let's learn how to integrate Node,..."


In [None]:
# Fetch the NLTK resources this notebook requests: the stop-word list, WordNet,
# the punkt tokenizer models, the English word list and omw-1.4.
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')
nltk.download('words')
nltk.download('omw-1.4')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package words to /root/nltk_data...
[nltk_data]   Unzipping corpora/words.zip.
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


True

In [None]:
# English stop words as a set; remove_stopwords() tests membership against it.
corpus = set(stopwords.words('english'))
# `words` is the NLTK word-list corpus used by remove_non_english_word().
from nltk.corpus import words

In [None]:
def remove_code_blocks(string):
    """Drop the contents of triple-backtick fenced blocks from ``string``.

    Splitting on the fence marker leaves the text outside fences at even
    positions and the fenced content at odd positions; only the even pieces
    are kept and they are re-joined with a single space.
    """
    outside_fences = string.split('```')[::2]
    return ' '.join(outside_fences)

In [None]:
def nlp_basic_clean(string):
    """
    Normalise raw text into lowercase words separated by single spaces.

    Steps, in order: lowercase; NFKD-normalise and drop non-ASCII characters;
    replace HTML-like tags, newlines and ``http...`` URLs with spaces; remove
    triple-backtick fenced blocks; replace every character that is not
    ``a-z``, an apostrophe or whitespace (so digits are removed too) with a
    space; collapse whitespace runs and strip the ends.

    Parameters
    ----------
    string : str
        The raw text to clean.

    Returns
    -------
    str
        The cleaned text. Nothing is printed: this function runs once per
        document, so echoing the result would flood the notebook output.
    """
    cleaned_string = string.lower()
    cleaned_string = unicodedata.normalize("NFKD", cleaned_string).encode("ascii", "ignore").decode("utf-8", "ignore")
    cleaned_string = re.sub('<[^<]+?>', ' ', cleaned_string)
    cleaned_string = re.sub(r"\n", " ", cleaned_string)
    cleaned_string = re.sub(r'http\S+', ' ', cleaned_string)
    cleaned_string = remove_code_blocks(cleaned_string)
    cleaned_string = re.sub(r"[^a-z'\s]", ' ', cleaned_string)
    cleaned_string = re.sub(r"\s\s+" , ' ', cleaned_string)
    return cleaned_string.strip()

In [None]:
def tokenize_text(string):
    """Return the tokens that ``word_tokenize`` produces for ``string``."""
    return word_tokenize(string)

In [None]:
# Single shared lemmatizer instance, used by text_lemmatizing().
lemmatizer = WordNetLemmatizer() 

def text_lemmatizing(val):
    """Lemmatize every token in ``val`` with the module-level ``lemmatizer``.

    Parameters
    ----------
    val : iterable of str
        Tokens to lemmatize.

    Returns
    -------
    list of str
        A new list with each token lemmatized. The input is left untouched;
        the previous implementation overwrote the caller's list in place.
    """
    return [lemmatizer.lemmatize(token) for token in val]


In [None]:
def remove_stopwords(tokenize_list):
    """Return the tokens of ``tokenize_list`` that are not in the module-level
    ``corpus`` stop-word collection, preserving their order."""
    kept = []
    for token in tokenize_list:
        if token in corpus:
            continue
        kept.append(token)
    return kept


In [None]:
from nltk.probability import FreqDist

def freq_text(token_list):
    """Build and return a ``FreqDist`` from ``token_list``."""
    return FreqDist(token_list)

def freq_perform(fdist):
    """Print the most frequent token with its count, then plot the distribution.

    Parameters
    ----------
    fdist : Counter-like frequency distribution exposing ``most_common`` and
        ``plot`` (e.g. the object returned by ``freq_text``).

    Notes
    -----
    ``max(fdist)`` returns the lexicographically largest key, not the most
    frequent one, so the token and the count are taken together from
    ``most_common(1)``. An empty distribution is reported instead of raising
    ``ValueError`` from ``max()``.
    """
    if not fdist:
        print('frequency distribution is empty')
        return

    top_token, top_count = fdist.most_common(1)[0]
    print(top_token, top_count)

    fdist.plot()

In [None]:

def remove_singleton_doubleton(tokenize_list, topics):
    """Keep tokens that appear in ``topics`` or are longer than three characters.

    Note that, despite the name, three-character tokens are dropped as well
    unless they are listed in ``topics``.
    """
    kept = []
    for token in tokenize_list:
        if token in topics or len(token) > 3:
            kept.append(token)
    return kept

# Lazily built set of known English words, shared by every call so the word
# list is read and indexed only once.
_ENGLISH_VOCABULARY = None


def remove_non_english_word(tokenize_list):
    """Keep only the tokens found in the NLTK ``words`` corpus.

    The previous version called ``words.words()`` on every invocation and
    tested membership against the resulting list, which is a linear scan per
    token. The vocabulary is now materialised once as a ``frozenset`` and
    reused; the filtering result is unchanged.

    Parameters
    ----------
    tokenize_list : iterable of str

    Returns
    -------
    list of str
        The tokens present in the word list, in their original order.
    """
    global _ENGLISH_VOCABULARY
    if _ENGLISH_VOCABULARY is None:
        _ENGLISH_VOCABULARY = frozenset(words.words())
    vocabulary = _ENGLISH_VOCABULARY
    return [token for token in tokenize_list if token in vocabulary]


In [None]:
def get_final_clean_text(text, topics):
    """Run the full cleaning pipeline on one document.

    Pipeline: ``nlp_basic_clean`` -> ``tokenize_text`` -> ``text_lemmatizing``
    -> ``remove_stopwords`` -> ``remove_singleton_doubleton`` (using
    ``topics``) -> ``remove_non_english_word``; the surviving tokens are joined
    with single spaces.

    Parameters
    ----------
    text : str or missing value
        Raw document text. Anything that is not a ``str`` (float NaN from
        pandas for an empty cell, ``None``) is treated as missing.
    topics : collection of str
        Tokens listed here survive the short-token filter.

    Returns
    -------
    str
        The cleaned text, or ``''`` when ``text`` is missing.
    """
    # Explicit type check instead of comparing str(text) with 'nan': it also
    # covers None, which would otherwise fail inside nlp_basic_clean().
    if not isinstance(text, str):
        return ''

    clean_text = nlp_basic_clean(text)
    tokenize_list = tokenize_text(clean_text)
    tokenize_list = text_lemmatizing(tokenize_list)
    tokenize_list = remove_stopwords(tokenize_list)
    tokenize_list = remove_singleton_doubleton(tokenize_list, topics)
    tokenize_list = remove_non_english_word(tokenize_list)

    return ' '.join(tokenize_list)



In [None]:
# Clean every README, passing the row's topics to the cleaning pipeline.
df['gh_read_me_train'] = df.apply(
    lambda row: get_final_clean_text(row['gh_readme'], row['topics']),
    axis=1,
)

rhd front end code and documentation live documentation site installation scripts and contributing clone the repository ask for the font awesome license text for the npmrc file ask for the alternate registry information for the npmrc file if building the repo inside of red hat if running the alternate registry and font awesome you will need to set npm config set strict ssl to false npm config set strict ssl false without that font awesome will try to use the alternate registry for installation which will not work run to install npm tracked dependencies locally install go make go built executables accessible add the go bin to path find by running go env and it would be gopath bin this is a necessary step to successfully execute publish gh pages sh or review gh pages sh alternatively you can just run hugo commands with go bin hugo run go get github com gohugoio hugo gets and builds the latest hugo release mac users run cd go src github com gohugoio hugo go install tags extended to ensure

In [None]:
# Clean every combined name + description, passing the row's topics to the pipeline.
df['name_description_train'] = df.apply(
    lambda row: get_final_clean_text(row['name_description'], row['topics']),
    axis=1,
)

np a better npm publish
awesome micro npm packages a curated list of small focused npm packages
awesome npm awesome npm resources and tips
npm home open the npm page yarn page or github repo of a package
sinopia private npm repository server
npm expansions send us a pull request by editing expansions txt
npm ship semantic release plugin to publish a npm package
npm complete npm integration for meteor
npm check check for outdated incorrect and unused dependencies
bitandbang my npm card
simple node js react npm app for an introductory tutorial on how to use jenkins to build a simple node js and react application with npm
frontend maven plugin maven node grunt gulp npm node plugin to end all maven node grunt gulp npm plugins a maven plugin that downloads installs node and npm locally runs npm install grunt gulp and or karma
npm check updates find newer versions of package dependencies than what your package json allows
phantomjs npm wrapper for installing phantomjs
license checker check n

In [None]:
# Preview the cleaned README text.
df['gh_read_me_train']

0                                                        
1                                                        
2                                                        
3                                                        
4       sinopia private repository server version badg...
                              ...                        
1059    build status version image sonar quality code ...
1060    release command line tool help make faster rel...
1061    node security project utility method function ...
1062    today introduction server side development nod...
1063    express note request prevent attack work envir...
Name: gh_read_me_train, Length: 1064, dtype: object

In [None]:
# Preview the cleaned name + description text.
df['name_description_train']

0                                 better publish
1       awesome micro package list small package
2                       awesome awesome resource
3               home open page yarn page package
4              sinopia private repository server
                          ...                   
1059             version version similar version
1060       release tool help make faster release
1061               node security project utility
1062    node learn integrate node express server
1063                                     express
Name: name_description_train, Length: 1064, dtype: object

TF - term frequency
IDF - inverse document frequency

{
a1 : 0.44,
a2: 0.23
...
a1000: 0.887
}

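Doc1: [a1, a3, a56]  => [TF-IDF of a1, 0, TF-IDF of a3, 0, ..., TF-IDF of a56, ..., 0] — a 1000 x 1 vector with one entry per vocabulary term (0 for terms that do not occur in the document)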

=====================



[100 x 100]


      doc1 doc2 doc3 ... doc100
doc1   1
doc2
doc3
...



In [None]:
def _fit_tfidf(documents):
    """Fit a word-level uni-/bi-gram TF-IDF model on ``documents``.

    Returns ``(vectorizer, matrix)``: the fitted ``TfidfVectorizer`` and the
    document-term matrix produced by ``fit_transform``.

    ``min_df=1`` replaces the original ``min_df=0``. An integer ``min_df`` is
    an absolute document count, so both values keep every term, but
    scikit-learn >= 1.2 rejects an integer below 1 with a parameter
    validation error.
    """
    vectorizer = TfidfVectorizer(analyzer='word', ngram_range=(1, 2), min_df=1, stop_words='english')
    return vectorizer, vectorizer.fit_transform(documents)


tf_name_des, tfidf_matrix_name_des = _fit_tfidf(df['name_description_train'])

tf_readme, tfidf_matrix_readme = _fit_tfidf(df['gh_read_me_train'])

In [None]:
# Pairwise document similarities. With a single argument cosine_similarity
# compares the matrix against itself, which is what the two-argument call did.
cosine_sim_name_des = cosine_similarity(tfidf_matrix_name_des)
cosine_sim_read_me = cosine_similarity(tfidf_matrix_readme)

In [None]:
# Display the name/description similarity matrix.
cosine_sim_name_des

array([[1.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 1.        , 0.20627065, ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.20627065, 1.        , ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.        , 0.        , 0.        , ..., 1.        , 0.06222126,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.06222126, 1.        ,
        0.29373973],
       [0.        , 0.        , 0.        , ..., 0.        , 0.29373973,
        1.        ]])

In [None]:
# Display the README similarity matrix.
cosine_sim_read_me

array([[0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.        , 0.        , 0.        , ..., 1.        , 0.0252908 ,
        0.00226725],
       [0.        , 0.        , 0.        , ..., 0.0252908 , 1.        ,
        0.01032665],
       [0.        , 0.        , 0.        , ..., 0.00226725, 0.01032665,
        1.        ]])

In [None]:
# Lookup from a package's `index` column value to its row label in df
# (used to address rows of the similarity matrices).
indices = pd.Series(df.index, index=df['index'])

In [None]:
# Display the index lookup.
indices

index
5          0
12         1
13         2
343        3
4          4
        ... 
2992    1059
2994    1060
2995    1061
2998    1062
2999    1063
Length: 1064, dtype: int64

In [None]:
def improved_recommendations_2(cosine_sim, title_idx, return_count = 10):
    """Return the rows most similar to the package identified by ``title_idx``.

    Parameters
    ----------
    cosine_sim : 2-D indexable of similarity scores
        ``cosine_sim[i][j]`` is the similarity between rows ``i`` and ``j``.
    title_idx : key of the module-level ``indices`` lookup
        Identifies the query package; it is translated to a row position.
    return_count : int, default 10
        Maximum number of recommendations.

    Returns
    -------
    list of int
        Row positions (not ``index`` column values) of the most similar
        packages, best first, never containing the query row itself.

    Notes
    -----
    The query row is excluded explicitly. Dropping the first sorted entry only
    works when the query ranks first; for an all-zero vector (e.g. an empty
    README) the self-similarity is 0, so an unrelated row was discarded and
    the query itself could be returned.

    A later cell redefines this name with a different signature; after that
    cell runs, this version is shadowed.
    """
    idx = indices[title_idx]
    candidates = [
        (position, score)
        for position, score in enumerate(cosine_sim[idx])
        if position != idx
    ]
    # sorted() is stable, so ties keep ascending row order as before.
    ranked = sorted(candidates, key=lambda pair: pair[1], reverse=True)
    return [position for position, _ in ranked[:return_count]]

In [None]:
import pickle

# Persist the TF-IDF matrices and their fitted vectorizers. Each file is
# opened in a `with` block so the handle is flushed and closed deterministically
# (the bare open(...) calls were never closed).
_artifacts = (
    (tfidf_matrix_name_des, 'tfidf_name_des.pickle'),
    (tf_name_des, 'tfidf_vectorizer_name_des.pickle'),
    (tfidf_matrix_readme, 'tfidf_readme.pickle'),
    (tf_readme, 'tfidf_vectorizer_readme.pickle'),
)
for _artifact, _filename in _artifacts:
    with open(_filename, 'wb') as _outfile:
        pickle.dump(_artifact, _outfile)

In [None]:
# Reload the persisted TF-IDF matrices. Each file holds one pickled object, so
# a single pickle.load() suffices; the former read-until-EOF loop silently left
# the variable as None for an empty file, whereas this raises EOFError at once.
# SECURITY: pickle.load can execute arbitrary code -- only load files this
# notebook produced.
with open("/content/tfidf_name_des.pickle", "rb") as openfile:
    tfidf_name_des = pickle.load(openfile)

with open("/content/tfidf_readme.pickle", "rb") as openfile:
    tfidf_readme = pickle.load(openfile)

In [None]:
# Recompute both similarity matrices from the reloaded TF-IDF matrices; the
# single-argument form compares each matrix against itself.
cosine_sim_name_des = cosine_similarity(tfidf_name_des)
cosine_sim_readme = cosine_similarity(tfidf_readme)

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import pickle
import pandas as pd

# When the server starts, read the pickled files to load the model.
def _read_pickle(path):
    """Return the object stored in the pickle file at ``path``."""
    # SECURITY: pickle.load can execute arbitrary code -- only use trusted files.
    with open(path, "rb") as openfile:
        return pickle.load(openfile)


def load_model(name_des_path="/content/tfidf_name_des.pickle",
               readme_path="/content/tfidf_readme.pickle"):
    """Load the two persisted TF-IDF matrices.

    Parameters
    ----------
    name_des_path : str, optional
        Pickle file with the name/description matrix. Defaults to the
        location that was previously hard-coded.
    readme_path : str, optional
        Pickle file with the README matrix. Defaults to the location that was
        previously hard-coded.

    Returns
    -------
    tuple
        ``(tfidf_name_des, tfidf_readme)``.

    Raises
    ------
    EOFError
        If a file is empty. The former read-until-EOF loop returned ``None``
        in that case. Each file is expected to hold a single pickled object.
    """
    tfidf_name_des = _read_pickle(name_des_path)
    tfidf_readme = _read_pickle(readme_path)
    return tfidf_name_des, tfidf_readme

# Load the persisted matrices and precompute both similarity tables; the
# single-argument cosine_similarity call compares a matrix against itself.
tfidf_name_des, tfidf_readme = load_model()
cosine_sim_name_des = cosine_similarity(tfidf_name_des)
cosine_sim_readme = cosine_similarity(tfidf_readme)

# Re-read the package table and rebuild the `index` value -> row label lookup.
df = pd.read_csv("/content/data.csv")
indices = pd.Series(df.index, index=df['index'])

# On every "get package detail" request: fetch the package from the database,
# take its Index field and pass it in as title_idx.
def improved_recommendations_2(title_idx, filter_by = 'readme', return_count = 10):
    """Return the rows most similar to the package identified by ``title_idx``.

    Parameters
    ----------
    title_idx : key of the module-level ``indices`` lookup
        Identifies the query package; it is translated to a row position.
    filter_by : {'readme', 'name_des'}, default 'readme'
        Which similarity table to rank by: README text or name + description.
    return_count : int, default 10
        Maximum number of recommendations to return.

    Returns
    -------
    list of int
        Row positions (not ``index`` column values) of the most similar
        packages, best first, never containing the query row itself.

    Raises
    ------
    ValueError
        If ``filter_by`` is not a supported value. This replaces an ``assert``,
        which is skipped when Python runs with ``-O``.

    Notes
    -----
    The query row is excluded explicitly. Dropping the first sorted entry only
    works when the query ranks first; for an all-zero vector (e.g. an empty
    README) the self-similarity is 0, so an unrelated row was discarded and
    the query itself could be returned.
    """
    if filter_by == 'readme':
        similarity = cosine_sim_readme
    elif filter_by == 'name_des':
        similarity = cosine_sim_name_des
    else:
        raise ValueError(
            "filter_by must be 'readme' or 'name_des', got {!r}".format(filter_by)
        )

    idx = indices[title_idx]
    candidates = [
        (position, score)
        for position, score in enumerate(similarity[idx])
        if position != idx
    ]
    # sorted() is stable, so ties keep ascending row order as before.
    ranked = sorted(candidates, key=lambda pair: pair[1], reverse=True)
    return [position for position, _ in ranked[:return_count]]

In [None]:
# Example: top-10 recommendations for the package whose `index` value is 795,
# ranked first by README similarity and then by name/description similarity.
print(improved_recommendations_2(795, 'readme', 10))
print(improved_recommendations_2(795, 'name_des' , 10))

[6, 197, 912, 76, 941, 664, 689, 226, 932, 205]
[588, 551, 1057, 214, 728, 945, 60, 695, 808, 371]
