In [15]:
import numpy as numpy
import pandas as pd 
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import LabelEncoder

In [16]:
df = pd.read_csv("TopStaredRepositories.csv")

In [17]:
df.head()

Unnamed: 0,Username,Repository Name,Description,Last Update Date,Language,Number of Stars,Tags,Url
0,freeCodeCamp,freeCodeCamp,The https://freeCodeCamp.com open source codeb...,2017-06-24T15:56:17Z,JavaScript,290k,"nonprofits,certification,curriculum,react,node...",https://github.com/freeCodeCamp/freeCodeCamp
1,twbs,bootstrap,"The most popular HTML, CSS, and JavaScript fra...",2017-06-24T15:40:21Z,JavaScript,112k,"javascript,css,html,bootstrap,jekyll-site,scss",https://github.com/twbs/bootstrap
2,EbookFoundation,free-programming-books,Freely available programming books,2017-06-23T01:09:34Z,,87.8k,"education,list,books,resource",https://github.com/EbookFoundation/free-progra...
3,facebook,react,"A declarative, efficient, and flexible JavaScr...",2017-06-24T19:33:49Z,JavaScript,69.7k,,https://github.com/facebook/react
4,d3,d3,"Bring data to life with SVG, Canvas and HTML.",2017-05-31T06:03:47Z,JavaScript,65.7k,visualization,https://github.com/d3/d3


## Remove the columns we do not need

In [18]:
# Drop the "Last Update Date" column. Rebinding `df` instead of mutating it in
# place, together with errors="ignore", keeps this cell safe to re-run: a second
# execution no longer raises KeyError because the column is already gone.
df = df.drop(columns=["Last Update Date"], errors="ignore")

In [19]:
df.head()

Unnamed: 0,Username,Repository Name,Description,Language,Number of Stars,Tags,Url
0,freeCodeCamp,freeCodeCamp,The https://freeCodeCamp.com open source codeb...,JavaScript,290k,"nonprofits,certification,curriculum,react,node...",https://github.com/freeCodeCamp/freeCodeCamp
1,twbs,bootstrap,"The most popular HTML, CSS, and JavaScript fra...",JavaScript,112k,"javascript,css,html,bootstrap,jekyll-site,scss",https://github.com/twbs/bootstrap
2,EbookFoundation,free-programming-books,Freely available programming books,,87.8k,"education,list,books,resource",https://github.com/EbookFoundation/free-progra...
3,facebook,react,"A declarative, efficient, and flexible JavaScr...",JavaScript,69.7k,,https://github.com/facebook/react
4,d3,d3,"Bring data to life with SVG, Canvas and HTML.",JavaScript,65.7k,visualization,https://github.com/d3/d3


In [20]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 980 entries, 0 to 979
Data columns (total 7 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   Username         980 non-null    object
 1   Repository Name  980 non-null    object
 2   Description      977 non-null    object
 3   Language         877 non-null    object
 4   Number of Stars  980 non-null    object
 5   Tags             489 non-null    object
 6   Url              980 non-null    object
dtypes: object(7)
memory usage: 53.7+ KB


In [21]:
# Fit a label encoder on the repository URLs and store the resulting integer
# labels in a new "url_id" column.
encoder = LabelEncoder()
df["url_id"] = encoder.fit_transform(df["Url"])

In [22]:
df.head()

Unnamed: 0,Username,Repository Name,Description,Language,Number of Stars,Tags,Url,url_id
0,freeCodeCamp,freeCodeCamp,The https://freeCodeCamp.com open source codeb...,JavaScript,290k,"nonprofits,certification,curriculum,react,node...",https://github.com/freeCodeCamp/freeCodeCamp,409
1,twbs,bootstrap,"The most popular HTML, CSS, and JavaScript fra...",JavaScript,112k,"javascript,css,html,bootstrap,jekyll-site,scss",https://github.com/twbs/bootstrap,922
2,EbookFoundation,free-programming-books,Freely available programming books,,87.8k,"education,list,books,resource",https://github.com/EbookFoundation/free-progra...,38
3,facebook,react,"A declarative, efficient, and flexible JavaScr...",JavaScript,69.7k,,https://github.com/facebook/react,385
4,d3,d3,"Bring data to life with SVG, Canvas and HTML.",JavaScript,65.7k,visualization,https://github.com/d3/d3,308


## Now, let us create a function that combines each repository's language and tags into a single feature string (the description is not used)

In [42]:
def combined_feature(data):
    """Build one lowercase, space-separated feature string per row.

    For every row the "Language" value and the comma-separated "Tags" value
    are merged into a single token list; the tokens are lowercased and joined
    with single spaces.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame providing the "Language" and "Tags" columns.

    Returns
    -------
    list of str
        One feature string per row, in row order. A row whose language and
        tags are both missing yields an empty string.
    """
    features = []

    # Walk both columns by position. Looking values up with data[col][i] is a
    # label-based access and breaks (KeyError / wrong row) as soon as the frame
    # does not carry the default 0..n-1 index, e.g. after filtering.
    for language, tags in zip(data["Language"], data["Tags"]):
        raw = str(language) + " " + str(tags).replace(",", " ")

        # Missing values stringify to "nan"; drop those tokens before
        # lowercasing, exactly as the original implementation did.
        tokens = [token.lower() for token in raw.split() if token != "nan"]

        features.append(" ".join(tokens))

    return features

In [43]:
df["combined"] = combined_feature(df)

In [44]:
df["combined"][400]

'javascript async-functions assert ava javascript test-runner tap concurrency babel es2015 nodejs cli async performance unicorns unit-testing cli-app node tdd testing test-framework'

In [46]:
cm = CountVectorizer().fit_transform(df["combined"])

In [48]:
cm.shape

(980, 1377)

In [49]:
cs = cosine_similarity(cm)

In [53]:
cs[8][:10]

array([0.34684399, 0.47809144, 0.        , 0.75592895, 0.53452248,
       0.30237158, 0.        , 0.        , 1.        , 0.75592895])

In [77]:
df.iloc[3]

Username                                                    facebook
Repository Name                                                react
Description        A declarative, efficient, and flexible JavaScr...
Language                                                  JavaScript
Number of Stars                                                69.7k
Tags                                                             NaN
Url                                https://github.com/facebook/react
url_id                                                           385
combined                                                  javascript
Name: 3, dtype: object

In [85]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import numpy as np
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\SAKET\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.


True

In [120]:
def cosine_similarity(user, current):
    """Cosine similarity between two whitespace-separated token strings.

    Each string is reduced to its set of distinct tokens and treated as a
    binary indicator vector over the union of both sets. For such vectors the
    dot product is the number of shared tokens and each norm is the square
    root of the set size, so no explicit vectors need to be built.

    NOTE: this definition rebinds the name ``cosine_similarity`` that the
    first cell imports from ``sklearn.metrics.pairwise``. Cells that call the
    sklearn version on a matrix must run before this cell.

    Parameters
    ----------
    user : str
        Query tokens separated by whitespace.
    current : str
        Candidate tokens separated by whitespace.

    Returns
    -------
    float
        Similarity in [0, 1]; 0.0 when either string contains no tokens.
    """
    user_tokens = set(user.split())
    current_tokens = set(current.split())

    # An empty token set has norm 0; return a float zero (the original
    # returned the int literal -0) rather than dividing by zero.
    if not user_tokens or not current_tokens:
        return 0.0

    shared = len(user_tokens & current_tokens)
    norm_product = np.sqrt(len(user_tokens)) * np.sqrt(len(current_tokens))
    return shared / norm_product

**Worked example.** Let $x = (3, 2, 0, 5)$ and $y = (1, 0, 0, 0)$.

The cosine similarity of two vectors is their dot product divided by the product of their norms:

$$\cos(x, y) = \frac{x \cdot y}{\lVert x \rVert \, \lVert y \rVert}$$

$$x \cdot y = 3 \cdot 1 + 2 \cdot 0 + 0 \cdot 0 + 5 \cdot 0 = 3$$

$$\lVert x \rVert = \sqrt{3^2 + 2^2 + 0^2 + 5^2} = \sqrt{38} \approx 6.16$$

$$\lVert y \rVert = \sqrt{1^2 + 0^2 + 0^2 + 0^2} = 1$$

$$\therefore \cos(x, y) = \frac{3}{6.16 \times 1} \approx 0.49$$

In [125]:
def find_similar_repos(user, data, top_n=15):
    """Return the repositories whose features best match the ``user`` query.

    Every row's "combined" string is scored against ``user`` with
    ``cosine_similarity(user, combined)`` and the highest scores are kept.

    Parameters
    ----------
    user : str
        Query string passed as the first argument to ``cosine_similarity``.
    data : pandas.DataFrame
        Frame providing the columns "combined", "Repository Name", "Url",
        "Description", "Language" and "Number of Stars".
    top_n : int, optional
        Maximum number of entries to return (default 15, the previously
        hard-coded value).

    Returns
    -------
    dict
        ``{score: [name, url, description, language, stars]}`` ordered from
        the highest score to the lowest.

    Notes
    -----
    The result is keyed by score, so rows with an identical score share one
    key and only the last such row (in frame order) is kept. The result may
    therefore hold fewer than ``top_n`` repositories and hides ties.
    TODO(review): returning a list of (score, repo) pairs would avoid this,
    but changes the return shape callers currently receive.
    """
    detail_columns = ["Repository Name", "Url", "Description", "Language", "Number of Stars"]

    # Iterate the needed columns in lockstep instead of materialising a row
    # Series with data.iloc[i] on every step.
    rows = zip(data["combined"], *(data[column] for column in detail_columns))

    repos = {}
    for combined, *details in rows:
        score = cosine_similarity(user, combined)
        repos[score] = details  # a later row with the same score overwrites

    best_scores = sorted(repos, reverse=True)[:top_n]
    return {score: repos[score] for score in best_scores}

In [126]:
user = "python java cpp"
find_similar_repos(user, df)

{0.5773502691896258: ['Calligraphy',
  'https://github.com/chrisjenx/Calligraphy',
  'Custom fonts in Android the easy way...',
  'Java',
  '6.4k'],
 0.47140452079103173: ['selenium',
  'https://github.com/SeleniumHQ/selenium',
  'A browser automation framework and ecosystem.',
  'JavaScript',
  '6.9k'],
 0.40824829046386296: ['infer',
  'https://github.com/facebook/infer',
  'A static analyzer for Java, C, C++, and Objective-C',
  'OCaml',
  '6.9k'],
 0.33333333333333337: ['pelican',
  'https://github.com/getpelican/pelican',
  'Static site generator that supports Markdown and reST syntax. Powered by Python.',
  'Python',
  '7k'],
 0.2886751345948129: ['agera',
  'https://github.com/google/agera',
  'Reactive Programming for Android',
  'Java',
  '6.6k'],
 0.2581988897471611: ['BottomBar',
  'https://github.com/roughike/BottomBar',
  'A custom view component that mimics the new Material Design Bottom Navigation pattern.',
  'Java',
  '6.5k'],
 0.23570226039551587: ['BaseRecyclerViewAd

## Let us reduce the dataset to the columns we require, to remove the clutter

In [127]:
df.head()

Unnamed: 0,Username,Repository Name,Description,Language,Number of Stars,Tags,Url,url_id,combined
0,freeCodeCamp,freeCodeCamp,The https://freeCodeCamp.com open source codeb...,JavaScript,290k,"nonprofits,certification,curriculum,react,node...",https://github.com/freeCodeCamp/freeCodeCamp,409,javascript nonprofits certification curriculum...
1,twbs,bootstrap,"The most popular HTML, CSS, and JavaScript fra...",JavaScript,112k,"javascript,css,html,bootstrap,jekyll-site,scss",https://github.com/twbs/bootstrap,922,javascript javascript css html bootstrap jekyl...
2,EbookFoundation,free-programming-books,Freely available programming books,,87.8k,"education,list,books,resource",https://github.com/EbookFoundation/free-progra...,38,education list books resource
3,facebook,react,"A declarative, efficient, and flexible JavaScr...",JavaScript,69.7k,,https://github.com/facebook/react,385,javascript
4,d3,d3,"Bring data to life with SVG, Canvas and HTML.",JavaScript,65.7k,visualization,https://github.com/d3/d3,308,javascript visualization


In [129]:
# Keep only the columns that find_similar_repos reports ("Repository Name", "Url",
# "Description", "Language", "Number of Stars") together with "combined".
# drop() is called without inplace, so `df` itself is left unchanged.
new_df = df.drop(["Username", "Tags", "url_id"], axis="columns")

In [130]:
new_df.head()

Unnamed: 0,Repository Name,Description,Language,Number of Stars,Url,combined
0,freeCodeCamp,The https://freeCodeCamp.com open source codeb...,JavaScript,290k,https://github.com/freeCodeCamp/freeCodeCamp,javascript nonprofits certification curriculum...
1,bootstrap,"The most popular HTML, CSS, and JavaScript fra...",JavaScript,112k,https://github.com/twbs/bootstrap,javascript javascript css html bootstrap jekyl...
2,free-programming-books,Freely available programming books,,87.8k,https://github.com/EbookFoundation/free-progra...,education list books resource
3,react,"A declarative, efficient, and flexible JavaScr...",JavaScript,69.7k,https://github.com/facebook/react,javascript
4,d3,"Bring data to life with SVG, Canvas and HTML.",JavaScript,65.7k,https://github.com/d3/d3,javascript visualization


In [131]:
new_df.to_csv("augmented_data.csv")