In [1]:
import pandas as pd

In [2]:
df = pd.read_csv(r'C:\Users\ssayc\HandsOnRec\data\projectinfo.csv')

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3000 entries, 0 to 2999
Data columns (total 31 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   fullname             3000 non-null   object 
 1   analyzed             3000 non-null   object 
 2   cloneurl             3000 non-null   object 
 3   commits_count        3000 non-null   int64  
 4   created_at           3000 non-null   object 
 5   default_branch       3000 non-null   object 
 6   description          2968 non-null   object 
 7   errormsg             677 non-null    object 
 8   error_commits_count  3000 non-null   int64  
 9   filtered             3000 non-null   object 
 10  filterinfo           677 non-null    object 
 11  fork                 3000 non-null   object 
 12  forks_count          3000 non-null   int64  
 13  homepage             1778 non-null   object 
 14  language             3000 non-null   object 
 15  lastcommit           2987 non-null   o

In [4]:
projects = df[['name', 'numauthors', 'tf', 'open_issues', 'forks_count', 'numfiles', 'mainlanguage', 'description']]

In [5]:
projects['name'].astype("|S")

0       b'Android-Bootstrap'
1            b'mediaelement'
2             b'redis-store'
3                b'magento2'
4               b'spinnaker'
                ...         
2995               b'mlpack'
2996          b'cleanflight'
2997        b'awesome_print'
2998                 b'xiki'
2999    b'devise_token_auth'
Name: name, Length: 3000, dtype: bytes328

In [6]:
projects.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3000 entries, 0 to 2999
Data columns (total 8 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   name          2999 non-null   object 
 1   numauthors    3000 non-null   int64  
 2   tf            2992 non-null   float64
 3   open_issues   3000 non-null   int64  
 4   forks_count   3000 non-null   int64  
 5   numfiles      3000 non-null   int64  
 6   mainlanguage  2998 non-null   object 
 7   description   2968 non-null   object 
dtypes: float64(1), int64(4), object(3)
memory usage: 152.4+ KB


In [7]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [8]:
#Define a TF-IDF Vectorizer Object. Remove all english stopwords
tfidf = TfidfVectorizer(stop_words='english')

In [9]:
projects['description'] = projects['description'].fillna('')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  projects['description'] = projects['description'].fillna('')


In [10]:
#Construct the required TF-IDF matrix by applying the fit_transform method on the overview feature
tfidf_matrix = tfidf.fit_transform(projects['description'])

#Output the shape of tfidf_matrix
tfidf_matrix.shape

(3000, 5610)

In [11]:
# Import linear_kernel to compute the dot product
from sklearn.metrics.pairwise import linear_kernel

# Compute the cosine similarity matrix
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)

In [12]:
#Construct a reverse mapping of indices and movie titles, and drop duplicate titles, if any
indices = pd.Series(projects.index, index=projects['name']).drop_duplicates()

In [13]:
# Function that takes in project name as input and gives recommendations 
def content_recommender(name, cosine_sim=cosine_sim, df=projects, indices=indices):
    # Obtain the index of the project that matches the name
    idx = indices[name]

    # Get the pairwsie similarity scores of all projects with that project
    # And convert it into a list of tuples as described above
    sim_scores = list(enumerate(cosine_sim[idx]))

    # Sort the projects based on the cosine similarity scores
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)

    # Get the scores of the 10 most similar projects. Ignore the first one.
    sim_scores = sim_scores[1:11]

    # Get the project indices
    movie_indices = [i[0] for i in sim_scores]

    # Return the top 10 most similar projects
    return df['name'].iloc[movie_indices]

In [14]:
projects['name'].tail(50)

2950    react-redux-universal-hot-example
2951                               portia
2952                              vue-cli
2953                            dropplets
2954                              Buttons
2955                       openFrameworks
2956                             or-tools
2957                    Android-Week-View
2958                      capybara-webkit
2959                          vagrant-aws
2960                                  bfs
2961                              Lasagne
2962                                jinja
2963                              laravel
2964                                   v8
2965                           TimelineJS
2966                           searchable
2967                         imagesloaded
2968               recyclerview-animators
2969                          node-canvas
2970                               statsd
2971                           youtube-dl
2972                               uBlock
2973                    DesignPatt

In [22]:
content_recommender('zulip')

2017          groupdate
276        mysql-server
1907        LittleProxy
526              botman
2252        Rocket.Chat
2244          lets-chat
2980          rethinkdb
2467             unmark
187     graylog2-server
919            webshell
Name: name, dtype: object