In [None]:
%matplotlib inline

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from collections import defaultdict
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [None]:
pwd

'/content'

In [75]:
# read CSVs
mentor = pd.read_csv(r"/content/mentor.csv", index_col=False)
mentor.head(3)

Unnamed: 0,Timestamp,Name,Identity,Domain,Experience,Company
0,9/17/2023 12:43:51,Harshi,10,Tech,3,XTech
1,9/17/2023 12:50:10,Kunal,11,Tech,5,Visa
2,9/17/2023 12:51:27,Alex,12,Finance,4,MoneyX


In [76]:
mentor.index

RangeIndex(start=0, stop=10, step=1)

In [77]:
mentor.shape

mentor.isnull().any()

Timestamp     False
Name          False
Identity      False
Domain        False
Experience    False
Company       False
dtype: bool

In [78]:
mentor['Domain'].value_counts()

Tech           3
Finance        2
Education      2
Robotics       2
Agriculture    1
Name: Domain, dtype: int64

In [79]:
#Data frame for user
data = pd.read_csv(r"/content/startup.csv",index_col = False)
data.head(1)

Unnamed: 0,Timestamp,Name,Identity,Domain,Experience
0,9/17/2023 12:45:20,CodeCollab,1,Tech,1


In [80]:
data.shape

(8, 5)

In [82]:
df1=pd.DataFrame(mentor, columns=['Identity','Name','Domain','Experience'])
df1.iloc[0:3]

Unnamed: 0,Identity,Name,Domain,Experience
0,10,Harshi,Tech,3
1,11,Kunal,Tech,5
2,12,Alex,Finance,4


In [83]:
df2=pd.DataFrame(data, columns=['Identity','Name','Domain','Experience'])
df2.iloc[0:]

Unnamed: 0,Identity,Name,Domain,Experience
0,1,CodeCollab,Tech,1
1,2,Invictus,Robotics,1
2,3,Zerodha,"Tech, Finance",3
3,4,ABCDEF,Agriculture,1
4,5,Qris,Tech,1
5,6,Papercoin,"Tech, Finance",2
6,7,Waterloss,Agriculture,4
7,8,Roblox,"Tech, Robotics",4


In [84]:
df = df2.append(df1,ignore_index = True)
df.head(5)

  df = df2.append(df1,ignore_index = True)


Unnamed: 0,Identity,Name,Domain,Experience
0,1,CodeCollab,Tech,1
1,2,Invictus,Robotics,1
2,3,Zerodha,"Tech, Finance",3
3,4,ABCDEF,Agriculture,1
4,5,Qris,Tech,1


In [85]:
#create a list of important columns to keep
features = ['Domain','Experience']
mentor[features].head(3)

Unnamed: 0,Domain,Experience
0,Tech,3
1,Tech,5
2,Finance,4


In [86]:
#clean and process the data
for feature in features:
 df[feature] = df[feature].fillna('')  #fill any missing value with the empty string

In [87]:
#create a function to combine the values of the important columns
def combine_features(row):
    return row['Domain']+" "+str(row['Experience'])

In [88]:
#apply the function to each row in the data set to store the combined strings into a new column called combined_features
df['combined_features'] = df.apply(combine_features, axis = 1)

In [89]:
df['combined_features'].head(10)

0              Tech 1
1          Robotics 1
2     Tech, Finance 3
3       Agriculture 1
4              Tech 1
5     Tech, Finance 2
6       Agriculture 4
7    Tech, Robotics 4
8              Tech 3
9              Tech 5
Name: combined_features, dtype: object

In [None]:
!pip install normalise

Collecting normalise
  Downloading normalise-0.1.8-py3-none-any.whl (15.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m15.7/15.7 MB[0m [31m78.6 MB/s[0m eta [36m0:00:00[0m
Collecting roman (from normalise)
  Downloading roman-4.1-py3-none-any.whl (5.5 kB)
Installing collected packages: roman, normalise
Successfully installed normalise-0.1.8 roman-4.1


In [None]:
!pip install nltk



In [67]:
import numpy as np
import multiprocessing as mp
import nltk
nltk.download('brown')
nltk.download('names')
import string
import spacy
import en_core_web_sm
from nltk.tokenize import word_tokenize
from sklearn.base import TransformerMixin, BaseEstimator

# from normalise import normalise

[nltk_data] Downloading package brown to /root/nltk_data...
[nltk_data]   Package brown is already up-to-date!
[nltk_data] Downloading package names to /root/nltk_data...
[nltk_data]   Package names is already up-to-date!


In [90]:
nlp = en_core_web_sm.load()


class TextPreprocessor(BaseEstimator, TransformerMixin):
    def __init__(self,
                 variety="BrE",
                 user_abbrevs={},
                 n_jobs=1):
        """
        Text preprocessing transformer includes steps:
            1. Text normalization
            2. Punctuation removal
            3. Stop words removal
            4. Lemmatization

        variety - format of date (AmE - american type, BrE - british format)
        user_abbrevs - dict of user abbreviations mappings (from normalise package)
        n_jobs - parallel jobs to run
        """
        self.variety = variety
        self.user_abbrevs = user_abbrevs
        self.n_jobs = n_jobs

    def fit(self, X, y=None):
        return self

    def transform(self, X, *_):
        X_copy = X.copy()

        partitions = 1
        cores = mp.cpu_count()
        if self.n_jobs <= -1:
            partitions = cores
        elif self.n_jobs <= 0:
            return X_copy.apply(self._preprocess_text)
        else:
            partitions = min(self.n_jobs, cores)

        data_split = np.array_split(X_copy, partitions)
        pool = mp.Pool(cores)
        data = pd.concat(pool.map(self._preprocess_part, data_split))
        pool.close()
        pool.join()

        return data

    def _preprocess_part(self, part):
        return part.apply(self._preprocess_text)

    def _preprocess_text(self, text):
        normalized_text = self._normalize(text)
        doc = nlp(normalized_text)
        removed_punct = self._remove_punct(doc)
        removed_stop_words = self._remove_stop_words(removed_punct)
        return self._lemmatize(removed_stop_words)

    def _normalize(self, text):
        # some issues in normalise package
        try:
            return ' '.join(normalise(text, variety=self.variety, user_abbrevs=self.user_abbrevs, verbose=False))
        except:
            return text

    def _remove_punct(self, doc):
        return [t for t in doc if t.text not in string.punctuation]

    def _remove_stop_words(self, doc):
        return [t for t in doc if not t.is_stop]

    def _lemmatize(self, doc):
        return ' '.join([t.lemma_ for t in doc])

In [91]:
%%time
text = TextPreprocessor(n_jobs=-1).transform(df['combined_features'])
#df['combined_features'].head(10)

CPU times: user 9.18 ms, sys: 41 ms, total: 50.1 ms
Wall time: 172 ms


In [92]:
text = TextPreprocessor(n_jobs=-1).transform(df['combined_features'])

In [93]:
df['combined_features'].head(18)

0               Tech 1
1           Robotics 1
2      Tech, Finance 3
3        Agriculture 1
4               Tech 1
5      Tech, Finance 2
6        Agriculture 4
7     Tech, Robotics 4
8               Tech 3
9               Tech 5
10           Finance 4
11       Agriculture 2
12         Education 5
13          Robotics 3
14         Education 2
15              Tech 4
16           Finance 3
17          Robotics 3
Name: combined_features, dtype: object

In [94]:
#convert a collection of text to a matrix of token counts
count_matrix = CountVectorizer().fit_transform(df['combined_features'])
count_matrix

<18x5 sparse matrix of type '<class 'numpy.int64'>'
	with 21 stored elements in Compressed Sparse Row format>

In [95]:
#get the cosine similarity matrix from the count matrix
cosine_sim = cosine_similarity(count_matrix)
print(cosine_sim)

[[1.         0.         0.70710678 0.         1.         0.70710678
  0.         0.70710678 1.         1.         0.         0.
  0.         0.         0.         1.         0.         0.        ]
 [0.         1.         0.         0.         0.         0.
  0.         0.70710678 0.         0.         0.         0.
  0.         1.         0.         0.         0.         1.        ]
 [0.70710678 0.         1.         0.         0.70710678 1.
  0.         0.5        0.70710678 0.70710678 0.70710678 0.
  0.         0.         0.         0.70710678 0.70710678 0.        ]
 [0.         0.         0.         1.         0.         0.
  1.         0.         0.         0.         0.         1.
  0.         0.         0.         0.         0.         0.        ]
 [1.         0.         0.70710678 0.         1.         0.70710678
  0.         0.70710678 1.         1.         0.         0.
  0.         0.         0.         1.         0.         0.        ]
 [0.70710678 0.         1.         0.  

In [96]:
#get the number of rows and columns in cosine_sim
cosine_sim.shape

(18, 18)

In [97]:
#helper function to get the name from the id
def name_identity(identity):
    return df[df.Identity == identity]['Name'].values[0]

In [98]:
#helper function to get the id from the name
def identity_name(name):
    return df[df.Name == name]['Identity'].values[0]

In [99]:
#the name of the user
user = df.loc[0,"Name"]
user

'CodeCollab'

In [100]:
#Find that name id
name_id = identity_name(user)
name_id

1

In [101]:
id_name = name_identity(3)
id_name

'Zerodha'

In [102]:
#enumerate through all the similarity scores of the 'user' to make a tuple of id and similarity index
similar_mentors = list( enumerate(cosine_sim[name_id]) )

In [103]:
similar_mentors

[(0, 0.0),
 (1, 1.0),
 (2, 0.0),
 (3, 0.0),
 (4, 0.0),
 (5, 0.0),
 (6, 0.0),
 (7, 0.7071067811865475),
 (8, 0.0),
 (9, 0.0),
 (10, 0.0),
 (11, 0.0),
 (12, 0.0),
 (13, 1.0),
 (14, 0.0),
 (15, 0.0),
 (16, 0.0),
 (17, 1.0)]

In [104]:
#sort the list of similar mentors in the order of similarity scores in descending order
sorted_similar_mentors = sorted(similar_mentors, key = lambda x:x[1], reverse = True)[2:]

In [105]:
#print
sorted_similar_mentors

[(17, 1.0),
 (7, 0.7071067811865475),
 (0, 0.0),
 (2, 0.0),
 (3, 0.0),
 (4, 0.0),
 (5, 0.0),
 (6, 0.0),
 (8, 0.0),
 (9, 0.0),
 (10, 0.0),
 (11, 0.0),
 (12, 0.0),
 (14, 0.0),
 (15, 0.0),
 (16, 0.0)]

In [128]:
i = 0
print ('The top 5 similar mentor to '+user+' are:')
for element in sorted_similar_mentors:
    print(element[0])
    i = i + 1
    if i >= 5:
        break


The top 5 similar mentor to CodeCollab are:
17
7
0
2
3


In [129]:
identities =[x[0] for x in sorted_similar_mentors[:5]]
identities

[17, 7, 0, 2, 3]

In [124]:
x=identities

In [125]:
result={"Name":[],"Identity":[]}
for i in x :
    result['Name'].append(df.loc[i,"Name"])
    result['Identity'].append(df.loc[i,"Identity"])

In [126]:
import pandas as pd
result=pd.DataFrame(result)

In [None]:
result

In [None]:
match = pd.DataFrame(data=result)
match.to_csv(f"./Data/match.csv")