In [157]:
import numpy as np
import pandas as pd

In [158]:
profiles = pd.read_csv('Profile Complete.csv')

In [159]:
profiles.shape

(613, 12)

In [160]:
profiles.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 613 entries, 0 to 612
Data columns (total 12 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   Confirmation Number      613 non-null    int64  
 1   Services Offered         262 non-null    object 
 2   Target Entity            250 non-null    object 
 3   Interest Areas           0 non-null      float64
 4   Investors                32 non-null     object 
 5   Hiring Requirement       239 non-null    object 
 6   GeoLocation              605 non-null    object 
 7   Job Function             605 non-null    object 
 8   Investment Range         19 non-null     object 
 9   Investment Requirement   165 non-null    object 
 10  Services Required        316 non-null    object 
 11  Companies Raising Money  341 non-null    object 
dtypes: float64(1), int64(1), object(10)
memory usage: 57.6+ KB


In [161]:
profiles.describe()

Unnamed: 0,Confirmation Number,Interest Areas
count,613.0,0.0
mean,105190300.0,
std,2847814.0,
min,100003700.0,
25%,102708700.0,
50%,105460600.0,
75%,107587200.0,
max,109996400.0,


In [162]:
profiles.head()

Unnamed: 0,Confirmation Number,Services Offered,Target Entity,Interest Areas,Investors,Hiring Requirement,GeoLocation,Job Function,Investment Range,Investment Requirement,Services Required,Companies Raising Money
0,107076259,,,,,"Supply Chain, Operations & Fulfillment\nResear...",North America\nAfrica\nIndia\nMiddle East,Other\nMerchandising,,More than $10 million,Event/Experiential Services\nAugmented Reality...,True
1,102671127,,"Established Retailer or Brand\nInvestor (VC, P...",,True,,Asia - Other\nChina\nUK & Ireland\nNordics\nIn...,"Data, Analytics & Insights\nCustomer Experienc...",$1 million+ to $5 million\nMore than $10 milli...,,,
2,103470157,,,,,"Research, Content & Journalism\nCustomer Exper...",France & Benelux\nChina\nAfrica\nNordics,Technology & IT,,,,False
3,100663726,Financing Solutions\nCrypto Payments Solutions...,,,,,Africa\nCentral & South America\nIndia\nAustra...,"Strategy, Innovation & Transformation\nSupply ...",,More than $10 million,Customer Feedback Solutions\nRetargeting Solut...,True
4,109243522,,Industry Association or Nonprofit\nInvestor (V...,,,,Southern Europe\nOther Europe\nNordics\nIndia\...,"Environmental, Social & Corporate Governance (...",,,,


In [163]:
profiles.isnull().sum()

Confirmation Number          0
Services Offered           351
Target Entity              363
Interest Areas             613
Investors                  581
Hiring Requirement         374
GeoLocation                  8
Job Function                 8
Investment Range           594
Investment Requirement     448
Services Required          297
Companies Raising Money    272
dtype: int64

Data Cleaning

In [164]:
# Replace spaces with commas in the desired column(s)
profiles['Services Offered'] = profiles['Services Offered'].str.replace('\n', ',')
profiles['Target Entity'] = profiles['Target Entity'].str.replace('\n', ',')
profiles['GeoLocation'] = profiles['GeoLocation'].str.replace('\n', ',')
profiles['Hiring Requirement'] = profiles['Hiring Requirement'].str.replace('\n', ',')
profiles['Services Required'] = profiles['Services Required'].str.replace('\n', ',')
profiles['Job Function'] = profiles['Job Function'].str.replace('\n', ',')
# Save the modified DataFrame to a new CSV file
#profiles.to_csv('modified_profile.csv', index=False)

In [165]:
# profiles['Services Offered'] = profiles['Services Offered'].astype('object')
# profiles['GeoLocation'] = profiles['GeoLocation'].astype('object')
# profiles['Services Required'] = profiles['Services Required'].astype('object')
profiles.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 613 entries, 0 to 612
Data columns (total 12 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   Confirmation Number      613 non-null    int64  
 1   Services Offered         262 non-null    object 
 2   Target Entity            250 non-null    object 
 3   Interest Areas           0 non-null      float64
 4   Investors                32 non-null     object 
 5   Hiring Requirement       239 non-null    object 
 6   GeoLocation              605 non-null    object 
 7   Job Function             605 non-null    object 
 8   Investment Range         19 non-null     object 
 9   Investment Requirement   165 non-null    object 
 10  Services Required        316 non-null    object 
 11  Companies Raising Money  341 non-null    object 
dtypes: float64(1), int64(1), object(10)
memory usage: 57.6+ KB


In [166]:
targets=["Services Offered","GeoLocation","Services Required"]

# Define the logic to create the 'Important Features' column

In [167]:
def get_important_features(row):
    if row['Companies Raising Money']:
        # Condition 1: Companies Raising Money is True
        features = [row['Services Offered'], row['GeoLocation']]
    elif row['Investors']:
        # Condition 2: Investors is True
        features = [row['Services Required'], row['GeoLocation']]
    elif pd.isnull(row['Services Offered']) and not pd.isnull(row['Services Required']):
        # Condition 3: Services Offered is empty, Services Required has values
        features = [row['Services Required'], row['GeoLocation']]
    elif pd.isnull(row['Services Required']) and not pd.isnull(row['Services Offered']):
        # Condition 4: Services Required is empty, Services Offered has values
        features = [row['Services Offered'], row['GeoLocation']]
    elif pd.isnull(row['Investors']):
        # Condition 5: Investors is empty, Services Offered is empty, Services Required has values
        features = [row['Services Required'], row['GeoLocation']]
    elif pd.isnull(row['Companies Raising Money']):
        # Condition 6: Companies Raising Money is empty, Services Required is empty, Services Offered has values
        features = [row['Services Offered'], row['GeoLocation']]
    else:
        # None of the conditions are met
        features = []

    # Remove empty and NaN values from the features list
    features = [f for f in features if pd.notnull(f) and f]

    # Join the features with commas and return the result
    return ', '.join(features)

# Apply the logic to create the 'Important Features' column

In [168]:
profiles['Important Features'] = profiles.apply(get_important_features, axis=1)

# Save the modified DataFrame to a new CSV file
profiles.to_csv('imp_modified_file.csv', index=False)

In [169]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity 

In [170]:
vectorizer=CountVectorizer()
profilesVector=vectorizer.fit_transform(profiles["Important Features"])
profilesVector

<613x268 sparse matrix of type '<class 'numpy.int64'>'
	with 12464 stored elements in Compressed Sparse Row format>

In [171]:
similarity=cosine_similarity(profilesVector)
similarity

array([[1.        , 0.39605902, 0.18257419, ..., 0.49746834, 0.30996521,
        0.        ],
       [0.39605902, 1.        , 0.32539569, ..., 0.7599606 , 0.47352028,
        0.        ],
       [0.18257419, 0.32539569, 1.        , ..., 0.38924947, 0.1940285 ,
        0.        ],
       ...,
       [0.49746834, 0.7599606 , 0.38924947, ..., 1.        , 0.64196668,
        0.        ],
       [0.30996521, 0.47352028, 0.1940285 , ..., 0.64196668, 1.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ]])

In [172]:
# Set the desired column as the index
# profiles = profiles.set_index('Confirmation Number')
profiles.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 613 entries, 0 to 612
Data columns (total 13 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   Confirmation Number      613 non-null    int64  
 1   Services Offered         262 non-null    object 
 2   Target Entity            250 non-null    object 
 3   Interest Areas           0 non-null      float64
 4   Investors                32 non-null     object 
 5   Hiring Requirement       239 non-null    object 
 6   GeoLocation              605 non-null    object 
 7   Job Function             605 non-null    object 
 8   Investment Range         19 non-null     object 
 9   Investment Requirement   165 non-null    object 
 10  Services Required        316 non-null    object 
 11  Companies Raising Money  341 non-null    object 
 12  Important Features       613 non-null    object 
dtypes: float64(1), int64(1), object(11)
memory usage: 62.4+ KB


In [173]:
conf_no = 100663726
index=profiles[profiles["Confirmation Number"]==conf_no].index[0]
index

3

In [174]:
similar_profiles=list(enumerate(similarity[index]))
similar_profiles=sorted(similar_profiles,key=lambda x:x[1],reverse=True)
similar_profiles[:]

[(3, 0.9999999999999999),
 (384, 0.816871722955756),
 (340, 0.8164965809277258),
 (11, 0.8164965809277257),
 (522, 0.7973815320737668),
 (466, 0.7740702698132101),
 (439, 0.7693218186208297),
 (609, 0.7693218186208297),
 (610, 0.7693218186208297),
 (146, 0.7673643458069224),
 (494, 0.7620007620011432),
 (273, 0.757772228311384),
 (260, 0.757614408414158),
 (127, 0.7499999999999999),
 (467, 0.7499999999999999),
 (181, 0.7484551991837487),
 (303, 0.7470178808339961),
 (190, 0.7462025072446363),
 (21, 0.7454128010052141),
 (252, 0.7426106572325054),
 (472, 0.7385489458759966),
 (483, 0.7372097807744858),
 (270, 0.7364853795464745),
 (6, 0.7359800721939873),
 (23, 0.7359800721939873),
 (46, 0.7359800721939873),
 (54, 0.7359800721939873),
 (159, 0.7359800721939873),
 (208, 0.7359800721939873),
 (246, 0.7359800721939873),
 (264, 0.7359800721939873),
 (322, 0.7359800721939873),
 (477, 0.7359800721939873),
 (510, 0.7359800721939873),
 (539, 0.7359800721939873),
 (572, 0.7359800721939873),
 (57

In [175]:
def prGreen(skk): print("\033[92m {}\033[00m" .format(skk))
def prRed(skk): print("\033[91m {}\033[00m" .format(skk))
def prYellow(skk): print("\033[93m {}\033[00m" .format(skk))

In [176]:
profiles.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 613 entries, 0 to 612
Data columns (total 13 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   Confirmation Number      613 non-null    int64  
 1   Services Offered         262 non-null    object 
 2   Target Entity            250 non-null    object 
 3   Interest Areas           0 non-null      float64
 4   Investors                32 non-null     object 
 5   Hiring Requirement       239 non-null    object 
 6   GeoLocation              605 non-null    object 
 7   Job Function             605 non-null    object 
 8   Investment Range         19 non-null     object 
 9   Investment Requirement   165 non-null    object 
 10  Services Required        316 non-null    object 
 11  Companies Raising Money  341 non-null    object 
 12  Important Features       613 non-null    object 
dtypes: float64(1), int64(1), object(11)
memory usage: 62.4+ KB


In [177]:
r_profiles=[]

for i in range(len(similar_profiles)):
    profile_index = similar_profiles[i][0]
    profile = profiles.loc[profile_index]
    
    if profiles.loc[index, 'Companies Raising Money']:
        # Condition 1: Companies Raising Money is True
        if not profile['Companies Raising Money']:
            r_profiles.append(profile['Confirmation Number'])
    elif profiles.loc[index, 'Investors']:
        # Condition 2: Investor is True
        if not profile['Investors']:
            r_profiles.append(profile['Confirmation Number'])
    elif pd.isnull(profiles.loc[index, 'Companies Raising Money']) and pd.isnull(profiles.loc[index, 'Investors']):
        # Condition 3: Both Companies Raising Money and Investor are empty
        r_profiles.append(profile['Confirmation Number'])

# for i in range(len(similar_profiles)):
#     profile_index = similar_profiles[i][0]
#     profile = profiles.loc[profile_index]
#     r_profiles.append(profile['Confirmation Number'])

r_profiles = r_profiles[1:]

In [179]:
prYellow(f"Recommended Profiles for confirmation {conf_no}, Details  :")
print("Services Offered",end='->')
prRed(profiles.loc[profiles['Confirmation Number'] == conf_no, 'Services Offered'].values)
print("Services Required",end='->')
prYellow(profiles.loc[profiles['Confirmation Number'] == conf_no, 'Services Required'].values)
print("GeoLocation",end='->')
prYellow(profiles.loc[profiles['Confirmation Number'] == conf_no, 'GeoLocation'].values)
print("Investor",end='->')
prYellow(profiles.loc[profiles['Confirmation Number'] == conf_no, 'Investors'].values)
print("Companies Raising Money",end='->')
prYellow(profiles.loc[profiles['Confirmation Number'] == conf_no, 'Companies Raising Money'].values)
print("\n")
for profile in r_profiles:
    prGreen('Confirmation No --> ')
    prGreen(profile)
    print("Services Offered",end='->')
    prRed(profiles.loc[profiles['Confirmation Number'] == profile, 'Services Offered'].values)
    print("Services Required",end='->')
    prYellow(profiles.loc[profiles['Confirmation Number'] == profile, 'Services Required'].values)
    print("GeoLocation",end='->')
    prYellow(profiles.loc[profiles['Confirmation Number'] == profile, 'GeoLocation'].values)
    print("Investor",end='->')
    prYellow(profiles.loc[profiles['Confirmation Number'] == profile, 'Investors'].values)
    print("Companies Raising Money",end='->')
    prYellow(profiles.loc[profiles['Confirmation Number'] == profile, 'Companies Raising Money'].values)

[93m Recommended Profiles for confirmation 100663726, Details  :[00m
Services Offered->[91m ['Financing Solutions,Crypto Payments Solutions,Data Warehousing'][00m
Services Required->[93m ['Customer Feedback Solutions,Retargeting Solutions,Sustainability Solutions,Content Management Systems'][00m
GeoLocation->[93m ['Africa,Central & South America,India,Australia & New Zealand,Nordics,China,Middle East,Asia - Other,Other Europe,France & Benelux,Germany, Austria & Switzerland'][00m
Investor->[93m [nan][00m
Companies Raising Money->[93m [True][00m


[92m Confirmation No --> [00m
[92m 104091921[00m
Services Offered->[91m ['Cross-Border Ecommerce Platforms,Associate Mobility Solutions,NFTs & Tokens,Ecommerce Platforms,Reconciliation & Reporting Solutions,Network & Communications (inc. RFID & WiFi),Automated & Self Checkout Solutions'][00m
Services Required->[93m ['Packaging Solutions,Crypto Payments Solutions,Consumer Sentiment & Reviews,Sourcing Solutions & Services'][00