## importing necessary Libraries and dataset

In [3]:
import pandas as pd
import numpy as np
from datetime import datetime
from sklearn.preprocessing import StandardScaler, MultiLabelBinarizer
from sklearn.metrics.pairwise import cosine_similarity
import requests

In [4]:
influencer_df=pd.read_csv("influencers.csv") # Content Data

In [5]:
influencer_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 7 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   S.no           1000 non-null   int64 
 1   username       1000 non-null   object
 2   followers      1000 non-null   object
 3   Country        996 non-null    object
 4   viewers        1000 non-null   object
 5   Average views  1000 non-null   object
 6   Categories     1000 non-null   object
dtypes: int64(1), object(6)
memory usage: 54.8+ KB


In [6]:
influencer_df.head()

Unnamed: 0,S.no,username,followers,Country,viewers,Average views,Categories
0,1,cristiano,477.9M,India,5M,6.2M,"'Parenting and family', 'Art', 'History'"
1,2,kyliejenner,368.1M,United States,3.5M,5.5M,"'Finance and investments', 'Travel', 'Pets'"
2,3,arianagrande,329.6M,United States,2.9M,4M,"'Science', 'Movies', 'Beauty'"
3,4,leomessi,358.6M,Indonesia,2.7M,3.5M,"'Cars and automobiles', 'Beauty', 'Health and ..."
4,5,zendaya,151.1M,United States,4.3M,5.8M,"'Parenting and family', 'Beauty', 'Art'"


## Data Preprocessing for influencer data

**Dealing with duplicate data**

In [7]:
print(f'{influencer_df.duplicated().sum()} duplicates detected')

0 duplicates detected


**Encoding text labels**

In [8]:
influencer_df["Categories"]=influencer_df["Categories"].apply(lambda x: x.split(", "))

In [9]:
mlb = MultiLabelBinarizer()
content_cats = pd.DataFrame(mlb.fit_transform(influencer_df['Categories']), 
                      columns=[col.replace("'", "").replace('"', '') for col in mlb.classes_]
                     )

In [10]:
influencer_df.drop(columns=['Categories','S.no'], inplace=True)

**Encoding engagement data**

In [11]:
def unit_stdzer(value):
    if value.endswith('K'):
        return float(value[:-1])
    elif value.endswith('M'):
        return float(value[:-1]) * 1000
    else:
        return float(value)

In [12]:
influencer_df.rename(columns={'followers':'followers(in K)', 'Average views':'avg views(in K)' ,'viewers':'viewers(in K)'},inplace=True)

In [13]:
num_cols=['followers(in K)', 'avg views(in K)','viewers(in K)']
for col in num_cols:
    influencer_df[col] = influencer_df[col].apply(unit_stdzer)

In [14]:
scaler = StandardScaler()
influencer_df[num_cols] = scaler.fit_transform(influencer_df[num_cols])

**Encoding demographic data**

In [15]:
# Treating null values
influencer_df['Country']=influencer_df['Country'].replace({None:influencer_df['Country'].mode()[0]})

Apply the function to the influencer data


In [16]:
influencer_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 5 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   username         1000 non-null   object 
 1   followers(in K)  1000 non-null   float64
 2   Country          1000 non-null   object 
 3   viewers(in K)    1000 non-null   float64
 4   avg views(in K)  1000 non-null   float64
dtypes: float64(3), object(2)
memory usage: 39.2+ KB


In [17]:
def get_lat_long(country, geo_cache):
    if country in geo_cache:
        return geo_cache[country]
    
    api_key="aa7564528057489eb30768cf163e7012" # your opencage api key
    url = f"https://api.opencagedata.com/geocode/v1/json?q={country}&key={api_key}"
    
    try:
        response = requests.get(url)
        response.raise_for_status()  # Check if the request was successful
        data = response.json()
        
        if data['results']:
            lat = int(data['results'][0]['geometry']['lat'])
            long = int(data['results'][0]['geometry']['lng'])
            geo_cache[country] = (lat, long)  # Cache the result for future use
            return lat, long
        else:
            return None, None
    except requests.exceptions.RequestException as e:
        print(f"Error fetching lat-long for {country}: {e}")
        return None, None

In [18]:
influencer_lat = []
influencer_long = []
geo_cache={}
for index, row in influencer_df.iterrows():
    lat, long = get_lat_long(row['Country'],geo_cache)
    influencer_lat.append(lat)
    influencer_long.append(long)

influencer_df['latitude'] = influencer_lat
influencer_df['longitude'] = influencer_long

In [19]:
influencer_df.drop(columns=["Country"],inplace=True)

In [20]:
influencer_df.head()

Unnamed: 0,username,followers(in K),viewers(in K),avg views(in K),latitude,longitude
0,cristiano,10.023948,4.612757,5.077109,22,78
1,kyliejenner,7.583637,3.051233,4.424813,39,-100
2,arianagrande,6.727972,2.426624,3.027035,39,-100
3,leomessi,7.372499,2.218421,2.56111,-2,117
4,zendaya,2.760799,3.884046,4.704368,39,-100


# Data Preprocessing users data

In [21]:
usersdf=pd.read_csv("SocialMediaUsers.csv") # User Data 

In [22]:
usersdf.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 7 columns):
 #   Column     Non-Null Count   Dtype 
---  ------     --------------   ----- 
 0   UserID     100000 non-null  int64 
 1   Name       100000 non-null  object
 2   Gender     100000 non-null  object
 3   DOB        100000 non-null  object
 4   Interests  100000 non-null  object
 5   City       100000 non-null  object
 6   Country    100000 non-null  object
dtypes: int64(1), object(6)
memory usage: 5.3+ MB


In [23]:
usersdf['Interests']=usersdf['Interests'].apply(lambda x: x.split(", "))

In [24]:
usersdf.head()

Unnamed: 0,UserID,Name,Gender,DOB,Interests,City,Country
0,1,Jesse Lawhorn,Female,1958-10-15,"['Movies', 'Fashion', 'Fashion', 'Books']",Sibolga,Indonesia
1,2,Stacy Payne,Female,2004-07-21,"['Gaming', 'Finance and investments', 'Outdoor...",Al Abyār,Libya
2,3,Katrina Nicewander,Female,2000-02-07,"['DIY and crafts', 'Music', 'Science', 'Fashion']",Wādī as Sīr,Jordan
3,4,Eric Yarbrough,Male,1985-04-14,"['Outdoor activities', 'Cars and automobiles']",Matera,Italy
4,5,Daniel Adkins,Female,1955-09-18,"['Politics', 'History']",Biruaca,Venezuela


**Encoding User preference data**

In [25]:
user_preferences=pd.DataFrame(mlb.fit_transform(usersdf["Interests"]),
                    columns=[col.replace("'", "").replace('"', '') for col in mlb.classes_])
usersdf.drop(columns=['Interests'], inplace=True)

In [26]:
user_preferences

Unnamed: 0,Art,Beauty,Books,Business and entrepreneurship,Cars and automobiles,Cooking,DIY and crafts,Education and learning,Fashion,Finance and investments,...,Outdoor activities,Parenting and family,Pets,Photography,Politics,Science,Social causes and activism,Sports,Technology,Travel
0,0,0,1,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,1,...,1,0,0,0,0,0,0,0,0,1
2,0,0,0,0,0,0,1,0,1,0,...,0,0,0,0,0,1,0,0,0,0
3,0,0,0,0,1,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99995,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
99996,0,0,0,0,1,1,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
99997,0,0,0,1,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
99998,0,0,0,1,0,0,0,0,1,0,...,0,1,0,1,0,0,0,0,0,0


**Encoding Demographic Data of user**

In [27]:
usersdf['DOB'] = pd.to_datetime(usersdf['DOB'])
usersdf['DOB'] = usersdf['DOB'].apply(lambda x: datetime.now().year - x.year)
usersdf.rename(columns={'DOB':'Age'}, inplace=True)

converting country data to latitudes and longitudes

In [28]:
user_lat = []
user_long = []
geo_cache={}
for index, row in usersdf.iterrows():
    lat, long = get_lat_long(row['Country'],geo_cache)
    user_lat.append(lat)
    user_long.append(long)

usersdf['latitude'] = user_lat
usersdf['longitude'] = user_long

In [29]:
usersdf.drop(columns=["UserID","City","Country","Gender"],inplace=True)

In [30]:
usersdf.head(10)

Unnamed: 0,Name,Age,latitude,longitude
0,Jesse Lawhorn,67,-2,117
1,Stacy Payne,21,26,18
2,Katrina Nicewander,25,31,36
3,Eric Yarbrough,40,42,12
4,Daniel Adkins,70,8,-66
5,Diane Jara,58,39,-100
6,Sheryl Morgan,56,54,-3
7,William Harper,60,14,29
8,Virginia Varron,41,25,42
9,Charles Figueroa,22,22,78


## Generating Weights

computing distances between locations of users and influencers navigating on a map using euclidean instead of haversine or geodesics to reduce computational overhead

In [31]:
def euclidean_dist_matrix(user_locs, influencer_locs):
    diff = user_locs[:, np.newaxis, :] - influencer_locs[np.newaxis, :, :]
    dist_matrix = np.sqrt(np.sum(diff ** 2, axis=-1))
    with np.errstate(divide='ignore', invalid='ignore'):
        preference_matrix = np.where(dist_matrix != 0, 1.0 / dist_matrix, 0.0)
    return preference_matrix

Normalize the weights

In [32]:
user_locs = np.array(usersdf[['latitude', 'longitude']].to_numpy())
influencer_locs = np.array(influencer_df[['latitude', 'longitude']].to_numpy())

In [33]:
dist_matrix = euclidean_dist_matrix(user_locs, influencer_locs)

In [34]:
dist_matrix

array([[0.02183739, 0.00452818, 0.00452818, ..., 0.01235227, 0.02544933,
        0.        ],
       [0.01662975, 0.00842361, 0.00842361, ..., 0.03713907, 0.00913594,
        0.00971974],
       [0.02328101, 0.00734025, 0.00734025, ..., 0.12126781, 0.01097246,
        0.01143324],
       ...,
       [0.00632329, 0.01319794, 0.01319794, ..., 0.00759825, 0.00478403,
        0.00525697],
       [0.00741555, 0.0147282 , 0.0147282 , ..., 0.00942474, 0.00538257,
        0.00587585],
       [0.01510822, 0.00835366, 0.00835366, ..., 0.03184649, 0.00915929,
        0.00893713]])

converting engagement data to numpy array

In [35]:
engagement_data = influencer_df[num_cols].to_numpy()
engagement_scores = np.mean(engagement_data, axis=1)
engagement_scores = engagement_scores.reshape(-1, 1)

computing similarity between each user with each influencer using Cosine similarity
$$
\text{cosine similarity} = \cos(\theta) = \frac{\mathbf{A} \cdot \mathbf{B}}{\|\mathbf{A}\| \|\mathbf{B}\|}
$$

In [36]:
cos_sim = cosine_similarity(user_preferences, content_cats)

In [37]:
combined_scores = dist_matrix.dot(engagement_scores) + cos_sim

In [38]:
combined_scores

array([[ 0.22058993,  0.22058993,  0.55392327, ...,  0.22058993,
         0.22058993,  0.55392327],
       [-1.03020996, -0.45285969, -1.03020996, ..., -0.74153483,
        -0.74153483, -0.74153483],
       [-1.13704802, -1.13704802, -0.84837288, ..., -1.13704802,
        -1.13704802, -1.13704802],
       ...,
       [-1.01513837, -1.01513837, -1.01513837, ..., -0.60689008,
        -1.01513837, -1.01513837],
       [ 0.01633842, -0.24186047, -0.24186047, ...,  0.01633842,
        -0.24186047, -0.24186047],
       [-1.18563174, -0.77738345, -1.18563174, ..., -1.18563174,
        -1.18563174, -1.18563174]])

## Generating Recommendations

In [39]:
user_index_map = {name: idx for idx, name in enumerate(usersdf['Name'])} #hash map for faster index lookups
user_name = "Charles Figueroa" 
user_idx = user_index_map.get(user_name, None)

In [40]:
if user_idx is not None:
    top_n = 10
    influencer_indices = np.argpartition(combined_scores[user_idx], -top_n)[-top_n:]  
    influencer_indices = influencer_indices[np.argsort(combined_scores[user_idx][influencer_indices])[::-1]] 
    top_influencers = influencer_df.iloc[influencer_indices]

    recommended_influencers = list(top_influencers['username'])
    print(f"Recommended influencers for {user_name}:")  
    for i, influencer in enumerate(recommended_influencers, start=1):
        print(f"{i}: {influencer}")  
else:
    print(f"User '{user_name}' not found in the dataset.")

Recommended influencers for Charles Figueroa:
1: antony00
2: everyone_woo
3: pooorblack
4: twentyonepilots
5: jeonghaniyoo_n
6: sunnyleone
7: skuukzky
8: badbunnypr
9: ridwankamil
10: l7nnon
