## Social-Media-Users-Dataset
### The Ultimate Dummy Users Dataset for Social Media Recommendation Models

In [1]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings("ignore")

## Loading the data

In [2]:
df = pd.read_csv('SocialMediaUsersDataset.csv')
df.head()

Unnamed: 0,UserID,Name,Gender,DOB,Interests,City,Country
0,1,Jesse Lawhorn,Female,1958-10-15,"'Movies', 'Fashion', 'Fashion', 'Books'",Sibolga,Indonesia
1,2,Stacy Payne,Female,2004-07-21,"'Gaming', 'Finance and investments', 'Outdoor ...",Al Abyār,Libya
2,3,Katrina Nicewander,Female,2000-02-07,"'DIY and crafts', 'Music', 'Science', 'Fashion'",Wādī as Sīr,Jordan
3,4,Eric Yarbrough,Male,1985-04-14,"'Outdoor activities', 'Cars and automobiles'",Matera,Italy
4,5,Daniel Adkins,Female,1955-09-18,"'Politics', 'History'",Biruaca,Venezuela


### selecting a subset of the data

In [3]:
df = df[:5000]

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000 entries, 0 to 4999
Data columns (total 7 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   UserID     5000 non-null   int64 
 1   Name       5000 non-null   object
 2   Gender     5000 non-null   object
 3   DOB        5000 non-null   object
 4   Interests  5000 non-null   object
 5   City       5000 non-null   object
 6   Country    5000 non-null   object
dtypes: int64(1), object(6)
memory usage: 273.6+ KB


In [5]:
df.duplicated().sum()

0

In [6]:
df.columns = df.columns.str.lower()
df = df[['userid','name','gender','interests','city','country']]

In [7]:
df.head()

Unnamed: 0,userid,name,gender,interests,city,country
0,1,Jesse Lawhorn,Female,"'Movies', 'Fashion', 'Fashion', 'Books'",Sibolga,Indonesia
1,2,Stacy Payne,Female,"'Gaming', 'Finance and investments', 'Outdoor ...",Al Abyār,Libya
2,3,Katrina Nicewander,Female,"'DIY and crafts', 'Music', 'Science', 'Fashion'",Wādī as Sīr,Jordan
3,4,Eric Yarbrough,Male,"'Outdoor activities', 'Cars and automobiles'",Matera,Italy
4,5,Daniel Adkins,Female,"'Politics', 'History'",Biruaca,Venezuela


### concatenating the info. of persons into one column so as to perform text preprocessing

In [8]:
df.gender = df.gender.apply(lambda x: x.split())
df.interests = df.interests.apply(lambda x: x.split())
df.city = df.city.apply(lambda x: x.split())
df.country = df.country.apply(lambda x: x.split())

In [9]:
df['details'] = df['gender'] + df['interests'] + df['city'] + df['country']
df.head()

Unnamed: 0,userid,name,gender,interests,city,country,details
0,1,Jesse Lawhorn,[Female],"['Movies',, 'Fashion',, 'Fashion',, 'Books']",[Sibolga],[Indonesia],"[Female, 'Movies',, 'Fashion',, 'Fashion',, 'B..."
1,2,Stacy Payne,[Female],"['Gaming',, 'Finance, and, investments',, 'Out...","[Al, Abyār]",[Libya],"[Female, 'Gaming',, 'Finance, and, investments..."
2,3,Katrina Nicewander,[Female],"['DIY, and, crafts',, 'Music',, 'Science',, 'F...","[Wādī, as, Sīr]",[Jordan],"[Female, 'DIY, and, crafts',, 'Music',, 'Scien..."
3,4,Eric Yarbrough,[Male],"['Outdoor, activities',, 'Cars, and, automobil...",[Matera],[Italy],"[Male, 'Outdoor, activities',, 'Cars, and, aut..."
4,5,Daniel Adkins,[Female],"['Politics',, 'History']",[Biruaca],[Venezuela],"[Female, 'Politics',, 'History', Biruaca, Vene..."


In [10]:
df.drop(['gender','interests','city','country'],axis=1,inplace=True)
df.head()

Unnamed: 0,userid,name,details
0,1,Jesse Lawhorn,"[Female, 'Movies',, 'Fashion',, 'Fashion',, 'B..."
1,2,Stacy Payne,"[Female, 'Gaming',, 'Finance, and, investments..."
2,3,Katrina Nicewander,"[Female, 'DIY, and, crafts',, 'Music',, 'Scien..."
3,4,Eric Yarbrough,"[Male, 'Outdoor, activities',, 'Cars, and, aut..."
4,5,Daniel Adkins,"[Female, 'Politics',, 'History', Biruaca, Vene..."


In [11]:
df['details'] = df['details'].apply(lambda x:" ".join(x))

### performing text preprocessing on the user's info

In [12]:
import re
def preprocess_text(sent):
    tag_pattern = re.compile(r'<.*?>')
    url_pattern = re.compile(r'https?://\S+|www\.\S+')
    # Removing Punctuation & Special Characters
    sent = re.sub('[^a-zA-Z]',' ',sent)
    # Removal of HTML Tags
    sent = re.sub(tag_pattern, '', sent)
    # lowercasing
    sent = sent.lower()
    # removing single character
    sent = re.sub(r"\s+[a-zA-Z]\s+",' ',sent)
    # removing multiple spaces
    sent = re.sub(r'\s+',' ',sent)
    # Removal of URLs
    sent = re.sub(url_pattern,'',sent)
    return sent

In [13]:
df['details'] = df['details'].apply(preprocess_text)

In [14]:
df.head()

Unnamed: 0,userid,name,details
0,1,Jesse Lawhorn,female movies fashion fashion books sibolga in...
1,2,Stacy Payne,female gaming finance and investments outdoor ...
2,3,Katrina Nicewander,female diy and crafts music science fashion d ...
3,4,Eric Yarbrough,male outdoor activities cars and automobiles m...
4,5,Daniel Adkins,female politics history biruaca venezuela


In [15]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import sigmoid_kernel,cosine_similarity
from scipy.sparse import csr_matrix

### creating numerical representation of the users' info

In [16]:
#creating a matrix
vect = TfidfVectorizer()
matrix = vect.fit_transform(df['details'])

In [17]:
matrix.shape

(5000, 5285)

In [18]:
matrix

<5000x5285 sparse matrix of type '<class 'numpy.float64'>'
	with 40214 stored elements in Compressed Sparse Row format>

### creating cosine similarity of the matrix

In [19]:
cosine = cosine_similarity(matrix)

In [20]:
cosine.shape

(5000, 5000)

In [21]:
cosine

array([[1.        , 0.01484567, 0.1082227 , ..., 0.13064241, 0.        ,
        0.01615376],
       [0.01484567, 1.        , 0.02583084, ..., 0.01732188, 0.06967475,
        0.0461512 ],
       [0.1082227 , 0.02583084, 1.        , ..., 0.0731007 , 0.        ,
        0.18056014],
       ...,
       [0.13064241, 0.01732188, 0.0731007 , ..., 1.        , 0.        ,
        0.01884816],
       [0.        , 0.06967475, 0.        , ..., 0.        , 1.        ,
        0.        ],
       [0.01615376, 0.0461512 , 0.18056014, ..., 0.01884816, 0.        ,
        1.        ]])

In [22]:
crs = csr_matrix(matrix)

### performing recomendation

In [23]:
#making recommendation
def reccomend(new_person):
    # obtaining the index of the new person from the dataframe
    ind = df[df['name'] == new_person].index[0]
    #'cosine[ind]' obtains the similarity for the 'new person','list(enumerate())'  creates a list of an 
    # iterable that produces pairs of (index, value),'sorted' sorts the list, 'reverse=True' sorts
    # the list in descending order rather than default ascending order,'key = lambda x:x[1]' specifies that 
    # list should be sorted using the similarity not its index i.e [(3,0.98),(2,0.67),(1,0.45)]
    distance = sorted(list(enumerate(cosine[ind])),reverse = True,key = lambda x: x[1])
    for i in distance[1:6]:
        print(df['name'].iloc[i[0]])

In [24]:
reccomend('Raymond Brodersen')

Katrina Nicewander
Beverly Harris
James Sass
Noel Hardwick
Matthew Davis


### saving objects

In [25]:
import pickle
pickle.dump(df,open('facebook.pkl','wb'))
pickle.dump(cosine,open('facebooksim.pkl','wb'))