### Imports

In [None]:
# Third-party packages used by this notebook and by the Streamlit app.
! pip install pandas numpy
! pip install -U scikit-learn
! pip install imblearn
# instaloader is imported by the scraping cell but was missing from this list.
! pip install instaloader
! pip install kaggle
! pip install streamlit
! pip install tqdm
! pip install streamlit_jupyter
! pip install streamlit-echarts
! pip install openpyxl

In [None]:
import pandas as pd
import json, glob
import numpy as np

## Instaloader API

In [None]:
from instaloader import Instaloader, Profile, ProfileNotExistsException, LoginRequiredException
L=Instaloader()
import time
from functools import lru_cache 
  
# Human-readable reason for the most recent get_info() failure;
# empty string when the last call succeeded.
ERROR=''


@lru_cache(maxsize = 100)
def _scrape_profile(username, timeout=0):
    """Fetch one profile's metadata dict; only successful fetches are cached.

    Exceptions propagate to the caller. lru_cache does not store a result
    when the wrapped function raises, so a failed lookup is retried on the
    next call instead of being remembered as ``None``.
    """
    time.sleep(timeout)
    print("Scraping ...")
    profile = Profile.from_username(L.context, username=username)
    return profile._asdict()


def get_info(username, timeout=0):
    """Return the metadata dict for ``username``, or ``None`` on failure.

    Args:
        username: Instagram username to look up.
        timeout: Seconds to sleep before an uncached lookup.

    Returns:
        The dict produced by ``Profile._asdict()``, or ``None`` when the
        profile does not exist or Instaloader requires a login. On failure
        the reason is stored in the module-level ``ERROR``; on success
        ``ERROR`` is reset to ``''``.
    """
    global ERROR
    # Clear any message left over from an earlier call so callers never read
    # a stale failure reason after a successful lookup.
    ERROR = ''
    try:
        return _scrape_profile(username, timeout)
    except ProfileNotExistsException as e:
        print(e)
        ERROR="Profile doesn't exist"
    except LoginRequiredException as e:
        print(e)
        ERROR="Need to login to use instaloader, timeout from instaloader"
    return None


# Keep the cache-management helpers that the lru_cache-decorated original exposed.
get_info.cache_info = _scrape_profile.cache_info
get_info.cache_clear = _scrape_profile.cache_clear

### Feature engineering

In [None]:
# Keys of a user record that become model columns, in column order.
# Entries marked "->" are rewritten by process_entries(); the others are
# taken from the record unchanged.
features = [
    'biography',              # -> length of the text
    'edge_followed_by',       # -> its nested 'count'
    'fbid',                   # -> truthiness (is it set?)
    'edge_follow',            # -> its nested 'count'
    'has_ar_effects',
    'has_clips',
    'has_guides',
    'has_channel',
    'highlight_reel_count',
    'is_business_account',
    'is_private',
    'is_verified',
    'profile_pic_url',        # -> truthiness (is it set?)
]

# Record keys currently left out of the model:
#   full_name, username, id,
#   blocked_by_viewer, restricted_by_viewer, country_block,
#   external_url, external_url_linkshimmed,
#   followed_by_viewer, follows_viewer,
#   has_blocked_viewer, has_requested_viewer, is_joined_recently,
#   business_category_name, overall_category_name, category_enum, category_name,
#   edge_mutual_followed_by, profile_pic_url_hd, requested_by_viewer,
#   should_show_category, connected_fb_page (candidate: is it None or not),
#   edge_felix_video_timeline, edge_owner_to_timeline_media,
#   edge_saved_media, edge_media_collections

def process_entries(ent):
    """Turn one raw user record into model-ready values.

    Works on a shallow copy, so the caller's dict is left untouched.

    Args:
        ent: Mapping for one user. Must contain ``edge_followed_by`` and
            ``edge_follow``, each a mapping with a ``'count'`` entry.
            ``biography``, ``fbid`` and ``profile_pic_url`` may be missing
            or ``None``.

    Returns:
        A new dict with every key of ``ent`` plus these rewritten values:
        ``biography`` -> text length (0 when missing/None),
        ``fbid`` / ``profile_pic_url`` -> bool (False when missing/None),
        ``edge_followed_by`` / ``edge_follow`` -> their ``'count'``.

    Raises:
        KeyError: if either edge field or its ``'count'`` is absent. These
            are not defaulted, so an incomplete record fails loudly instead
            of yielding a made-up count.
    """
    ent=ent.copy()
    # `or ''` covers both a missing key and an explicit None biography.
    ent['biography']=len(ent.get('biography') or '')
    ent['fbid']=bool(ent.get('fbid'))
    ent['profile_pic_url']=bool(ent.get('profile_pic_url'))

    ent['edge_followed_by']=ent['edge_followed_by']['count']
    ent['edge_follow']=ent['edge_follow']['count']

    # Identifier keys (full_name, id, username) are intentionally kept here.
    return ent


## Collect spam users data
https://www.kaggle.com/datasets/rezaunderfit/instagram-fake-accounts-dataset

In [None]:
# Create the download directory (-p: no error if it already exists).
! mkdir -p dataset/fake/
# Download the fake-accounts dataset into dataset/fake/ and unzip it there.
# NOTE(review): the kaggle CLI presumably needs API credentials configured
# beforehand, and --force presumably re-downloads on every run — confirm.
! kaggle datasets download -d  rezaunderfit/instagram-fake-accounts-dataset --force --unzip -p dataset/fake/

# Load the 'graphql' -> 'user' record from every downloaded JSON file.
# Paths are sorted so the list order is the same on every run, and each file
# is opened in a `with` block so its handle is closed straight away.
fakedata = []
for fake_path in sorted(glob.glob('dataset/fake/db/*.json')):
    with open(fake_path, encoding='utf-8') as fake_file:
        fakedata.append(json.load(fake_file)['graphql']['user'])

### Collect real users and scrape them
https://raw.githubusercontent.com/harshitkgupta/Fake-Profile-Detection-using-ML/master/data/users.csv

In [None]:
# Create the target directory (-p: no error if it already exists).
! mkdir -p dataset/real/ 
# Fetch the list of real users into dataset/real/. -nc (no-clobber) keeps an
# existing users.csv, which matters because the scraping cell rewrites that
# file when it drops users.
! wget -nc https://raw.githubusercontent.com/harshitkgupta/Fake-Profile-Detection-using-ML/master/data/users.csv -P dataset/real/  

In [None]:
import os
from tqdm import tqdm

# Cache of scraped real profiles, and the CSV of candidate usernames.
realjson='dataset/real/real.json'
realcsv='dataset/real/users.csv'

# Reuse profiles scraped by a previous run, if any.
if os.path.isfile(realjson):
    with open(realjson) as cache_file:
        realdata = json.load(cache_file)
else:
    realdata = []
    print('One time setup to be done, this may take a while...')

# A set gives O(1) "already scraped?" checks inside the loop.
savedusers = {row['username'] for row in realdata}

MINDATA=50   # minimum number of real profiles wanted in the cache
MAXSCRAPE=0  # raise this to scrape more real users than the MINDATA floor
n_scrape = max(MAXSCRAPE, MINDATA-len(savedusers))
realdf = pd.read_csv(realcsv)

i=0  # number of profiles scraped successfully in this run
for realuser in tqdm(realdf["screen_name"].sample(frac=1).reset_index(drop=True)):
    if i >= n_scrape:
        break
    print(realuser)
    if realuser in savedusers:
        continue

    ret=get_info(realuser)
    if ret:
        realdata.append(ret)
        savedusers.add(realuser)
        # Persist after every profile so an interrupted run loses nothing.
        with open(realjson, 'w') as cache_file:
            json.dump(realdata, cache_file)
        print(f'[{i}]: Scraped and saved')
        i+=1
    elif ERROR == "Profile doesn't exist":
        # Only a confirmed non-existent profile is removed from the candidates.
        realdf=realdf[realdf["screen_name"] != realuser]
        realdf.to_csv(realcsv, index=False)
    else:
        # Any other failure (e.g. login required) says nothing about this
        # user, so keep them in the CSV and stop: further requests would
        # fail the same way and must not wipe the candidate list.
        print(f'Stopping early: {ERROR}')
        break

In [None]:
# One feature row per account; `columns=features` keeps only the model
# columns and drops every other key of the processed record.
fakedf = pd.DataFrame([process_entries(entry) for entry in fakedata], columns=features)
fakedf['fake']=1
realdf = pd.DataFrame([process_entries(entry) for entry in realdata], columns=features)
realdf['fake']=0

# Combine both classes and shuffle. The fixed random_state makes the row
# order, and everything computed from it, reproducible across runs.
df = pd.concat([fakedf, realdf], ignore_index=True)
df = df.sample(frac=1, random_state=42).reset_index(drop=True)
df.head()

## Oversampling imbalanced data

In [None]:
from imblearn.over_sampling import SMOTE
# Oversample the data so both classes have the same number of rows.
# random_state is fixed so the synthetic rows are the same on every run.
X, y = SMOTE(random_state=42).fit_resample(df.drop(columns=['fake']), df['fake'])

## Model

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_validate, cross_val_score
from imblearn.pipeline import make_pipeline

# Metrics reported per fold. Both precision and recall are macro-averaged;
# the recall key was previously mislabeled 'rec_micro'.
scoring = {'acc': 'accuracy',
           'prec_macro': 'precision_macro',
           'rec_macro': 'recall_macro'}

# Evaluate on the original rows and oversample inside each training fold only.
# Cross-validating the already-oversampled X, y lets synthetic neighbours of
# training rows land in the validation fold, which inflates every score.
cv_estimator = make_pipeline(SMOTE(random_state=42),
                             RandomForestClassifier(random_state=42))
cross_validate(cv_estimator, df.drop(columns=['fake']), df['fake'],
               cv=5, scoring=scoring)

In [None]:
import pickle

# Train the final classifier on the oversampled data. random_state is fixed
# so retraining produces the same forest.
model=RandomForestClassifier(random_state=42)
model.fit(X, y)

# Save the fitted model to the file 'model'. The `with` block closes the
# file, which guarantees the pickle is completely written.
with open('model', 'wb') as model_file:
    pickle.dump(model, model_file)

## Test on real data

In [None]:
def predict(username):
    """Score one Instagram account with the trained model.

    Returns the second column of ``model.predict_proba`` for the account
    (label 1 marks fake rows in the training frame), or the sentinel ``404``
    when ``get_info`` returns ``None``.
    """
    np.random.seed(1111)
    profile = get_info(username)
    if profile is not None:
        row = pd.DataFrame([process_entries(profile)], columns=features)
        probabilities = model.predict_proba(row)
        return probabilities[0][1]
    return 404

In [None]:
# Pair each training column with the importance the fitted forest gave it,
# then display the pairs from most to least important.
A = {
    column: importance
    for column, importance in zip(model.feature_names_in_, model.feature_importances_)
}
dict(sorted(A.items(), key=lambda pair: pair[1], reverse=True))

### Run the webapp

In [None]:
# Activate the virtualenv in ./env, then start the Streamlit app in infer.py.
# NOTE(review): `source` is a bash builtin; if `!` runs under a plain POSIX sh
# this line may fail (`. env/bin/activate` is the portable form) — confirm.
! source env/bin/activate && streamlit run infer.py