In [None]:
import pandas as pd
import requests
import numpy as np
import json

from sklearn.model_selection import GroupShuffleSplit
import xgboost as xgb
import shap

# The model is trained on all the words from the selected list.
# Other value noted by the original author:
# car_accident_lawyer|personal_injury_lawyer
word_list = "car_accident_lawyer"

# The query string is supplied through `params` so that requests takes care of
# URL-encoding (relevant for values containing "|", "&" or "#").
# NOTE(review): hard-coded IP over plain HTTP; consider moving to configuration.
url = "http://209.126.84.138:80/api/v1/fulldata"
payload = {}
headers = {}
response = requests.request(
    "GET",
    url,
    params={"word_list": word_list, "top_results": 20},
    headers=headers,
    data=payload,
    timeout=30,  # seconds; without a timeout a stalled server blocks the kernel forever
)
# Fail loudly on 4xx/5xx instead of trying to parse an error page as data.
response.raise_for_status()

result_sets = response.json()
if not result_sets:
    raise ValueError(f"API returned no result sets for word_list={word_list!r}")

# The rows of the first result set become the working dataframe.
rows = result_sets[0]["rows"]
df = pd.DataFrame(rows)

In [None]:
#### feature engineering

# Outlier handling: a value further than one standard deviation away from the
# column median is replaced by that median. Rows are kept, only the value changes.
# Series.mask returns a new column with a suitable dtype, so a fractional median
# can be written into an integer column without the incompatible-dtype error that
# in-place `.loc` assignment triggers on recent pandas versions.

# backlinks
median_backlinks = df["backlinks"].median()
std_backlinks = df["backlinks"].std()
outliers_backlinks = (df["backlinks"] - median_backlinks).abs() > std_backlinks
df["backlinks"] = df["backlinks"].mask(outliers_backlinks, median_backlinks)

# refdomains
median_refdomains = df["refdomains"].median()
std_refdomains = df["refdomains"].std()
outliers_refdomains = (df["refdomains"] - median_refdomains).abs() > std_refdomains
df["refdomains"] = df["refdomains"].mask(outliers_refdomains, median_refdomains)

# Columns dropped because they are highly correlated with other parameters
# (per the original author's note).
highly_correlated_columns = [
    "traffic",
    "traffic_value",
    "organic_traffic",
    "organic_keywords",
]

# Columns dropped because they hold the same values (per the original author's note).
same_value_columns = [
    "total_dom_size",
    "page_size",
    "word_count",
    "referring_domains",
]

# A missing column raises KeyError, which surfaces changes in the API schema.
df = df.drop(columns=highly_correlated_columns + same_value_columns)

In [None]:
# Split by city so that every city (query group) lands entirely in either the
# training or the test part; only the first generated split is used.
gss = GroupShuffleSplit(test_size=.30, n_splits=2, random_state=40).split(df, groups=df['city'])
X_train_inds, X_test_inds = next(gss)

# Columns that are identifiers or the target and therefore not model features.
non_feature_columns = ['city', 'rank', 'website']

# XGBRanker interprets `group` as the sizes of consecutive row blocks, so the
# training rows must be contiguous per city and appear in the same order as the
# sizes. groupby() yields cities in sorted order, hence the stable sort by city
# (stable keeps the original within-city row order).
train_data = df.iloc[X_train_inds].sort_values('city', kind='stable')

X_train = train_data.drop(columns=non_feature_columns)
# NOTE(review): XGBoost ranking objectives treat larger labels as more relevant;
# confirm that the encoding of `rank` matches that expectation.
y_train = train_data.loc[:, train_data.columns.isin(['rank'])]

groups = train_data.groupby('city').size().to_numpy()
assert groups.sum() == len(X_train), "group sizes must cover every training row"

test_data = df.iloc[X_test_inds]

X_test = test_data.drop(columns=non_feature_columns)
y_test = test_data.loc[:, test_data.columns.isin(['rank'])]

# NOTE(review): `learning_rate` and `eta` name the same XGBoost parameter but are
# given different values here; keep only one once the intended value is confirmed.
model = xgb.XGBRanker(
  booster='gbtree',
  objective='rank:pairwise',
  random_state=42,
  learning_rate=0.1,
  colsample_bytree=0.9,
  eta=0.05,
  max_depth=10,
  n_estimators=110,
  subsample=0.75
)

model.fit(X_train, y_train, group=groups, verbose=True)

# SHAP values are computed on the held-out cities.
explainer = shap.TreeExplainer(model)
shap_values = explainer.shap_values(X_test)

# dataframe that can be exported in a table:
# one row per feature with its mean absolute SHAP value, largest first
vals = np.abs(shap_values).mean(0)
feature_importance = pd.DataFrame(list(zip(X_test.columns, vals)), columns=['col_name', 'feature_importance_vals'])
feature_importance = feature_importance.sort_values(by=['feature_importance_vals'], ascending=False)

# JSON output from the table to the api endpoint: {feature name: importance}
feature_importance_json = json.loads(feature_importance.to_json(orient='records'))
res = {
    record["col_name"]: record["feature_importance_vals"]
    for record in feature_importance_json
}