# QA-MILB-Team-Based-Retention-Model

* Stellar Algo
* Nakiska Rad & Ryan Kazmerik
* August 23, 2021

In [4]:
import psycopg2
import numpy as np
import pandas as pd
import xgboost as xgb
import sys
import warnings

from sklearn.model_selection import train_test_split
from sklearn.metrics import balanced_accuracy_score, roc_auc_score, make_scorer
from sklearn.model_selection import GridSearchCV 
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.metrics import plot_confusion_matrix

if not sys.warnoptions:
    warnings.simplefilter("ignore")

## Connecting to RedShift to get our training and testing dataset

In [5]:
conn=psycopg2.connect(
    dbname = 'datascience',
    host = 'sagemaker.cbpdnejrkweo.us-east-1.redshift.amazonaws.com',
    port = 5439,
    user = 'xerris',
    password='ThisIsDataScience20!!',
    sslmode='require'
)

In [19]:
teamproductyear_id = 58 # need to dynamically populate this from a config json

In [23]:
cur = conn.cursor()

query = f'''
    SELECT 
        teamproductyearid,
        lkupclientid,
        clientcode,
        productgrouping,
        trainseasonyear,
        testseasonyear,
        facttestprevyear 
    FROM 
        ds.productyear_all r 
    WHERE teamproductyearid ={teamproductyear_id} ;
'''

bnew = cur.execute(query)
pnew = cur.fetchall()

df_params = pd.DataFrame(pnew)

cols = ['teamproductyearid','lkupclientid','clientcode','productgrouping','trainseasonyear','testseasonyear','facttestprevyear']

df_params = pd.DataFrame(pnew, columns=cols)
df_params.head()

Unnamed: 0,teamproductyearid,lkupclientid,clientcode,productgrouping,trainseasonyear,testseasonyear,facttestprevyear
0,58,47,okcdodgers,Full Season,2019,2021,


In [24]:
client_id = dfparam3._get_value(0,'lkupclientid')
client_code= dfparam3._get_value(0,'clientcode')
product_grouping =dfparam3._get_value(0,'productgrouping') 
train_season_year =dfparam3._get_value(0,'trainseasonyear') 
test_season_year =dfparam3._get_value(0,'testseasonyear') 

In [26]:
sample_query = f'''
    SELECT 
        r.dimcustomermasterid,
        recency,
        attendancePercent,
        totalSpent,
        distToVenue,
        source_tenure,
        renewedBeforeDays,
        missed_games_1,
        missed_games_2,
        missed_games_over_2,
        isnextyear_buyer
    FROM 
        ds.retentionscoring r 
    WHERE 
        lkupclientid ={client_id} 
    AND 
        productgrouping in({"'"+ str(product_grouping) + "'"})
    AND year<{train_season_year};'''
b=cur.execute(sample_query)
p = cur.fetchall()
df = pd.DataFrame(p)

cols2 = ['dimcustomermasterid','recency','attendancePercent','totalSpent','distToVenue','source_tenure','renewedBeforeDays','missed_games_1','missed_games_2','missed_games_over_2','isnextyear_buyer']

df_features = pd.DataFrame(p, columns=cols2)
df_features.head()

Unnamed: 0,dimcustomermasterid,recency,attendancePercent,totalSpent,distToVenue,source_tenure,renewedBeforeDays,missed_games_1,missed_games_2,missed_games_over_2,isnextyear_buyer
0,282250392,0,0.289286,3360,309.203,380,234,0,1,5,1
1,298633975,1,0.421429,8680,309.203,370,224,4,3,8,1
2,306117522,0,0.303571,3360,5.79,366,220,2,0,6,1
3,306117765,6,0.335714,4340,10.35,379,233,3,3,7,1
4,300597778,0,0.457143,6720,14.07,372,226,1,1,7,1


In [29]:
df_features.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 254 entries, 0 to 253
Data columns (total 11 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   dimcustomermasterid  254 non-null    object
 1   recency              254 non-null    int64 
 2   attendancePercent    254 non-null    object
 3   totalSpent           254 non-null    object
 4   distToVenue          254 non-null    object
 5   source_tenure        254 non-null    int64 
 6   renewedBeforeDays    254 non-null    int64 
 7   missed_games_1       254 non-null    int64 
 8   missed_games_2       254 non-null    int64 
 9   missed_games_over_2  254 non-null    int64 
 10  isnextyear_buyer     254 non-null    int64 
dtypes: int64(7), object(4)
memory usage: 22.0+ KB


In [30]:
df_features['dimcustomermasterid']= pd.to_numeric(df_features['dimcustomermasterid'])
df_features['attendancePercent']= pd.to_numeric(df_features['attendancePercent'])
df_features['totalSpent']= pd.to_numeric(df_features['totalSpent'])
df_features['distToVenue']= pd.to_numeric(df_features['distToVenue'])

In [33]:
X = df_features.drop(['isnextyear_buyer'], axis=1).copy()
X.head()

Unnamed: 0,dimcustomermasterid,recency,attendancePercent,totalSpent,distToVenue,source_tenure,renewedBeforeDays,missed_games_1,missed_games_2,missed_games_over_2
0,282250392,0,0.289286,3360,309.203,380,234,0,1,5
1,298633975,1,0.421429,8680,309.203,370,224,4,3,8
2,306117522,0,0.303571,3360,5.79,366,220,2,0,6
3,306117765,6,0.335714,4340,10.35,379,233,3,3,7
4,300597778,0,0.457143,6720,14.07,372,226,1,1,7


In [34]:
y = df_features['isnextyear_buyer'].copy()
y.head()

0    1
1    1
2    1
3    1
4    1
Name: isnextyear_buyer, dtype: int64

In [37]:
clf= xgb.XGBClassifier(objective='binary:logistic',seed=42,gamma=0.25,lear_rate=0.1,max_depth=6,reg_lambda=20,scale_pos_weight=3,subsample=0.9,colsample_bytree=0.5)
clf.fit(X,y,verbose=True,early_stopping_rounds=10,eval_metric='aucpr',eval_set=[(X,y)])

Parameters: { "lear_rate" } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


[0]	validation_0-aucpr:0.83255
[1]	validation_0-aucpr:0.86710
[2]	validation_0-aucpr:0.86621
[3]	validation_0-aucpr:0.94220
[4]	validation_0-aucpr:0.95205
[5]	validation_0-aucpr:0.95165
[6]	validation_0-aucpr:0.95351
[7]	validation_0-aucpr:0.95267
[8]	validation_0-aucpr:0.96763
[9]	validation_0-aucpr:0.98085
[10]	validation_0-aucpr:0.98158
[11]	validation_0-aucpr:0.98426
[12]	validation_0-aucpr:0.98559
[13]	validation_0-aucpr:0.98594
[14]	validation_0-aucpr:0.98635
[15]	validation_0-aucpr:0.98758
[16]	validation_0-aucpr:0.98840
[17]	validation_0-aucpr:0.98879
[18]	validation_0-aucpr:0.98936
[19]	validation_0-aucpr:0.98964
[20]	validation_0-aucpr:0.98989
[21]	validation_0-aucpr:0.99046
[22]	validation_0-a

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=0.5, gamma=0.25, gpu_id=-1,
              importance_type='gain', interaction_constraints='', lear_rate=0.1,
              learning_rate=0.300000012, max_delta_step=0, max_depth=6,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=100, n_jobs=12, num_parallel_tree=1, random_state=42,
              reg_alpha=0, reg_lambda=20, scale_pos_weight=3, seed=42,
              subsample=0.9, tree_method='exact', validate_parameters=1,
              verbosity=None)

In [38]:
# check Important features
feature_importances_df = pd.DataFrame(
    {"feature": list(X.columns), "importance": clf.feature_importances_}
).sort_values("importance", ascending=False)

# Display
feature_importances_df

Unnamed: 0,feature,importance
4,distToVenue,0.243346
0,dimcustomermasterid,0.148731
9,missed_games_over_2,0.10465
5,source_tenure,0.089571
2,attendancePercent,0.089465
1,recency,0.085414
3,totalSpent,0.07958
6,renewedBeforeDays,0.068579
7,missed_games_1,0.047534
8,missed_games_2,0.04313


In [39]:
feature_importances=feature_importances_df[['feature','importance']]
feature_importances['productgrouping'] = product_grouping
feature_importances=feature_importances[['feature','importance']]
feature_importances.drop([0],axis=0,inplace=True) 
feature_importances

Unnamed: 0,feature,importance
4,distToVenue,0.243346
9,missed_games_over_2,0.10465
5,source_tenure,0.089571
2,attendancePercent,0.089465
1,recency,0.085414
3,totalSpent,0.07958
6,renewedBeforeDays,0.068579
7,missed_games_1,0.047534
8,missed_games_2,0.04313


In [None]:
feature_importance_dict = {}
for ind in feature_importances.index:
     feature_importance_dict[feature_importances['feature'][ind]] = float(feature_importances['importance'][ind])
print(feature_importance_dict)

In [None]:
# from sshtunnel import SSHTunnelForwarder
from sshtunnel import open_tunnel
from pymongo import MongoClient
import ssl
import datetime
SSH_HOST = '3.213.85.2'
SERVER_USER = 'ubuntu'
PRIVATE_KEY = '/Users/stellaralgo/.ssh/qaJump.pem'
    # define ssh tunnel
with open_tunnel(
    (SSH_HOST, 22),
    ssh_username=SERVER_USER,
    ssh_pkey=PRIVATE_KEY,
    remote_bind_address=('qa-docdb.cluster-cv8xdavkwzyq.us-east-1.docdb.amazonaws.com', 27017),
    local_bind_address=('0.0.0.0', 27017)
) as tunnel:
   # tunnel
    connection = MongoClient('mongodb://stellaradmin:Can7jRhPN7z6i4My@localhost:27017', ssl=True,
                             ssl_ca_certs='/Users/stellaralgo/.ssh/rds-combined-ca-bundle.pem', ssl_cert_reqs=ssl.CERT_NONE, retryWrites=False)
   # connection
    #for x in (connection.views.views_meta_data.find_one()):
    db = connection['views']
    collection = db['views_meta_data']
    myquery = { "_id": client_code }
    tenant_doc = collection.find_one(myquery)
    today = datetime.datetime.now()
    if 'date_last_retention_scores' not in tenant_doc:
        tenant_doc['date_last_retention_scores'] = {}
    tenant_doc['date_last_retention_scores']= today        
    collection.update_one(myquery, { '$set': tenant_doc },upsert=True)
 
    if 'attributes_std' not in tenant_doc:
        tenant_doc['attributes_std'] = {}
    tenant_doc['attributes_std'][product_grouping] = feature_importance_dict        
    collection.update_one(myquery, { '$set': tenant_doc },upsert=True)

       # print(x)
#connection.close()
#tunnel.close()
   # tunnel

In [None]:
# Importing the testing data
cur = conn.cursor()
sample_query2 = f'''select r.dimcustomermasterid,recency,attendancePercent,totalSpent,distToVenue,source_tenure,renewedBeforeDays,missed_games_1,missed_games_2,missed_games_over_2,isnextyear_buyer,isnextyear_samepkg_buyer,pkgupgrade_status from ds.retentionscoring r where lkupclientid ={client_id} and productgrouping in({"'"+ str(product_grouping) + "'"}) and year={test_season_year} ;'''
b2=cur.execute(sample_query2)
p2 = cur.fetchall()
df_test2 = pd.DataFrame(p2)
new_columns_test = ['dimcustomermasterid','recency','attendancePercent','totalSpent','distToVenue','source_tenure','renewedBeforeDays','missed_games_1','missed_games_2','missed_games_over_2','isnextyear_buyer','isnextyear_samepkg_buyer','pkgupgrade_status']
df_test = pd.DataFrame(p2,columns=new_columns)
df_test.drop(['isnextyear_samepkg_buyer','pkgupgrade_status'], axis=1, inplace=True)
df_test.head()

In [None]:
df_test.count()

In [None]:
df_test['dimcustomermasterid']= pd.to_numeric(df_test['dimcustomermasterid'])
df_test['attendancePercent']= pd.to_numeric(df_test['attendancePercent'])
df_test['totalSpent']= pd.to_numeric(df_test['totalSpent'])
df_test['distToVenue']= pd.to_numeric(df_test['distToVenue'])

In [None]:
X_test = df_test.drop(['isnextyear_buyer'], axis=1).copy()
X_test.head()

In [None]:
y_pred = clf.predict_proba(X)
#print(y_pred)

In [None]:
# make predictions for test data
y_pred_test = clf.predict_proba(X_test)
# y_pred_proba = clf.predict_proba(X_test)
#y_pred_test.head()

In [None]:
import numpy as np
# Creating the array to convert
array_y_pred_test = np.array(y_pred_test)

In [None]:
# Create the dataframe
df_y_pred_test = pd.DataFrame(array_y_pred_test)
df_y_pred_test.columns = ['nonbuyer','buyer']
#df_y_pred_test

In [None]:
result_test = pd.concat([df_y_pred_test, X_test], axis=1, join="inner")
#result_test

In [None]:
result_test = result_test.drop(['nonbuyer'], axis=1).copy()

In [None]:
result_test['buyer']= pd.to_numeric(result_test['buyer'])

In [None]:
import datetime
today = datetime.datetime.now()
date_time = today.strftime("%m-%d-%Y %H:%M:%S")
print(date_time)

In [None]:
newscors=result_test[['dimcustomermasterid','buyer']]
newscors.columns = ['dimcustomermasterid','buyer_score']
newscors['year'] = test_season_year
newscors['lkupclientid'] = client_id
newscors['productgrouping'] = product_grouping
newscors['insertDate'] = date_time
#newscors

In [None]:
import pyodbc
# connect to SQL Server.
server = '52.44.171.130' 
database = 'datascience' 
username = 'nrad' 
password = 'ThisIsQA123' 
cnxn = pyodbc.connect('DRIVER={ODBC Driver 17 for SQL Server};SERVER='+server+';DATABASE='+database+';UID='+username+';PWD='+ password)
cursor = cnxn.cursor()
# Insert Dataframe into SQL Server:
for index, row in newscors.iterrows():
    cursor.execute("INSERT INTO ds.finalscore (dimcustomermasterid,buyer_score,year,lkupclientid,productgrouping,insertDate) values(" + str(row.dimcustomermasterid) + "," + str(round(row.buyer_score,4))+ ","+ str(row.year) + "," + str(row.lkupclientid)+ ","+"'"+str(row.productgrouping)+"'"+ "," +"'"+str(row.insertDate)+"'" + ")")
cnxn.commit()
cursor.close()

In [None]:
mongoscores=newscors[['dimcustomermasterid','buyer_score']]#dimcustomermasterid
mongoscores['id_tenant']= client_code
mongoscores['productgrouping'] = product_grouping
mongoscores['year'] = test_season_year
mongoscores['insertDate'] = datetime.datetime.now()
mongoscores.columns=['customerNumber','score','id_tenant','productgrouping','year','date']
mongoscores_dict = mongoscores.to_dict(orient='records')
#mongoscores_dict
#mongoscores.to_numpy()

In [None]:
feature_importancesdict=feature_importances.to_dict(orient='records')
feature_importancesdict
# aa = {feature_importancesdict['feature']: feature_importancesdict['importance']}
aa = [{sample_dict['feature']: sample_dict['importance']} for sample_dict in feature_importancesdict]
result = {}
for d in aa:
    result.update(d)
#result

In [None]:
## test
final_list = []
for single_dict in mongoscores_dict:
    temp_dict = {}
    temp_dict2 = {}
    temp_dict['customerNumber']=single_dict['customerNumber']
    temp_dict['id_tenant']=single_dict['id_tenant']
    temp_dict['productgrouping']=single_dict['productgrouping']
    temp_dict['year']=single_dict['year']
    temp_dict2 = {
        'score': single_dict['score'], 
        'date': single_dict['date'],
        'attribute': result
    }
    temp_dict['history']= temp_dict2
    final_list.append(temp_dict)
    
#final_list


In [None]:
# from sshtunnel import SSHTunnelForwarder
from sshtunnel import open_tunnel
from pymongo import MongoClient
import ssl
import datetime
SSH_HOST = '3.213.85.2'
SERVER_USER = 'ubuntu'
PRIVATE_KEY = '/Users/stellaralgo/.ssh/qaJump.pem'

#historydata= result_test[['buyer','insertDate']]
    # define ssh tunnel
with open_tunnel(
    (SSH_HOST, 22),
    ssh_username=SERVER_USER,
    ssh_pkey=PRIVATE_KEY,
    remote_bind_address=('qa-docdb.cluster-cv8xdavkwzyq.us-east-1.docdb.amazonaws.com', 27017),
    local_bind_address=('0.0.0.0', 27017)
) as tunnel:
   # tunnel
    connection = MongoClient('mongodb://stellaradmin:Can7jRhPN7z6i4My@localhost:27017', ssl=True,
                             ssl_ca_certs='/Users/stellaralgo/.ssh/rds-combined-ca-bundle.pem', ssl_cert_reqs=ssl.CERT_NONE, retryWrites=False)
      
    db = connection['views']
    collection = db['scores_retention'] 
    for i in final_list:
        myquery = {"customerNumber": i['customerNumber'],"id_tenant": i['id_tenant'],"product":i['productgrouping'] ,"year":i['year'] }
        tenant_doc = collection.find_one(myquery)
    
        if tenant_doc is None:
            myquery =  {
                "customerNumber": i['customerNumber'],
                "id_tenant": i['id_tenant'],
                "product":i['productgrouping'],
                "year":i['year'],
                "history": [i['history']]
            }
            collection.insert_one(myquery)
    
        else:

            tenant_doc['history'].append(i['history'])
            collection.update_one(myquery, { '$set': tenant_doc },upsert=True)
        
        
        #print(tenant_doc)
            #print(x)
#connection.close()
#tunnel.close()
   # tunnel

In [None]:
feature_importances2= feature_importances
feature_importances2

In [None]:
import datetime
today = datetime.datetime.now()
date_time = today.strftime("%m-%d-%Y %H:%M:%S")
print(date_time)

In [None]:
feature_importances2.at[1,'feature']='Recency'
feature_importances2.at[2,'feature']='Attendance'
feature_importances2.at[3,'feature']='Monetary'
feature_importances2.at[4,'feature']='Distance to Venue'
feature_importances2.at[5,'feature']='Tenure'
feature_importances2.at[6,'feature']='Time to Renew'
feature_importances2.at[7,'feature']='Missed Games Streak 1'
feature_importances2.at[8,'feature']='Missed Games Streak 2'
feature_importances2.at[9,'feature']='Missed Games Streak Over 2'
feature_importances2

In [None]:
feature_importances2['attrank']={1,2,3,4,5,6,7,8,9}
feature_importances2 ['lkupClientId'] = client_id
feature_importances2 ['modelVersnNumber'] = 2
feature_importances2 ['scoreDate'] = date_time
feature_importances2 ['loadId'] = 0
feature_importances2 ['product'] = product_grouping
feature_importances2.columns=['attribute','indexValue','attrank','lkupClientId','modelVersnNumber','scoreDate','loadId','product']
feature_importances2

In [None]:
import pyodbc
# connect to SQL Server.
server = '52.44.171.130' 
database = 'datascience' 
username = 'nrad' 
password = 'ThisIsQA123' 
cnxn = pyodbc.connect('DRIVER={ODBC Driver 17 for SQL Server};SERVER='+server+';DATABASE='+database+';UID='+username+';PWD='+ password)
cursor = cnxn.cursor()
# Insert Dataframe into SQL Server:
# cursor.execute("INSERT INTO dbo.finalscore (dimcustomermasterid,buyer_score,lkupclientid,insertDate) values(1,1,1,null)")
for index, row in feature_importances2.iterrows():
    cursor.execute("INSERT INTO stlrMILB.dw.lkupRetentionAttributeImportance (attribute,product,indexValue,rank,lkupClientId,modelVersnNumber,scoreDate,loadId) values(" + "'" +str(row.attribute)+"'"+","+ "'"+str(product_grouping)+"'" +"," + str(round(row.indexValue,4)) + "," + str(row.attrank)+ ","+ str(row.lkupClientId) + "," + str(row.modelVersnNumber)+ "," + "'"+ str(row.scoreDate)+ "'"+ ","+ str(row.loadId)  + ")")
     #print("INSERT INTO stlrMLS.dw.lkupRetentionAttributeImportance (attribute,product,indexValue,rank,lkupClientId,modelVersnNumber,scoreDate,loadId) values(" + "'" +str(row.attribute)+"'"+","+ "'"+str(product_grouping)+"'" +"," + str(round(row.indexValue,4)) + "," + str(row.attrank)+ ","+ str(row.lkupClientId) + "," + str(row.modelVersnNumber)+ "," + "'"+ str(row.scoreDate)+ "'"+ ","+ str(row.loadId)  + ")")
    #cursor.execute("INSERT INTO stlrMLS.dw.lkupRetentionAttributeImportance (attribute,product,indexValue,rank,lkupClientId,modelVersnNumber,scoreDate,loadId) values(" + "'" +str(row.attribute)+"'"+","+ "'"+str(product_grouping)+"'" +"," + str(round(row.indexValue,4)) + "," + str(row.attrank)+ ","+ str(row.lkupClientId) + "," + str(row.modelVersnNumber)+ "," +str(row.scoreDate) + ","+ str(row.loadId)  + ")")
cnxn.commit()
cursor.close()