## This notebook will test the distributions using the Mann-Whitney U test

### We will be looking at the p-value and how it changes when manually manipulating the distribution

In [19]:
import pandas as pd
import numpy as np

# Connecting to mysql database using sqlalchemy. This allows us to insert and retrieve dataframes with ease
# import mysql.connector
from sqlalchemy import create_engine

# Using an f string to input the user and password
connstring = f'mysql+mysqlconnector://pthielma:pass@127.0.0.1:3306/claims'
engine = create_engine(connstring, echo=False)
dbConnection = engine.connect()

train = pd.read_sql("select * from claims.train_dataset", dbConnection);
test = pd.read_sql("select * from claims.test_dataset", dbConnection);
train.head()

Unnamed: 0,months_as_customer,age,policy_number,policy_bind_date,policy_state,policy_csl,policy_deductable,policy_annual_premium,umbrella_limit,insured_zip,...,witnesses,police_report_available,total_claim_amount,injury_claim,property_claim,vehicle_claim,auto_make,auto_model,auto_year,fraud_reported
0,235,39,651861,07-01-2011,IL,100/300,500,1046.58,4000000,434982,...,1,NO,4950,450,900,3600,Chevrolet,Silverado,2010,N
1,35,35,930032,10-09-2002,IL,100/300,2000,1117.42,0,446158,...,2,NO,53190,5910,11820,35460,Volkswagen,Jetta,1996,N
2,50,44,525862,18-10-2000,OH,250/500,2000,1188.51,0,447469,...,2,NO,61100,6110,12220,42770,Dodge,Neon,2008,N
3,456,62,669800,24-06-2009,OH,250/500,1000,1395.77,0,611651,...,3,NO,66480,5540,11080,49860,Saab,92x,2012,Y
4,150,30,354481,17-11-2004,IN,100/300,1000,1342.02,0,608425,...,2,NO,4500,450,450,3600,Saab,93,1999,N


In [20]:
import plotly.express as px
fig = px.histogram(train, x="age")
fig.show()

In [21]:
fig = px.histogram(test, x="age")
fig.show()

In [24]:
from scipy.stats import mannwhitneyu

u, p = mannwhitneyu(list(train['age']), list(test['age']))
print('test' + str(p))

test0.3533890451351666


### Now, we will attempt to alter the distribution to ensure we can detect change

In [5]:
newage = train.age.apply(lambda x: x-3)

In [6]:
fig = px.histogram(x=newage)
fig.show()

In [7]:
fig = px.histogram(test, x="age")
fig.show()

In [16]:
u, p = mannwhitneyu(list(newage), list(test['age']))
print("{0:.20f}".format(p))

0.00000000002323837972


### Okay! It works with numeric. Lets try on categorical

In [9]:
u, p = mannwhitneyu(list(train['policy_state']), list(test['policy_state']))
print("{0:.5f}".format(p))

0.13367


### Cool, that works well!

#### Now, how to change the distribution for testing.. We will use a count encoder that counts the occurance and replaces with that number

In [10]:
import category_encoders as ce

encode = ce.CountEncoder()
train_code = encode.fit_transform(train)
test_code = encode.transform(test)
train_code.head()

Unnamed: 0,months_as_customer,age,policy_number,policy_bind_date,policy_state,policy_csl,policy_deductable,policy_annual_premium,umbrella_limit,insured_zip,...,witnesses,police_report_available,total_claim_amount,injury_claim,property_claim,vehicle_claim,auto_make,auto_model,auto_year,fraud_reported
0,235,39,651861,1,258,279,500,1046.58,4000000,434982,...,1,276,4950,450,900,3600,64,19,2010,602
1,35,35,930032,1,258,279,2000,1117.42,0,446158,...,2,276,53190,5910,11820,35460,60,31,1996,602
2,50,44,525862,1,281,280,2000,1188.51,0,447469,...,2,276,61100,6110,12220,42770,66,28,2008,602
3,456,62,669800,1,281,280,1000,1395.77,0,611651,...,3,276,66480,5540,11080,49860,62,21,2012,198
4,150,30,354481,1,261,279,1000,1342.02,0,608425,...,2,276,4500,450,450,3600,62,18,1999,602


In [11]:
fig = px.histogram(train_code, x="policy_state")
fig.show()

In [12]:
fig = px.histogram(test_code, x="policy_state")
fig.show()

In [13]:
u, p = mannwhitneyu(list(train_code['policy_state']), list(test_code['policy_state']))
print("{0:.5f}".format(p))

0.13367


### Now we can see that the distributions are numeric for altering and still compare the same

#### Lets alter the train distribution

In [14]:
new = train_code.policy_state.apply(lambda x: x-10)

In [15]:
u, p = mannwhitneyu(list(new), list(test_code['policy_state']))
print("{0:.5f}".format(p))

0.00000


## It looks like we will be able to detect changes in the distributions. This may help us identify when to recalibrate the model!