In [1]:
import pandas as pd
import numpy as np
pd.options.display.max_columns=None
pd.options.display.max_rows=None

import scipy.stats as stats

In [24]:
df_pop = pd.read_csv(r'application_train.csv\application_train.csv')

In [3]:
df_samp = pd.read_csv('application_train_40k.csv')

In [55]:
df_samp = df_pop.sample(40000)

In [56]:
df_pop.shape

(307511, 121)

In [57]:
df_samp.shape

(40000, 121)

### Remove extra column from sample

In [53]:
for col in df_samp.columns:
    if col not in df_pop.columns:
        print(col)

In [7]:
df_samp.drop('Unnamed: 0',axis=1,inplace=True)

### Remove ID column

In [54]:
df_pop.drop('SK_ID_CURR',axis=1,inplace=True)
df_samp.drop('SK_ID_CURR',axis=1,inplace=True)

KeyError: "['SK_ID_CURR'] not found in axis"

### nuniques and dtypes of Sample and Population

In [58]:
nunique_pop = df_pop.apply(lambda col:col.nunique())
nunique_sam = df_samp.apply(lambda col:col.nunique())

dtype_pop = df_pop.dtypes
dtype_samp = df_samp.dtypes

dx = pd.DataFrame({'Population nuniques':nunique_pop,
                   'Sample nuniques':nunique_sam,
                    'Population Dtype':dtype_pop,
                  'Sample Dtype ':dtype_samp},
                 index = df_pop.columns)
dx

Unnamed: 0,Population nuniques,Sample nuniques,Population Dtype,Sample Dtype
TARGET,2,2,int64,int64
NAME_CONTRACT_TYPE,2,2,object,object
CODE_GENDER,3,2,object,object
FLAG_OWN_CAR,2,2,object,object
FLAG_OWN_REALTY,2,2,object,object
CNT_CHILDREN,15,8,int64,int64
AMT_INCOME_TOTAL,2548,679,float64,float64
AMT_CREDIT,5603,3161,float64,float64
AMT_ANNUITY,13672,8121,float64,float64
AMT_GOODS_PRICE,1002,500,float64,float64


## Statistical tests between Sample and Population

### Numerical columns

In [59]:
def num_sample_pop_test(col,alpha = 0.05):
    pop = df_pop[col]
    samp = df_samp[col]
    
    pop_mean = pop.mean()
    samp_mean = samp.mean()
    
    pop_std = pop.std()
    samp_std = samp.std()
    
    x_bar = samp_mean
    n = len(samp)
    sigma = pop_std

    z_alpha_by_2 = stats.norm.isf(alpha/2)

    upper = x_bar + z_alpha_by_2 * sigma / n**0.5
    lower = x_bar - z_alpha_by_2 * sigma / n**0.5
    #print('95% Conf interval:', lower, upper)

    if lower<=samp_mean<=upper:
        print(f'{col} Sample reperesents Population')
        return (col,1)
    else:
        print(f'{col} Sample and Population are different')
        return (col,0)

In [60]:
num_cols = df_pop.select_dtypes(np.number)
l = []
for col in num_cols:
    l.append(num_sample_pop_test(col))

TARGET Sample reperesents Population
CNT_CHILDREN Sample reperesents Population
AMT_INCOME_TOTAL Sample reperesents Population
AMT_CREDIT Sample reperesents Population
AMT_ANNUITY Sample reperesents Population
AMT_GOODS_PRICE Sample reperesents Population
REGION_POPULATION_RELATIVE Sample reperesents Population
DAYS_BIRTH Sample reperesents Population
DAYS_EMPLOYED Sample reperesents Population
DAYS_REGISTRATION Sample reperesents Population
DAYS_ID_PUBLISH Sample reperesents Population
OWN_CAR_AGE Sample reperesents Population
FLAG_MOBIL Sample reperesents Population
FLAG_EMP_PHONE Sample reperesents Population
FLAG_WORK_PHONE Sample reperesents Population
FLAG_CONT_MOBILE Sample reperesents Population
FLAG_PHONE Sample reperesents Population
FLAG_EMAIL Sample reperesents Population
CNT_FAM_MEMBERS Sample reperesents Population
REGION_RATING_CLIENT Sample reperesents Population
REGION_RATING_CLIENT_W_CITY Sample reperesents Population
HOUR_APPR_PROCESS_START Sample reperesents Populat

In [61]:
[i for i in l if i[1]==0] #all numerical samples are statiscally representing population

[]

### Categorical Columns

In [62]:
def cat_samp_pop_test(col,alpha=0.05):
    observed = df_samp[col].value_counts()
    expected = df_pop[col].value_counts(normalize=True)*np.sum(observed)
    
    cats_not_in_samp = []
    for cat in expected.index:
        if cat not in observed.index:
            cats_not_in_samp.append(cat)
    observed = observed.append(pd.Series([0]*len(cats_not_in_samp),index=cats_not_in_samp,dtype=np.number))
    
    display(pd.DataFrame([observed,expected],index=['observed','expected']))
    
    chi2_stat,p_val = stats.chisquare(f_obs = observed, f_exp = expected)
    
    if p_val > alpha/2:
        print(f'{col} Sample represents the Population')
        return (col,1)
    else:
        print(f'{col} Sample and Population are different')
        return (col,0)

In [63]:
cat_cols = df_pop.select_dtypes(object)
l = []
for col in cat_cols:
    l.append(cat_samp_pop_test(col,alpha=0.1))

Unnamed: 0,Cash loans,Revolving loans
observed,36245.0,3755.0
expected,36191.485833,3808.514167


NAME_CONTRACT_TYPE Sample represents the Population


Unnamed: 0,F,M,XNA
observed,26384.0,13616.0,0.0
expected,26333.757166,13665.722527,0.520307


CODE_GENDER Sample represents the Population


Unnamed: 0,N,Y
observed,26444.0,13556.0
expected,26395.673651,13604.326349


FLAG_OWN_CAR Sample represents the Population


Unnamed: 0,Y,N
observed,27709.0,12291.0
expected,27746.909867,12253.090133


FLAG_OWN_REALTY Sample represents the Population


Unnamed: 0,Unaccompanied,Family,"Spouse, partner",Children,Other_B,Other_A,Group of people
observed,32202.0,5300.0,1497.0,435.0,231.0,110.0,40.0
expected,32313.679719,5220.226162,1478.342461,424.779667,230.137745,112.598467,35.235779


NAME_TYPE_SUITE Sample represents the Population


Unnamed: 0,Working,Commercial associate,Pensioner,State servant,Unemployed,Student,Businessman,Maternity leave
observed,20704.0,9219.0,7308.0,2767.0,1.0,1.0,0.0,0.0
expected,20652.789656,9315.699276,7201.303368,2823.053484,2.861686,2.34138,1.300766,0.650383


NAME_INCOME_TYPE Sample represents the Population


Unnamed: 0,Secondary / secondary special,Higher education,Incomplete higher,Lower secondary,Academic degree
observed,28436.0,9703.0,1344.0,494.0,23.0
expected,28407.56916,9737.928074,1336.797708,496.372487,21.33257


NAME_EDUCATION_TYPE Sample represents the Population


Unnamed: 0,Married,Single / not married,Civil marriage,Separated,Widow,Unknown
observed,25549.0,5949.0,3810.0,2571.0,2121.0,0.0
expected,25551.216054,5911.203176,3873.032184,2571.615324,2092.673108,0.260153


NAME_FAMILY_STATUS Sample represents the Population


Unnamed: 0,House / apartment,With parents,Municipal apartment,Rented apartment,Office apartment,Co-op apartment
observed,35478.0,1885.0,1473.0,652.0,362.0,150.0
expected,35493.754695,1930.337451,1454.647151,634.904117,340.410587,145.945999


NAME_HOUSING_TYPE Sample represents the Population


Unnamed: 0,Laborers,Sales staff,Core staff,Managers,Drivers,High skill tech staff,Accountants,Medicine staff,Security staff,Cooking staff,Cleaning staff,Private service staff,Low-skill Laborers,Waiters/barmen staff,Secretaries,Realty agents,HR staff,IT staff
observed,7099.0,4241.0,3522.0,2783.0,2387.0,1480.0,1215.0,1064.0,885.0,747.0,649.0,337.0,276.0,193.0,167.0,104.0,74.0,70.0
expected,7134.290915,4150.0563,3564.172082,2762.782792,2404.943534,1471.174403,1268.597049,1103.639357,868.871983,768.682162,601.526757,342.843103,270.577155,174.26565,168.706731,97.087168,72.783057,67.999801


OCCUPATION_TYPE Sample represents the Population


Unnamed: 0,TUESDAY,WEDNESDAY,MONDAY,THURSDAY,FRIDAY,SATURDAY,SUNDAY
observed,7136.0,6731.0,6567.0,6520.0,6515.0,4444.0,2087.0
expected,7011.261386,6755.40062,6596.70711,6580.707682,6547.79829,4403.354677,2104.770236


WEEKDAY_APPR_PROCESS_START Sample represents the Population


Unnamed: 0,Business Entity Type 3,XNA,Self-employed,Other,Medicine,Government,Business Entity Type 2,School,Trade: type 7,Kindergarten,Construction,Business Entity Type 1,Transport: type 4,Trade: type 3,Industry: type 9,Security,Industry: type 3,Industry: type 11,Housing,Military,Bank,Agriculture,Police,Postal,Security Ministries,Transport: type 2,Trade: type 2,Restaurant,Services,University,Industry: type 7,Transport: type 3,Industry: type 1,Industry: type 4,Electricity,Hotel,Trade: type 6,Telecom,Industry: type 5,Insurance,Culture,Emergency,Advertising,Industry: type 2,Trade: type 1,Legal Services,Industry: type 12,Realtor,Mobile,Cleaning,Transport: type 1,Industry: type 6,Industry: type 10,Religion,Trade: type 5,Trade: type 4,Industry: type 13,Industry: type 8
observed,8857.0,7308.0,5041.0,2175.0,1404.0,1363.0,1340.0,1201.0,1065.0,851.0,849.0,779.0,760.0,465.0,434.0,414.0,403.0,368.0,349.0,348.0,315.0,313.0,299.0,288.0,272.0,245.0,224.0,223.0,167.0,160.0,160.0,138.0,130.0,127.0,122.0,119.0,89.0,86.0,80.0,78.0,59.0,56.0,55.0,52.0,51.0,48.0,47.0,46.0,39.0,38.0,27.0,19.0,17.0,16.0,8.0,7.0,6.0,0.0
expected,8844.171428,7202.864288,4996.50419,2170.068713,1455.947917,1353.317442,1372.698863,1156.771628,1018.630228,894.927336,874.245149,778.37866,702.153744,454.227654,438.098149,422.358875,426.391251,351.727255,384.766724,342.62189,326.102156,319.208093,304.509432,280.575329,256.771302,286.688931,247.145631,235.568809,204.87072,172.611711,170.010178,154.400981,135.149637,114.07722,123.572815,125.654042,82.078365,75.054226,77.915912,77.655759,49.299049,72.842923,55.802882,59.575105,45.266673,39.673378,47.998283,51.510352,41.234297,33.819928,26.145406,14.568585,14.178355,11.056515,6.373756,8.324905,8.715135,3.12184


ORGANIZATION_TYPE Sample represents the Population


Unnamed: 0,reg oper account,reg oper spec account,org spec account,not specified
observed,9642.0,1514.0,754.0,750.0
expected,9614.546988,1573.123766,731.736957,740.592289


FONDKAPREMONT_MODE Sample represents the Population


Unnamed: 0,block of flats,specific housing,terraced house
observed,19592.0,170.0,161.0
expected,19570.478344,194.92068,157.600976


HOUSETYPE_MODE Sample represents the Population


Unnamed: 0,Panel,"Stone, brick",Block,Wooden,Mixed,Monolithic,Others
observed,8564.0,8481.0,1201.0,675.0,276.0,235.0,205.0
expected,8578.603427,8419.475789,1201.965741,696.524403,298.250658,231.0923,211.087683


WALLSMATERIAL_MODE Sample represents the Population


Unnamed: 0,No,Yes
observed,20745.0,307.0
expected,20749.018621,302.981379


EMERGENCYSTATE_MODE Sample represents the Population


In [64]:
[i for i in l if i[1]==0] #all catogorical are good

[]

### Numerical with 0,1s which represents as Categorical

In [65]:
num_col = df_pop.select_dtypes(np.number).columns
d = dx.loc[num_col]
d = d[d['Population nuniques']<5]
d

Unnamed: 0,Population nuniques,Sample nuniques,Population Dtype,Sample Dtype
TARGET,2,2,int64,int64
FLAG_MOBIL,2,2,int64,int64
FLAG_EMP_PHONE,2,2,int64,int64
FLAG_WORK_PHONE,2,2,int64,int64
FLAG_CONT_MOBILE,2,2,int64,int64
FLAG_PHONE,2,2,int64,int64
FLAG_EMAIL,2,2,int64,int64
REGION_RATING_CLIENT,3,3,int64,int64
REGION_RATING_CLIENT_W_CITY,3,3,int64,int64
REG_REGION_NOT_LIVE_REGION,2,2,int64,int64


In [66]:
num_cat_cols = d.index
#categorical test
l = []
for col in num_cat_cols:
    l.append(cat_samp_pop_test(col))

Unnamed: 0,0,1
observed,36696.0,3304.0
expected,36770.847222,3229.152778


TARGET Sample represents the Population


Unnamed: 0,1,0
observed,39999.0,1.0
expected,39999.869923,0.130077


FLAG_MOBIL Sample and Population are different


Unnamed: 0,1,0
observed,32692.0,7308.0
expected,32795.574792,7204.425208


FLAG_EMP_PHONE Sample represents the Population


Unnamed: 0,0,1
observed,32025.0,7975.0
expected,32025.260885,7974.739115


FLAG_WORK_PHONE Sample represents the Population


Unnamed: 0,1,0
observed,39926.0,74.0
expected,39925.336004,74.663996


FLAG_CONT_MOBILE Sample represents the Population


Unnamed: 0,0,1
observed,28646.0,11354.0
expected,28757.345266,11242.654734


FLAG_PHONE Sample represents the Population


Unnamed: 0,0,1
observed,37740.0,2260.0
expected,37731.203111,2268.796889


FLAG_EMAIL Sample represents the Population


Unnamed: 0,2,3,1
observed,29523.0,6240.0,4237.0
expected,29525.317794,6286.604382,4188.077825


REGION_RATING_CLIENT Sample represents the Population


Unnamed: 0,2,3,1
observed,29820.0,5690.0,4490.0
expected,29850.509413,5705.161767,4444.328821


REGION_RATING_CLIENT_W_CITY Sample represents the Population


Unnamed: 0,0,1
observed,39403.0,597.0
expected,39394.233052,605.766948


REG_REGION_NOT_LIVE_REGION Sample represents the Population


Unnamed: 0,0,1
observed,38009.0,1991.0
expected,37969.243377,2030.756623


REG_REGION_NOT_WORK_REGION Sample represents the Population


Unnamed: 0,0,1
observed,38384.0,1616.0
expected,38373.651674,1626.348326


LIVE_REGION_NOT_WORK_REGION Sample represents the Population


Unnamed: 0,0,1
observed,36828.0,3172.0
expected,36873.087467,3126.912533


REG_CITY_NOT_LIVE_CITY Sample represents the Population


Unnamed: 0,0,1
observed,30768.0,9232.0
expected,30781.85821,9218.14179


REG_CITY_NOT_WORK_CITY Sample represents the Population


Unnamed: 0,0,1
observed,32794.0,7206.0
expected,32817.817899,7182.182101


LIVE_CITY_NOT_WORK_CITY Sample represents the Population


Unnamed: 0,0,1
observed,39998.0,2.0
expected,39998.309004,1.690996


FLAG_DOCUMENT_2 Sample represents the Population


Unnamed: 0,1,0
observed,28529.0,11471.0
expected,28400.935251,11599.064749


FLAG_DOCUMENT_3 Sample represents the Population


Unnamed: 0,0,1
observed,39998.0,2.0
expected,39996.748084,3.251916


FLAG_DOCUMENT_4 Sample represents the Population


Unnamed: 0,0,1
observed,39408.0,592.0
expected,39395.403742,604.596258


FLAG_DOCUMENT_5 Sample represents the Population


Unnamed: 0,0,1
observed,36480.0,3520.0
expected,36477.784535,3522.215465


FLAG_DOCUMENT_6 Sample represents the Population


Unnamed: 0,0,1
observed,39990.0,10.0
expected,39992.325478,7.674522


FLAG_DOCUMENT_7 Sample represents the Population


Unnamed: 0,0,1
observed,36769.0,3231.0
expected,36744.961969,3255.038031


FLAG_DOCUMENT_8 Sample represents the Population


Unnamed: 0,0,1
observed,39853.0,147.0
expected,39844.168176,155.831824


FLAG_DOCUMENT_9 Sample represents the Population


Unnamed: 0,0,1
observed,39997.0,3.0
expected,39999.089463,0.910537


FLAG_DOCUMENT_10 Sample represents the Population


Unnamed: 0,0,1
observed,39835.0,165.0
expected,39843.517793,156.482207


FLAG_DOCUMENT_11 Sample represents the Population


Unnamed: 0,0,1
observed,40000.0,0.0
expected,39999.739847,0.260153


FLAG_DOCUMENT_12 Sample represents the Population


Unnamed: 0,0,1
observed,39855.0,145.0
expected,39858.996914,141.003086


FLAG_DOCUMENT_13 Sample represents the Population


Unnamed: 0,0,1
observed,39889.0,111.0
expected,39882.540787,117.459213


FLAG_DOCUMENT_14 Sample represents the Population


Unnamed: 0,0,1
observed,39945.0,55.0
expected,39951.611487,48.388513


FLAG_DOCUMENT_15 Sample represents the Population


Unnamed: 0,0,1
observed,39593.0,407.0
expected,39602.875995,397.124005


FLAG_DOCUMENT_16 Sample represents the Population


Unnamed: 0,0,1
observed,39987.0,13.0
expected,39989.333715,10.666285


FLAG_DOCUMENT_17 Sample represents the Population


Unnamed: 0,0,1
observed,39674.0,326.0
expected,39674.808381,325.191619


FLAG_DOCUMENT_18 Sample represents the Population


Unnamed: 0,0,1
observed,39984.0,16.0
expected,39976.195973,23.804027


FLAG_DOCUMENT_19 Sample represents the Population


Unnamed: 0,0,1
observed,39979.0,21.0
expected,39979.708043,20.291957


FLAG_DOCUMENT_20 Sample represents the Population


Unnamed: 0,0,1
observed,39990.0,10.0
expected,39986.602105,13.397895


FLAG_DOCUMENT_21 Sample represents the Population


In [67]:
#alpha = 0.05
[i for i in l if i[1]==0] #not good fit samples are:

[('FLAG_MOBIL', 0)]

In [68]:
l = []
for col in num_cat_cols:
    l.append(cat_samp_pop_test(col,alpha=0.1))

Unnamed: 0,0,1
observed,36696.0,3304.0
expected,36770.847222,3229.152778


TARGET Sample represents the Population


Unnamed: 0,1,0
observed,39999.0,1.0
expected,39999.869923,0.130077


FLAG_MOBIL Sample and Population are different


Unnamed: 0,1,0
observed,32692.0,7308.0
expected,32795.574792,7204.425208


FLAG_EMP_PHONE Sample represents the Population


Unnamed: 0,0,1
observed,32025.0,7975.0
expected,32025.260885,7974.739115


FLAG_WORK_PHONE Sample represents the Population


Unnamed: 0,1,0
observed,39926.0,74.0
expected,39925.336004,74.663996


FLAG_CONT_MOBILE Sample represents the Population


Unnamed: 0,0,1
observed,28646.0,11354.0
expected,28757.345266,11242.654734


FLAG_PHONE Sample represents the Population


Unnamed: 0,0,1
observed,37740.0,2260.0
expected,37731.203111,2268.796889


FLAG_EMAIL Sample represents the Population


Unnamed: 0,2,3,1
observed,29523.0,6240.0,4237.0
expected,29525.317794,6286.604382,4188.077825


REGION_RATING_CLIENT Sample represents the Population


Unnamed: 0,2,3,1
observed,29820.0,5690.0,4490.0
expected,29850.509413,5705.161767,4444.328821


REGION_RATING_CLIENT_W_CITY Sample represents the Population


Unnamed: 0,0,1
observed,39403.0,597.0
expected,39394.233052,605.766948


REG_REGION_NOT_LIVE_REGION Sample represents the Population


Unnamed: 0,0,1
observed,38009.0,1991.0
expected,37969.243377,2030.756623


REG_REGION_NOT_WORK_REGION Sample represents the Population


Unnamed: 0,0,1
observed,38384.0,1616.0
expected,38373.651674,1626.348326


LIVE_REGION_NOT_WORK_REGION Sample represents the Population


Unnamed: 0,0,1
observed,36828.0,3172.0
expected,36873.087467,3126.912533


REG_CITY_NOT_LIVE_CITY Sample represents the Population


Unnamed: 0,0,1
observed,30768.0,9232.0
expected,30781.85821,9218.14179


REG_CITY_NOT_WORK_CITY Sample represents the Population


Unnamed: 0,0,1
observed,32794.0,7206.0
expected,32817.817899,7182.182101


LIVE_CITY_NOT_WORK_CITY Sample represents the Population


Unnamed: 0,0,1
observed,39998.0,2.0
expected,39998.309004,1.690996


FLAG_DOCUMENT_2 Sample represents the Population


Unnamed: 0,1,0
observed,28529.0,11471.0
expected,28400.935251,11599.064749


FLAG_DOCUMENT_3 Sample represents the Population


Unnamed: 0,0,1
observed,39998.0,2.0
expected,39996.748084,3.251916


FLAG_DOCUMENT_4 Sample represents the Population


Unnamed: 0,0,1
observed,39408.0,592.0
expected,39395.403742,604.596258


FLAG_DOCUMENT_5 Sample represents the Population


Unnamed: 0,0,1
observed,36480.0,3520.0
expected,36477.784535,3522.215465


FLAG_DOCUMENT_6 Sample represents the Population


Unnamed: 0,0,1
observed,39990.0,10.0
expected,39992.325478,7.674522


FLAG_DOCUMENT_7 Sample represents the Population


Unnamed: 0,0,1
observed,36769.0,3231.0
expected,36744.961969,3255.038031


FLAG_DOCUMENT_8 Sample represents the Population


Unnamed: 0,0,1
observed,39853.0,147.0
expected,39844.168176,155.831824


FLAG_DOCUMENT_9 Sample represents the Population


Unnamed: 0,0,1
observed,39997.0,3.0
expected,39999.089463,0.910537


FLAG_DOCUMENT_10 Sample and Population are different


Unnamed: 0,0,1
observed,39835.0,165.0
expected,39843.517793,156.482207


FLAG_DOCUMENT_11 Sample represents the Population


Unnamed: 0,0,1
observed,40000.0,0.0
expected,39999.739847,0.260153


FLAG_DOCUMENT_12 Sample represents the Population


Unnamed: 0,0,1
observed,39855.0,145.0
expected,39858.996914,141.003086


FLAG_DOCUMENT_13 Sample represents the Population


Unnamed: 0,0,1
observed,39889.0,111.0
expected,39882.540787,117.459213


FLAG_DOCUMENT_14 Sample represents the Population


Unnamed: 0,0,1
observed,39945.0,55.0
expected,39951.611487,48.388513


FLAG_DOCUMENT_15 Sample represents the Population


Unnamed: 0,0,1
observed,39593.0,407.0
expected,39602.875995,397.124005


FLAG_DOCUMENT_16 Sample represents the Population


Unnamed: 0,0,1
observed,39987.0,13.0
expected,39989.333715,10.666285


FLAG_DOCUMENT_17 Sample represents the Population


Unnamed: 0,0,1
observed,39674.0,326.0
expected,39674.808381,325.191619


FLAG_DOCUMENT_18 Sample represents the Population


Unnamed: 0,0,1
observed,39984.0,16.0
expected,39976.195973,23.804027


FLAG_DOCUMENT_19 Sample represents the Population


Unnamed: 0,0,1
observed,39979.0,21.0
expected,39979.708043,20.291957


FLAG_DOCUMENT_20 Sample represents the Population


Unnamed: 0,0,1
observed,39990.0,10.0
expected,39986.602105,13.397895


FLAG_DOCUMENT_21 Sample represents the Population


In [69]:
#alpha = 0.1
diff_cols = [i[0] for i in l if i[1]==0] #not good fit samples 
diff_cols
#Sample and Population proportion for below are different for alpha = 0.1

['FLAG_MOBIL', 'FLAG_DOCUMENT_10']

### Target vs Unbalanced Categorical Samples from above
Chi-Square Test for Independence
This test is used to test whether the categorical variables are independent or not.

𝐻0
: The variables are independent

𝐻1
: The variables are not independent (i.e. variables are dependent)

In [70]:
#lets see whether these columns are dependent or independent of TARGET
def target_vs_cat_test(col,alpha=0.05):
    table = pd.crosstab(df_pop['TARGET'],df_pop[col])
    test_stat, p, dof, expected_value = stats.chi2_contingency(observed = table, correction = False)
    if p < alpha/2:
        print(f'{col} is Dependent/Significant p_val:{p}')
        return (col,0)
    else:
        print(f'{col} is Independent/NotSignificant p_val:{p}')
        return (col,1)

In [71]:
l = []
for col in diff_cols:
    l.append(target_vs_cat_test(col))

FLAG_MOBIL is Independent/NotSignificant p_val:0.7669689262880788
FLAG_DOCUMENT_10 is Independent/NotSignificant p_val:0.4330079310907313


In [72]:
sig_cols = [i[0] for i in l if i[1]==0] #Significant Columns 
sig_cols

[]

#### After taking different random samples and running the statistical tests on these samples, we got a sample which statisfied most of the Statistical tests. Saving it for continuing further as good sample.

In [73]:
#Save the Random sample

In [74]:
df_samp.to_csv('sample_40k_good_fit.csv')