# Testing of Hypothesis

In [None]:
import pandas as pd
from scipy import stats
from statsmodels.stats import weightstats as stests

### Test for Single Mean

In [None]:
# One-sample z-test on the "Customers" column of a 100-row random sample,
# with 700 as the hypothesised mean (the `value` argument).
sales_sample = sales_data.sample(n=100, random_state=55)

ztest, pval = stests.ztest(sales_sample["Customers"], x2=None, value=700)
print(ztest)
print("P-Value", float(pval))

# Decision at the 5% significance level.
print("reject null hypothesis" if pval < 0.05 else "accept null hypothesis")

### Test for Two Means

In [None]:
# Two-sample z-test on "Sales": rows with Promo == 0 against rows with
# Promo == 1, taken from a 100-row random sample.
sales_sample = sales_data.sample(n=100, random_state=55)

ztest, pval = stests.ztest(
    sales_sample.loc[sales_sample["Promo"] == 0, "Sales"],
    sales_sample.loc[sales_sample["Promo"] == 1, "Sales"],
)
print(ztest)
print("P-Value", float(pval))

# Decision at the 5% significance level.
print("reject null hypothesis" if pval < 0.05 else "accept null hypothesis")

### Testing Multiple Means

In [None]:
# One-way ANOVA on "Sales" across the store types, using a 300-row sample.
sales_sample = sales_data.sample(n=300, random_state=55)

# Each store type is passed exactly once. The earlier version listed "c"
# twice and never included "a", so the test compared the wrong groups.
F, p = stats.f_oneway(
    *(
        sales_sample.loc[sales_sample["StoreType"] == store_type, "Sales"]
        for store_type in ("a", "b", "c", "d")
    )
)
print("p-value for significance is: ", p)

# Decision at the 5% significance level.
if p < 0.05:
    print("reject null hypothesis")
else:
    print("accept null hypothesis")


In [None]:
# One-way ANOVA on "Customers" for DayOfWeek values 2, 3 and 4.
# Reuses the `sales_sample` drawn in an earlier cell.
F, p = stats.f_oneway(
    sales_sample.loc[sales_sample["DayOfWeek"] == 2, "Customers"],
    sales_sample.loc[sales_sample["DayOfWeek"] == 3, "Customers"],
    sales_sample.loc[sales_sample["DayOfWeek"] == 4, "Customers"],
)
print("p-value for significance is: ", p)

# Decision at the 5% significance level.
print("reject null hypothesis" if p < 0.05 else "accept null hypothesis")

## Information Gain (IG) for a Continuous Target

In [None]:
def IG_Cal_Type2(input_df, target_col):
    """Rank features by the impurity reduction of a one-split regression tree.

    For every column of ``input_df`` other than ``target_col``, a depth-1
    ``DecisionTreeRegressor`` is fitted on that single column and plotted.
    The relative reduction in node impurity achieved by the split is printed
    as a percentage. A table of all features sorted by that percentage
    (largest first) is printed at the end.

    Args:
        input_df: DataFrame holding the candidate feature columns and the
            target column.
        target_col: Name of the target column in ``input_df``.

    Returns:
        None. Results are plotted and printed.
    """
    from sklearn import tree
    import matplotlib.pyplot as plt
    from sklearn.tree import plot_tree

    y = input_df[target_col]
    rows = []

    for col in input_df.drop(target_col, axis=1).columns.values:
        features = [col]
        X = input_df[features]

        # The default criterion is mean squared error in every scikit-learn
        # release. Spelling it "mse" fails on scikit-learn >= 1.2 and
        # "squared_error" fails on < 1.0, so the argument is left out.
        DTree = tree.DecisionTreeRegressor(max_depth=1)
        DTree.fit(X, y)

        plot_tree(DTree, filled=True,
                  rounded=True,
                  impurity=True,
                  feature_names=features)
        plt.show()

        # Impurity (MSE) and weighted sample count per node:
        # index 0 is the root, 1 and 2 are its children.
        impurity = DTree.tree_.impurity
        node_size = DTree.tree_.weighted_n_node_samples

        if DTree.tree_.node_count < 3 or impurity[0] == 0:
            # No split was made (e.g. constant feature) or the target has no
            # variance: nothing can be gained, and the formula below would
            # index a missing node or divide by zero.
            information_gain = 0.0
        else:
            children_impurity = (
                (node_size[1] / node_size[0]) * impurity[1]
                + (node_size[2] / node_size[0]) * impurity[2]
            )
            information_gain = (impurity[0] - children_impurity) / impurity[0]

        print("Information_gain for the feature ", features, " is ", information_gain * 100)
        rows.append({"Variable": col, "IG": information_gain * 100})

    # Build the summary once instead of concatenating inside the loop. The
    # explicit columns keep sort_values working when there are no features.
    result_DF = pd.DataFrame(rows, columns=["Variable", "IG"])

    print("===================\n")
    print(result_DF.sort_values(by="IG", ascending=False))

Rossmann Store Sales Data

In [None]:
# Keep three candidate features plus the "Sales" target, then rank them.
final_data = sales_data[
    [
        "Promo",
        "SchoolHoliday",
        "DayOfWeek",
        "Sales",
    ]
]
IG_Cal_Type2(input_df=final_data, target_col="Sales")

# Categorical(X) vs Categorical(Y)

## Bar Chart for categorical variable comparison

In [None]:
# Draw the "education" counts for rows with y == "no" (red), then draw the
# counts for rows with y == "yes" (green) on the same axes.
sns.countplot(
    y="education",
    data=bank_data.loc[bank_data["y"] == "no"],
    color="red",
)
sns.countplot(
    y="education",
    data=bank_data.loc[bank_data["y"] == "yes"],
    color="green",
)

In [None]:
# One figure per categorical column: counts for y == "no" (red) with the
# counts for y == "yes" (green) drawn on top.
for col in categorical_cols:
    # Start a fresh figure on every iteration, not just the first one.
    plt.figure()
    sns.countplot(y=col, data=bank_data[bank_data["y"] == "no"], color="red")
    sns.countplot(y=col, data=bank_data[bank_data["y"] == "yes"], color="green")
    # Pass a plain string: a list would be rendered as "['<col> Bar plot']".
    plt.title(col + " Bar plot")
    plt.show()


In [None]:
# One figure per categorical column, with bars split by the value of "y".
for col in categorical_cols:
    # Start a fresh figure on every iteration, not just the first one.
    plt.figure()
    sns.countplot(y=col, data=bank_data, hue="y")
    # Pass a plain string: a list would be rendered as "['<col> Bar plot']".
    plt.title(col + " Bar plot")
    plt.show()

## Cross Tables

In [None]:
# Print the contingency table of every categorical column against "y".
for col in categorical_cols:
    table = pd.crosstab(bank_data[col], bank_data["y"])
    print(table)

In [None]:
# Same contingency tables, now with row and column totals (margins=True).
for col in categorical_cols:
    table_with_totals = pd.crosstab(bank_data[col], bank_data["y"], margins=True)
    print(table_with_totals)
    print("============================\n")

## Chi-Square Test of Independence

In [None]:
import scipy.stats as stats

# Seed the sample so the test results are reproducible between runs; the
# other sampling cells in this notebook use random_state=55 as well.
bank_data_sample = bank_data.sample(300, random_state=55)

# Chi-square test of independence between each categorical column and "y".
for col in categorical_cols:
    crosstab = pd.crosstab(bank_data_sample[col], bank_data_sample['y'])
    print(crosstab)
    Chi = stats.chi2_contingency(crosstab)
    # chi2_contingency returns (statistic, p-value, dof, expected).
    p = Chi[1]
    print("p-value for significance is: ", p)
    # Decision at the 5% significance level.
    if p < 0.05:
        print("reject null hypothesis")
    else:
        print("accept null hypothesis")
    print("============================\n")

## Information Gain (IG) for a Categorical Target

In [None]:
# Add a 0/1 copy ("<name>_num") of each yes/no column; values other than
# "no"/"yes" are not in the mapping.
yes_no_codes = {"no": 0, "yes": 1}
for source_col in ("y", "housing", "loan"):
    bank_data[source_col + "_num"] = bank_data[source_col].map(yes_no_codes)

In [None]:
# Rank the two encoded features against the encoded target "y_num".
final_data = bank_data[
    [
        "loan_num",
        "housing_num",
        "y_num",
    ]
]
IG_Cal(final_data, "y_num")

# Multivariate Effect Detection

In [None]:
def Multi_level_Tree(input_df, target_col, levels):
    """Fit, plot and print an entropy-based decision tree classifier.

    All columns of ``input_df`` except ``target_col`` are used as features.
    The fitted tree is drawn with ``plot_tree``. Then the list of
    ``[index, feature name]`` pairs and the ``export_text`` rendering of the
    tree are printed.

    Args:
        input_df: DataFrame holding the feature columns and the target column.
        target_col: Name of the target column in ``input_df``.
        levels: Maximum depth of the tree (passed as ``max_depth``).

    Returns:
        None. Results are plotted and printed.
    """
    from sklearn import tree
    import matplotlib.pyplot as plt
    from sklearn.tree import plot_tree, export_text

    # A plain list is used because plot_tree's parameter validation in
    # scikit-learn 1.3.0 rejects a numpy array / pandas Index as feature_names.
    features = input_df.drop(target_col, axis=1).columns.tolist()
    X = input_df[features]
    y = input_df[target_col]

    DTree = tree.DecisionTreeClassifier(criterion="entropy", max_depth=levels)
    DTree.fit(X, y)

    plt.figure(figsize=(15, 7))
    plot_tree(DTree, filled=True,
              rounded=True,
              impurity=True,
              feature_names=features)
    plt.show()

    # export_text is called without feature names, so print the mapping from
    # column position to name for reading its output.
    print([[i, name] for i, name in enumerate(features)])
    print(export_text(DTree))

In [None]:
Multi_level_Tree(input_df=credit_risk, target_col="SeriousDlqin2yrs", levels=2)

In [None]:
Multi_level_Tree(input_df=credit_risk, target_col="SeriousDlqin2yrs", levels=3)

## Multivariate Interaction Effect for Regression

In [None]:
def Multi_level_Tree_Reg(input_df, target_col, levels):
    """Fit, plot and print a decision tree regressor.

    All columns of ``input_df`` except ``target_col`` are used as features.
    The fitted tree is drawn with ``plot_tree``. Then the list of
    ``[index, feature name]`` pairs and the ``export_text`` rendering of the
    tree are printed.

    Args:
        input_df: DataFrame holding the feature columns and the target column.
        target_col: Name of the target column in ``input_df``.
        levels: Maximum depth of the tree (passed as ``max_depth``).

    Returns:
        None. Results are plotted and printed.
    """
    from sklearn import tree
    import matplotlib.pyplot as plt
    from sklearn.tree import plot_tree, export_text

    # A plain list is used because plot_tree's parameter validation in
    # scikit-learn 1.3.0 rejects a numpy array / pandas Index as feature_names.
    features = input_df.drop(target_col, axis=1).columns.tolist()
    X = input_df[features]
    y = input_df[target_col]

    # The default criterion is mean squared error in every scikit-learn
    # release. Spelling it "mse" fails on scikit-learn >= 1.2 and
    # "squared_error" fails on < 1.0, so the argument is left out.
    DTree = tree.DecisionTreeRegressor(max_depth=levels)
    DTree.fit(X, y)

    plt.figure(figsize=(15, 7))
    plot_tree(DTree, filled=True,
              rounded=True,
              impurity=True,
              feature_names=features)
    plt.show()

    # export_text is called without feature names, so print the mapping from
    # column position to name for reading its output.
    print([[i, name] for i, name in enumerate(features)])
    print(export_text(DTree))

In [None]:
# Print the summary produced by sales_data.info()
# (presumably a pandas DataFrame -- it is created outside this section).
sales_data.info()

In [None]:
# Keep five candidate features plus the "Sales" target, then fit and show a
# depth-2 regression tree on them.
final_data = sales_data[
    [
        "Promo",
        "SchoolHoliday",
        "DayOfWeek",
        "Sales",
        "CompetitionDistance",
        "Promo2",
    ]
]
Multi_level_Tree_Reg(input_df=final_data, target_col="Sales", levels=2)