# ***BANA 6350: Quantitative Methods***

**Statistical Experiments and Significance Testing**

University of Dallas, Irving, TX

Credits: "Practical Statistics for Data Scientists" 2nd Edition


## **Mounting Google Drive**

In [None]:
# Attach Google Drive to the Colab runtime under /content/gdrive/.
# force_remount=True is passed so the cell can be re-run on an existing mount.

from google.colab import drive

drive.mount('/content/gdrive/', force_remount=True)

In [None]:
# Point the notebook at the course data folder on Google Drive.
# In the Colab file browser, right-click BANA6350 > Data, choose "Copy path",
# and paste the copied path into `path` below.

import os

path = "/content/gdrive/MyDrive/BANA6350/Data"

# Make that folder the current working directory, so later cells can open
# data files by bare file name.
os.chdir(path)


In [None]:
# Sanity check: read a file from the current working directory and show
# its first five rows.

import pandas as pd

pd.read_csv('state.csv').head(n=5)

## **Setting up formatting**

In [None]:
import matplotlib.pyplot as plt

# Apply the style sheet FIRST. The 'fivethirtyeight' style defines its own
# lines.linewidth and font.size, so calling plt.style.use() after assigning
# rcParams would silently overwrite those assignments.
# Style gallery: https://matplotlib.org/stable/gallery/style_sheets/style_sheets_reference.html
plt.style.use('fivethirtyeight')       # any name listed in plt.style.available can be used here

# Notebook-wide overrides, layered on top of the style sheet
plt.rcParams['lines.linewidth'] = 3
plt.rcParams['figure.figsize'] = [14.0, 6.0]
plt.rcParams['font.size'] = 18

# Last expression of the cell, so the list of available style names is displayed
plt.style.available

## **Importing Commonly used python packages:**

In [None]:
# import some standard packages

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# below are additional libraries for today's lecture

from scipy import stats
import random
import statsmodels.formula.api as smf
import statsmodels.api as sm

In [None]:
# Load the digit-frequency table from the working directory and display it.
imanishi = pd.read_csv(filepath_or_buffer='imanishi_data.csv')
imanishi

In [None]:
# Column headers may carry leading/trailing spaces; str.strip removes
# whitespace at both ends of each name so columns can be selected by clean names.
imanishi.columns = list(map(str.strip, imanishi.columns))
imanishi.columns

In [None]:
# Visualization: bar chart of the 'Frequency' column against 'Digit'
ax = imanishi.plot.bar(
    x='Digit',
    y=['Frequency'],
    legend=False,
    figsize=(8, 8),
)
ax.set(xlabel='Digit', ylabel='Frequency')
plt.tight_layout()
plt.show()

In [None]:
# Data creation: one row per digit -> [digit label, hypothesised probability, observed count]
#
# The hypothesised distribution is uniform over the ten digits, so each digit
# has probability 1/10 = 0.1 and the probabilities sum to 1.
# (The previous value 0.315 summed to 3.15, which is not a valid distribution
# and inflated every expected frequency from 31.5 to 99.225.)
observed_counts = [14, 71, 7, 65, 23, 19, 12, 45, 53, 6]   # counts for digits 0..9, total = 315
uniform_prob = 1 / len(observed_counts)

data = [[str(digit), uniform_prob, count] for digit, count in enumerate(observed_counts)]
data

In [None]:
# Tabulate the rows of `data` with one named column per field.
column_names = ['Digit', 'Prob_Dist', 'Observed Frequency']
df = pd.DataFrame(data=data, columns=column_names)
df

In [None]:
# Expected count per digit = total number of observations x that digit's probability
total_observed = df['Observed Frequency'].sum()
df['expected_freq'] = total_observed * df['Prob_Dist']
df

In [None]:
# Significance level used by the decision rule
alpha = 0.05


# Calculation of the chi-square statistic:
#     chi_square = sum over rows of (O - E)^2 / E
# where O is the observed frequency and E the expected frequency.
# Computed column-wise so it does not depend on the DataFrame's index labels
# being 0..n-1 (the row-by-row df.loc[i, ...] loop did).
observed = df['Observed Frequency']
expected = df['expected_freq']
chi_square = ((observed - expected) ** 2 / expected).sum()

In [None]:
# The p-value approach (chi-square goodness-of-fit test)

# Null Hypothesis (H0):      the observed digit frequencies match the expected frequencies
#                            (no statistical evidence that Imanishi-Kari fabricated data)
# Alternate Hypothesis (H1): the observed digit frequencies differ from the expected frequencies
#                            (a digit pattern that would be consistent with fabricated data)

# p value is <= alpha, Reject the null hypothesis
# p value is > alpha,  fail to reject the null hypothesis


print("\n--------------------------------------------------------------------------------------")
print("Approach 1: The p-value approach to hypothesis testing in the decision rule")

# Degrees of freedom = number of categories - 1
degrees_of_freedom = df['Digit'].nunique() - 1

# Upper-tail probability P(X >= chi_square). The survival function is used
# instead of 1 - cdf because the subtraction cancels to exactly 0.0 when the
# statistic is large, whereas sf keeps the small tail probability.
p_value = stats.chi2.sf(chi_square, degrees_of_freedom)

conclusion = "Failed to reject the null hypothesis."
if p_value <= alpha:
    conclusion = "Null Hypothesis is rejected."

print("chisquare-score is:", chi_square, " and p value is:", p_value)
print(conclusion)
print("\n--------------------------------------------------------------------------------------")


# Conclusion: for the digit counts used in this notebook the p-value is far below alpha,
# so the null hypothesis is rejected at the 5% significance level. The observed digit
# frequencies differ significantly from the expected frequencies.