In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import re

import scipy.stats as stats

import matplotlib.pyplot as plt
import seaborn as sns
# Input data files are available in the "../input/" directory.
# List that directory so the available file names show up in the cell output.
# This shells out to `ls`, so an `ls` executable must be available on PATH.
from subprocess import check_output
print(check_output(["ls", "../input"]).decode("utf8"))

%matplotlib inline



**T-Test**

The T-test is a statistical test used to determine whether a numeric data sample differs significantly from the population or whether two samples differ from one another.

**Purpose:**
* It tells us - Is there a significant difference between two sets of data
* It lets us know if those differences could have happened by chance.
* It considers not only the means but also the spread (standard deviation) of the data when concluding how significant the difference between two data sets is.

**T-Test gives two values in Python**

 **T- Score** 
*          The t score is a ratio between the difference between two groups and the difference within the groups.
*           A large t-score tells us that the groups are different.
*           A small t-score tells us that the groups are similar.
*           It tells us how much the sample mean deviates from the **null hypothesis**.


**P- Value**
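 *          The p-value is the probability of obtaining a result at least as extreme as the one observed in the sample data, assuming the null hypothesis is true.
 *          The p-value does **not** tell us how often (or how likely it is that) the null hypothesis is true.
 *          A **low** p-value indicates that the observed data would be unlikely if the null hypothesis were true, i.e. the result is unlikely to be due to chance alone.
 *          For example, a p-value of .01 means that, if the null hypothesis were true, there would be only a 1% probability 
             of seeing a result at least this extreme by chance.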
 *          Usually it is accepted that if the p-value is lower than  **significance level** α (equal to 0.05) ,
             then we should reject the null hypothesis.

**Statistical Hypothesis**

**Null Hypothesis:**
        It assumes there is no statistically significant difference for the population being tested. 
        For instance, if we wanted to test whether the average age of students in a local school differs from the national 
        average, the null hypothesis would be that there is **no difference** between the average ages.

**Alternative Hypothesis:**
        It states that in fact there is significant difference for the population being tested.
        For instance, the alternative hypothesis would be that the average age of students  in a local school does in fact differ 
        from the national average.
      
 **Significance level (α):**
         The significance level is a probability threshold that determines when to reject the null hypothesis. The usually 
         accepted significance level is 0.05. This means we accept a 5% risk of rejecting the null hypothesis when it is actually true (a false positive).
         
 **Types of t-tests**
     There are three main types of t-test:
     
*       An Independent Samples t-test compares the means for two groups.
*       A Paired sample t-test compares means from the same group at different times (say, one year apart).
*       A One sample t-test tests the mean of a single group against a known mean.   


In [None]:
# Read the museums CSV into a pandas DataFrame.
# low_memory=False makes pandas parse the file in one pass instead of in
# chunks, which avoids mixed-dtype inference within a column.
df = pd.read_csv('../input/museums.csv', low_memory=False)
# Preview the first rows of the loaded frame.
df.head()

In [None]:
# Show the dtype pandas inferred for each column.
df.dtypes

In [None]:
# Count how many rows fall under each 'Museum Type' value.
df['Museum Type'].value_counts()

In [None]:
# Add a 'Type' column that separates zoos from every other museum type.
# A row is labelled 'ZOO' when its 'Museum Type' text contains 'ZOO';
# everything else is labelled 'OTHERS'.
# The vectorised .str.contains(..., na=False) replaces a per-row re.search,
# which raises TypeError on a missing (NaN) 'Museum Type'; such rows now
# fall into 'OTHERS' instead of aborting the cell.
df['Type'] = np.where(df['Museum Type'].str.contains('ZOO', na=False), 'ZOO', 'OTHERS')

In [None]:
# Check how many rows ended up in each 'Type' group.
df['Type'].value_counts()

In [None]:
# Cross-check the mapping: row count per ('Museum Type', 'Type') pair.
df.groupby(['Museum Type', 'Type']).size()

In [None]:
# Number of missing 'Revenue' values before imputation.
df['Revenue'].isna().sum()

In [None]:
# Mean impute: replace missing 'Revenue' values with the column mean.
# The result is assigned back to the column rather than calling
# fillna(..., inplace=True) on df['Revenue']. That chained in-place call
# works on an intermediate Series: it emits a FutureWarning on pandas 2.2+
# and leaves df unchanged once Copy-on-Write is active.
df['Revenue'] = df['Revenue'].fillna(df['Revenue'].mean())

In [None]:
# Number of missing 'Revenue' values remaining after imputation.
df['Revenue'].isna().sum()

In [None]:
# Number of rows whose 'Revenue' is exactly zero.
(df['Revenue'] == 0).sum()

In [None]:
# Build the two samples to compare: 'Revenue' of zoos (x) and of all other
# museum types (y).
x = df.loc[df['Type'] == 'ZOO', 'Revenue']
y = df.loc[df['Type'] == 'OTHERS', 'Revenue']

In [None]:
# Keep only strictly positive revenues in both samples.
x = x[x.gt(0)]
y = y[y.gt(0)]

In [None]:
# Quantile-Quantile (QQ) plots using SciPy.
# A QQ plot against the normal distribution is used to check whether the
# data is distributed normally or not.
# Each sample is drawn on its own explicitly titled axes; otherwise both
# panels carry probplot's default title and cannot be told apart.
fig, axes = plt.subplots(1, 2, figsize=(12, 4))
for ax, (label, sample) in zip(axes, [('Zoo', x), ('Others', y)]):
    stats.probplot(sample, dist="norm", plot=ax)
    ax.set_title('QQ plot - {} revenue'.format(label))
fig.tight_layout()

plt.show()

In [None]:
# Apply a log(value + 1) transform to both samples.
# NOTE: this rebinds x and y, so re-running the cell transforms them again.
x = np.log(x + 1)
y = np.log(y + 1)

In [None]:
# QQ plots of the logarithmic (log-transformed) data against the normal
# distribution.
# Each sample is drawn on its own explicitly titled axes; otherwise both
# panels carry probplot's default title and cannot be told apart.
fig, axes = plt.subplots(1, 2, figsize=(12, 4))
for ax, (label, sample) in zip(axes, [('Zoo', x), ('Others', y)]):
    stats.probplot(sample, dist="norm", plot=ax)
    ax.set_title('QQ plot - log({} revenue + 1)'.format(label))
fig.tight_layout()

plt.show()

In [None]:
# Overlay the distributions of the two log-transformed revenue samples.
# Labels and a legend are added so the two curves can be told apart, and the
# x-axis is labelled with the transform that was applied to the data.
# NOTE(review): sns.distplot is deprecated in seaborn >= 0.11; consider
# sns.histplot(..., kde=True, stat="density") once the environment's seaborn
# version is confirmed to provide it.
plt.figure(figsize = (15,7))
sns.distplot(x, label='Zoo')
sns.distplot(y, label='Others')
plt.xlabel('log(Revenue + 1)')
plt.legend();

In [None]:
# Rebuild the two test samples from df under descriptive names:
# select 'Revenue' per group, keep strictly positive values, then apply
# the log(value + 1) transform.
zoo = df.loc[df['Type'] == 'ZOO', 'Revenue']
zoo = np.log(zoo[zoo.gt(0)] + 1)

others = df.loc[df['Type'] == 'OTHERS', 'Revenue']
others = np.log(others[others.gt(0)] + 1)



In [None]:
# Report the mean of each (log-transformed) sample, one per line.
print(
    'Mean for Zoo:  {}'.format(zoo.mean()),
    'Mean for others:  {}'.format(others.mean()),
    sep='\n',
)

In [None]:
# Two-sample t-test: is the revenue of zoos different from that of all
# other types of museums?
# equal_var=False selects Welch's t-test, which does not assume that the
# two populations have equal variance.
stats.ttest_ind(zoo, others, equal_var=False)

The p-value is very low (below the significance level), so we can reject the null hypothesis: the revenues of the two groups are significantly different.

**Try other Types of Test**

**One-Sample T-Test**
A one-sample t-test checks whether a sample mean differs from the population mean. 

In [None]:
# Population sample: every strictly positive 'Revenue' value in df.
revpop = df.loc[df['Revenue'] > 0, 'Revenue']

In [None]:
# Apply the same log(value + 1) transform used for the group samples.
# NOTE: this rebinds revpop, so re-running the cell transforms it again.
revpop = np.log(revpop + 1)

In [None]:
# Report the zoo sample mean next to the mean of the whole population,
# one per line.
print(
    'Mean for Zoo:  {}'.format(zoo.mean()),
    'Mean for All Population:  {}'.format(revpop.mean()),
    sep='\n',
)

In [None]:
# One-sample t-test: compare the zoo sample (first argument) against the
# mean of the whole population (second argument, popmean).
stats.ttest_1samp(zoo, revpop.mean())