In [3]:
import pandas as pd
pd.set_option('display.max_columns', 500)
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
import datetime as dt
import statsmodels.stats.api as sms
import scipy 
sns.set_theme(style="darkgrid")

### A physician is evaluating a new diet for her patients with a family history of heart disease. To test the effectiveness of this diet, 16 patients are placed on the diet for 6 months. Their weights and triglyceride levels are measured before and after the study, and the physician wants to know if either set of measurements has changed.

In [5]:
dietstudy = pd.read_csv("dietstudy.csv")

In [6]:
dietstudy.head()

Unnamed: 0,patid,age,gender,tg0,tg1,tg2,tg3,tg4,wgt0,wgt1,wgt2,wgt3,wgt4
0,1,45,Male,180,148,106,113,100,198,196,193,188,192
1,2,56,Male,139,94,119,75,92,237,233,232,228,225
2,3,50,Male,152,185,86,149,118,233,231,229,228,226
3,4,46,Female,112,145,136,149,82,179,181,177,174,172
4,5,64,Male,156,104,157,79,97,219,217,215,213,214


In [7]:
X1, X2 = dietstudy.tg0, dietstudy.tg4
stats.ttest_rel(X1, X2)

Ttest_relResult(statistic=1.2000008533342437, pvalue=0.24874946576903698)

In [8]:
print(X1.mean(),X2.mean())

138.4375 124.375


#### The statistical analysis indicates that a difference of this magnitude, given the sample sizes is likely to occur nearly 25% of times if the experiment is carried out maintaining the same procedure as above. Hence no significant difference in triglyceride levels.

In [9]:
X1, X2 = dietstudy.wgt0, dietstudy.wgt4
stats.ttest_rel(X1, X2)

Ttest_relResult(statistic=11.174521688532522, pvalue=1.137689414996614e-08)

In [10]:
print(X1.mean(),X2.mean())

198.375 190.3125


#### The statistical analysis indicates that a difference of this magnitude, given the sample sizes is highly unlikely to occur. Hence a significant difference in the weights of patients.

---

### An analyst at a department store wants to evaluate a recent credit card promotion. To this end, 500 cardholders were randomly selected. Half received an ad promoting a reduced interest rate on purchases made over the next three months, and half received a standard seasonal ad. Is the promotion effective to increase sales? 

In [20]:
creditpromo = pd.read_csv("creditpromo.csv")

In [21]:
creditpromo.head()

Unnamed: 0,id,insert,dollars
0,148,Standard,2232.771979
1,572,New Promotion,1403.807542
2,973,Standard,2327.092181
3,1096,Standard,1280.030541
4,1541,New Promotion,1513.5632


In [22]:
a = creditpromo.query('insert == "Standard"')['dollars']
b = creditpromo.query('insert == "New Promotion"')['dollars']

In [23]:
creditpromo.groupby(['insert'])['dollars'].describe()

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
insert,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
New Promotion,250.0,1637.499983,356.703169,745.650447,1407.266115,1661.078154,1884.878536,2637.884197
Standard,250.0,1566.389031,346.673047,544.597003,1361.288278,1547.361028,1788.361107,2713.027594


In [24]:
stats.ttest_ind(a,b,equal_var=False,alternative='less')

Ttest_indResult(statistic=-2.260422726464996, pvalue=0.012113174095824498)

#### With a p-value as low we have over 98% evidence against the null hypothesis and in favour of the alternate hypothesis that the New Promotion has been favourable

---

### An experiment is conducted to study the hybrid seed production of bottle gourd under open field conditions. The main aim of the investigation is to compare natural pollination and hand pollination. The data are collected on 10 randomly selected plants from each of natural pollination and hand pollination. The data are collected on fruit weight (kg), seed yield/plant (g) and seedling length (cm). 

#### Is the overall population of Seed yield/plant (g) equals to 200?
#### Test whether the natural pollination and hand pollination under open fieldconditions are equally effective or are significantly different.


In [25]:
pollination = pd.read_csv("pollination.csv")

In [26]:
pollination.head()

Unnamed: 0,Group,Fruit_Wt,Seed_Yield_Plant,Seedling_length
0,Natural,1.85,147.7,16.86
1,Natural,1.86,136.86,16.77
2,Natural,1.83,149.97,16.35
3,Natural,1.89,172.33,18.26
4,Natural,1.8,144.46,17.9


In [27]:
pollination.Seed_Yield_Plant.describe()

count     20.000000
mean     180.803500
std       37.311011
min      136.860000
25%      143.592500
50%      184.915000
75%      214.855000
max      233.840000
Name: Seed_Yield_Plant, dtype: float64

In [28]:
#By the confidence interval approach
data=pollination.Seed_Yield_Plant
stats.t.interval(alpha=0.95, df=len(data)-1, loc=np.mean(data), scale=stats.sem(data)) 

(163.34140928109323, 198.2655907189068)

In [29]:
print(data.mean())
stats.ttest_1samp(data, 200)

180.8035


Ttest_1sampResult(statistic=-2.3009121248548645, pvalue=0.032891040921283025)

#### With over 95% confidence, we can state that the population mean for Seed yield/plant (g) is not equal to 200

---

In [95]:
pollination.groupby(['Group'])[['Fruit_Wt', 'Seed_Yield_Plant', 'Seedling_length']].agg(['mean','std'])

Unnamed: 0_level_0,Fruit_Wt,Fruit_Wt,Seed_Yield_Plant,Seed_Yield_Plant,Seedling_length,Seedling_length
Unnamed: 0_level_1,mean,std,mean,std,mean,std
Group,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
Hand,2.566,0.123756,215.598,11.763712,18.59,0.645532
Natural,1.848,0.034577,146.009,10.496087,17.707,0.888645


In [90]:
pollination.columns.values[1:]

array(['Fruit_Wt', 'Seed_Yield_Plant', 'Seedling_length'], dtype=object)

In [106]:
hand=pollination[pollination.Group == "Hand"].drop(columns=['Group'])
natural=pollination[pollination.Group == "Natural"].drop(columns=['Group'])

In [123]:
pd.DataFrame(stats.ttest_ind(hand, natural,equal_var=False),columns=pollination.columns[1:],index=['statistic','pvalue'])

Unnamed: 0,Fruit_Wt,Seed_Yield_Plant,Seedling_length
statistic,17.66999,13.95826,2.54223
pvalue,4.306871e-09,5.136161e-11,0.021431


#### For all the three parameters we can state with reasonable confidence that  pollination and hand pollination under open fieldconditions produce different results, close to a 100% confidence for the first two parameters and over 97% confidence for the third

---

---

### An electronics firm is developing a new DVD player in response to customer requests. Using a prototype, the marketing team has collected focus data for different age groups viz. Under 25; 25-34; 35-44; 45-54; 55-64; 65 and above. Do you think that consumers of various ages rated the design differently?

In [30]:
dvdplayer = pd.read_csv("dvdplayer.csv")

In [31]:
dvdplayer.head()

Unnamed: 0,agegroup,dvdscore
0,65 and over,38.454803
1,55-64,17.669677
2,65 and over,31.704307
3,65 and over,25.92446
4,Under 25,30.450007


In [33]:
stats.chisquare(dvdplayer.dvdscore)

Power_divergenceResult(statistic=112.36079483752984, pvalue=0.00043562854238649794)

#### The difference in ratings is clearly an affect of the difference in age groups

---

### A survey was conducted among 2800 customers on several demographic characteristics. Working status, sex, age, age-group, race, happiness, no. of child, marital status, educational qualifications, income group etc. had been captured for that purpose.
- Do you think educational qualification is somehow controlling the marital status? 
- Is happiness is driven by earnings or marital status?

In [34]:
sample_survey = pd.read_csv("sample_survey.csv")

In [35]:
sample_survey.head()

Unnamed: 0,id,wrkstat,marital,childs,age,educ,paeduc,maeduc,speduc,degree,sex,race,born,parborn,granborn,income,rincome,polviews,cappun,postlife,happy,hapmar,owngun,news,tvhours,howpaid,ethnic,eth1,eth2,eth3,confinan,conbus,coneduc,conpress,conmedic,contv,agecat,childcat,news1,news2,news3,news4,news5,car1,car2,car3
0,1,Working full time,Divorced,2.0,60.0,12.0,12.0,12.0,,High school,Male,White,Yes,Both in U.S.,0.0,$25000 or more,$25000 or more,Moderate,Favor,Yes,Pretty happy,,No,,2.0,Weekly wage,Scotland,,American Indian,Germany,Only some,Only some,Only some,Hardly any,Only some,Only some,55 to 64,1-2,No,No,No,No,No,American,Japanese,Japanese
1,2,Working part-time,Never married,0.0,27.0,17.0,20.0,,,Junior college,Female,White,Yes,Both in U.S.,0.0,$15000 - 19999,$15000 - 19999,Liberal,Oppose,,Pretty happy,,,Less than once a week,6.0,Hourly wage,,Norway,Denmark,Scotland,Hardly any,Only some,Only some,Only some,A great deal,A great deal,25 to 34,,No,No,Yes,No,No,American,German,Japanese
2,3,Working full time,Married,2.0,36.0,12.0,12.0,12.0,16.0,High school,Male,White,Yes,Both in U.S.,0.0,$25000 or more,$25000 or more,Conservative,,Yes,Very happy,Very happy,,Less than once a week,1.0,,England and Wales,England and Wales,,,A great deal,Only some,Hardly any,Hardly any,A great deal,Hardly any,35 to 44,1-2,No,No,No,Yes,Yes,American,American,
3,4,Working full time,Never married,0.0,21.0,13.0,,12.0,,High school,Male,White,Yes,Both in U.S.,1.0,$15000 - 19999,$15000 - 19999,Liberal,Oppose,Yes,Very happy,,No,Less than once a week,2.0,,French Canada,French Canada,,,,,,,,,Less than 25,,No,No,No,Yes,Yes,American,Other,
4,5,Working full time,Never married,0.0,35.0,16.0,,12.0,,Bachelor,Female,White,Yes,Neither in U.S.,4.0,$25000 or more,$25000 or more,Moderate,Favor,,Pretty happy,,No,,,Annual salary,French Canada,French Canada,Ireland,,Only some,Only some,Only some,Only some,Only some,Only some,35 to 44,,No,No,No,No,No,American,American,Korean


In [36]:
sample_survey.describe()

Unnamed: 0,id,childs,age,educ,paeduc,maeduc,speduc,granborn,tvhours
count,2832.0,2825.0,2828.0,2820.0,2041.0,2399.0,1311.0,2630.0,2337.0
mean,1416.5,1.821593,45.556931,13.250709,11.341009,11.459775,13.440122,1.109506,2.858365
std,817.672306,1.689838,17.100132,2.927512,4.185096,3.481026,2.859436,1.573659,2.246965
min,1.0,0.0,18.0,0.0,0.0,0.0,2.0,0.0,0.0
25%,708.75,0.0,32.0,12.0,8.0,10.0,12.0,0.0,1.0
50%,1416.5,2.0,42.0,13.0,12.0,12.0,13.0,0.0,2.0
75%,2124.25,3.0,57.0,16.0,14.0,13.0,16.0,2.0,4.0
max,2832.0,8.0,89.0,20.0,20.0,20.0,20.0,4.0,21.0


In [37]:
df=pd.crosstab(sample_survey.degree,sample_survey.marital)
df

marital,Divorced,Married,Never married,Separated,Widowed
degree,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Bachelor,58,251,129,12,28
Graduate,29,123,41,3,9
High school,241,686,367,58,148
Junior college,45,108,46,3,6
LT High school,70,174,77,17,92


### The contingency table for carrying out the chi Squared test:

In [38]:
pd.DataFrame(stats.contingency.expected_freq(df),columns=df.columns,index=df.index)

marital,Divorced,Married,Never married,Separated,Widowed
degree,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Bachelor,75.063453,227.393123,111.832683,15.758242,47.952499
Graduate,32.192485,97.522155,47.961716,6.758242,20.565402
High school,235.554768,713.576746,350.939383,49.450549,150.478554
Junior college,32.663594,98.949309,48.663594,6.857143,20.866359
LT High school,67.5257,204.558667,100.602623,14.175824,43.137185


In [39]:
stats.chi2_contingency(df)[1]

1.6707923432360119e-18

#### Cleary the the marital status is affected by the educational qualification

In [40]:
df=pd.crosstab(sample_survey.income,sample_survey.happy)
df

happy,Not too happy,Pretty happy,Very happy
income,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
$1000 TO 2999,7,20,5
$10000 - 14999,39,107,44
$15000 - 19999,33,119,26
$20000 - 24999,40,155,50
$25000 or more,113,888,571
$3000 TO 3999,9,11,4
$4000 TO 4999,9,13,10
$5000 TO 5999,6,18,11
$6000 TO 6999,14,13,6
$7000 TO 7999,12,21,14


In [41]:
pd.DataFrame(stats.contingency.expected_freq(df),columns=df.columns,index=df.index)

happy,Not too happy,Pretty happy,Very happy
income,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
$1000 TO 2999,3.895204,18.160419,9.944377
$10000 - 14999,23.127771,107.827489,59.04474
$15000 - 19999,21.66707,101.017332,55.315599
$20000 - 24999,29.822652,139.040709,76.136638
$25000 or more,191.351874,892.130593,488.517533
$3000 TO 3999,2.921403,13.620314,7.458283
$4000 TO 4999,3.895204,18.160419,9.944377
$5000 TO 5999,4.260379,19.862958,10.876663
$6000 TO 6999,4.016929,18.727932,10.255139
$7000 TO 7999,5.72108,26.673116,14.605804


In [42]:
stats.chi2_contingency(df)[1]

1.4107677273473057e-26

In [43]:
df=pd.crosstab(sample_survey.marital,sample_survey.happy)
df

happy,Not too happy,Pretty happy,Very happy
marital,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Divorced,72,278,93
Married,71,684,582
Never married,108,426,120
Separated,30,49,13
Widowed,59,137,83


In [44]:
pd.DataFrame(stats.contingency.expected_freq(df),columns=df.columns,index=df.index)

happy,Not too happy,Pretty happy,Very happy
marital,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Divorced,53.69697,248.585383,140.717647
Married,162.060606,750.245276,424.694118
Never married,79.272727,366.986096,207.741176
Separated,11.151515,51.624955,29.223529
Widowed,33.818182,156.558289,88.623529


In [45]:
stats.chi2_contingency(df)[1]

9.3147261197964e-52

#### Happiness is henceforth also an affect of income and marital status independently