In [8]:
import numpy as np
from scipy import stats
import pandas as pd


In [3]:
# Generate some sample data
data = np.random.normal(loc=0, scale=1, size=1000)

# Basic statistics
mean = np.mean(data)
median = np.median(data)
std_dev = np.std(data)
variance = np.var(data)

print("Basic Statistics:")
print("Mean:", mean)
print("Median:", median)
print("Standard Deviation:", std_dev)
print("Variance:", variance)

Basic Statistics:
Mean: 0.003551381058324154
Median: 0.02092422023469755
Standard Deviation: 1.002257798850635
Variance: 1.00452069535692


In [4]:
# Hypothesis testing (t-test)
# Example: testing whether the mean of data is significantly different from 0
t_statistic, p_value = stats.ttest_1samp(data, 0)
print("\nHypothesis Testing:")
print("T-statistic:", t_statistic)
print("P-value:", p_value)


Hypothesis Testing:
T-statistic: 0.11199550021317141
P-value: 0.9108494727539143


In [5]:
# Correlation
# Example: calculating Pearson correlation coefficient between two sets of data
x = np.random.normal(loc=0, scale=1, size=100)
y = np.random.normal(loc=0, scale=1, size=100)
correlation_coef, p_value = stats.pearsonr(x, y)
print("\nCorrelation:")
print("Pearson correlation coefficient:", correlation_coef)
print("P-value:", p_value)



Correlation:
Pearson correlation coefficient: 0.14002932159007497
P-value: 0.16466717949271303


In [6]:
# Linear regression
# Example: fitting a linear regression model to some data
x = np.random.rand(100) * 10
y = 2 * x + np.random.normal(loc=0, scale=1, size=100)
slope, intercept, r_value, p_value, std_err = stats.linregress(x, y)
print("\nLinear Regression:")
print("Slope:", slope)
print("Intercept:", intercept)
print("R-squared:", r_value ** 2)
print("P-value:", p_value)


Linear Regression:
Slope: 1.971391183376995
Intercept: 0.09909919762495356
R-squared: 0.971461953361822
P-value: 1.6872004664056094e-77


In [40]:
# Mann-Whitney U Test:
from scipy.stats import mannwhitneyu

# Example data
group1 = [1, 2, 3, 4, 5]
group2 = [6, 7, 8, 9, 10]

# Perform Mann-Whitney U test
statistic, p_value = mannwhitneyu(group1, group2)

print("Mann-Whitney U Test:")
print("U Statistic:", statistic)
print("P-value:", p_value)


Mann-Whitney U Test:
U Statistic: 0.0
P-value: 0.007936507936507936


In [41]:
# Kruskal-Wallis Test:
from scipy.stats import kruskal

# Example data
group1 = [1, 2, 3, 4, 5]
group2 = [6, 7, 8, 9, 10]
group3 = [11, 12, 13, 14, 15]

# Perform Kruskal-Wallis test
statistic, p_value = kruskal(group1, group2, group3)

print("Kruskal-Wallis Test:")
print("H Statistic:", statistic)
print("P-value:", p_value)


Kruskal-Wallis Test:
H Statistic: 12.5
P-value: 0.0019304541362277095


In [42]:
# Wilcoxon Signed-Rank Test:
from scipy.stats import wilcoxon

# Example data
before = [1, 2, 3, 4, 5]
after = [2, 3, 4, 5, 6]

# Perform Wilcoxon signed-rank test
statistic, p_value = wilcoxon(before, after)

print("Wilcoxon Signed-Rank Test:")
print("Test Statistic:", statistic)
print("P-value:", p_value)


Wilcoxon Signed-Rank Test:
Test Statistic: 0.0
P-value: 0.0625


In [43]:
# Pearson's Correlation:
from scipy.stats import pearsonr

# Example data
x = [1, 2, 3, 4, 5]
y = [2, 4, 6, 8, 10]

# Calculate Pearson correlation coefficient and p-value
correlation_coef, p_value = pearsonr(x, y)

print("Pearson's Correlation:")
print("Correlation Coefficient:", correlation_coef)
print("P-value:", p_value)


Pearson's Correlation:
Correlation Coefficient: 1.0
P-value: 0.0


In [44]:
# Spearman's Correlation:
from scipy.stats import spearmanr

# Example data
x = [1, 2, 3, 4, 5]
y = [2, 4, 6, 8, 10]

# Calculate Spearman correlation coefficient and p-value
correlation_coef, p_value = spearmanr(x, y)

print("Spearman's Correlation:")
print("Correlation Coefficient:", correlation_coef)
print("P-value:", p_value)


Spearman's Correlation:
Correlation Coefficient: 0.9999999999999999
P-value: 1.4042654220543672e-24


In [45]:
# One-Way ANOVA:
from scipy.stats import f_oneway

# Example data
group1 = [1, 2, 3, 4, 5]
group2 = [6, 7, 8, 9, 10]
group3 = [11, 12, 13, 14, 15]

# Perform one-way ANOVA
statistic, p_value = f_oneway(group1, group2, group3)

print("One-Way ANOVA:")
print("F Statistic:", statistic)
print("P-value:", p_value)


One-Way ANOVA:
F Statistic: 50.0
P-value: 1.5127924217375409e-06


In [50]:
# Two-Way ANOVA:
import statsmodels.api as sm
from statsmodels.formula.api import ols

# Example data
data = sm.datasets.get_rdataset("ToothGrowth").data

# Perform two-way ANOVA
model = ols('len ~ C(supp) + C(dose) + C(supp):C(dose)', data=data).fit()
anova_table = sm.stats.anova_lm(model, typ=2)

print("Two-Way ANOVA:")
print(anova_table)


Two-Way ANOVA:
                      sum_sq    df          F        PR(>F)
C(supp)           205.350000   1.0  15.571979  2.311828e-04
C(dose)          2426.434333   2.0  91.999965  4.046291e-18
C(supp):C(dose)   108.319000   2.0   4.106991  2.186027e-02
Residual          712.106000  54.0        NaN           NaN


In [49]:
# Repeated Measures ANOVA:
import statsmodels.api as sm
from statsmodels.formula.api import mixedlm

# Example data
data = sm.datasets.get_rdataset("dietox", "geepack").data

# Perform repeated measures ANOVA
model = mixedlm("Weight ~ Time", data, groups=data["Pig"], re_formula="~Time")
result = model.fit()

print("Repeated Measures ANOVA:")
print(result.summary())




Repeated Measures ANOVA:
           Mixed Linear Model Regression Results
Model:             MixedLM  Dependent Variable:  Weight    
No. Observations:  861      Method:              REML      
No. Groups:        72       Scale:               5.7891    
Min. group size:   11       Log-Likelihood:      -2220.3890
Max. group size:   12       Converged:           No        
Mean group size:   12.0                                    
-----------------------------------------------------------
                 Coef.  Std.Err.   z    P>|z| [0.025 0.975]
-----------------------------------------------------------
Intercept        15.739    0.672 23.438 0.000 14.423 17.055
Time              6.939    0.085 81.326 0.000  6.772  7.106
Group Var        30.266    4.271                           
Group x Time Cov  0.746    0.304                           
Time Var          0.483    0.046                           



In [51]:
# Mixed Model ANOVA:
import statsmodels.api as sm
from statsmodels.formula.api import mixedlm

# Example data
data = sm.datasets.get_rdataset("dietox", "geepack").data

# Perform mixed model ANOVA
model = mixedlm("Weight ~ Time + Diet", data, groups=data["Pig"], re_formula="~Time")
result = model.fit()

print("Mixed Model ANOVA:")
print(result.summary())


PatsyError: Error evaluating factor: NameError: name 'Diet' is not defined
    Weight ~ Time + Diet
                    ^^^^

In [52]:
# Logistic Regression:
import statsmodels.api as sm

# Example data
data = sm.datasets.get_rdataset("iris").data

# Perform logistic regression
model = sm.Logit.from_formula("Species=='setosa' ~ Sepal.Length + Sepal.Width", data)
result = model.fit()

print("Logistic Regression:")
print(result.summary())


PatsyError: Error evaluating factor: NameError: name 'Sepal' is not defined
    Species=='setosa' ~ Sepal.Length + Sepal.Width
                                       ^^^^^^^^^^^

In [53]:
# Multiple Linear Regression:
import statsmodels.api as sm

# Example data
data = sm.datasets.get_rdataset("mtcars").data

# Perform multiple linear regression
X = data[["mpg", "hp"]]
y = data["wt"]
X = sm.add_constant(X)
model = sm.OLS(y, X).fit()

print("Multiple Linear Regression:")
print(model.summary())


Multiple Linear Regression:
                            OLS Regression Results                            
Dep. Variable:                     wt   R-squared:                       0.753
Model:                            OLS   Adj. R-squared:                  0.736
Method:                 Least Squares   F-statistic:                     44.29
Date:                Thu, 16 May 2024   Prob (F-statistic):           1.53e-09
Time:                        16:09:04   Log-Likelihood:                -21.803
No. Observations:                  32   AIC:                             49.61
Df Residuals:                      29   BIC:                             54.00
Df Model:                           2                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const          6.2183   

In [54]:
# Paired Samples t-test:
from scipy.stats import ttest_rel

# Example data
before = [1, 2, 3, 4, 5]
after = [2, 3, 4, 5, 6]

# Perform paired samples t-test
t_statistic, p_value = ttest_rel(before, after)

print("Paired Samples t-test:")
print("T-statistic:", t_statistic)
print("P-value:", p_value)


Paired Samples t-test:
T-statistic: -inf
P-value: 0.0


  res = hypotest_fun_out(*samples, **kwds)


In [55]:
# Independent Samples t-test:
from scipy.stats import ttest_ind

# Example data
group1 = [1, 2, 3, 4, 5]
group2 = [6, 7, 8, 9, 10]

# Perform independent samples t-test
t_statistic, p_value = ttest_ind(group1, group2)

print("Independent Samples t-test:")
print("T-statistic:", t_statistic)
print("P-value:", p_value)


Independent Samples t-test:
T-statistic: -5.0
P-value: 0.001052825793366539


In [23]:
train = pd.read_csv('train.csv')
train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [32]:
#t-test
t_statistic, p_value = stats.ttest_1samp(train.Age, 0)
print('T-test results')
print('T statistic',t_statistic)
print('P value', p_value)


T-test results
T statistic nan
P value nan


In [31]:
np.mean(train.Age)

29.69911764705882

In [39]:
import numpy as np
from scipy.stats import chi2_contingency

observed = np.array(train.groupby(['Sex', 'Survived'])[train.columns[1:]].count())
# Perform chi-square test
chi2_stat, p_val, dof, expected = chi2_contingency(observed)

# Print results
print("Chi-square statistic:", chi2_stat)
print("P-value:", p_val)
print("Degrees of Freedom:", dof)



Chi-square statistic: 68.74587220508465
P-value: 7.132361638233814e-05
Degrees of Freedom: 30


In [35]:
import numpy as np
from scipy.stats import chi2_contingency

# Example contingency table
observed = np.array([[10, 20, 30],
                     [6,  9,  17]])

# Perform chi-square test
chi2_stat, p_val, dof, expected = chi2_contingency(observed)

# Print results
print("Chi-square statistic:", chi2_stat)
print("P-value:", p_val)
print("Degrees of Freedom:", dof)
print("Expected frequencies table:")
print(expected)


Chi-square statistic: 0.27157465150403504
P-value: 0.873028283380073
Degrees of Freedom: 2
Expected frequencies table:
[[10.43478261 18.91304348 30.65217391]
 [ 5.56521739 10.08695652 16.34782609]]
