<a href="https://colab.research.google.com/github/sejallotliker/Machine_learning_practice/blob/main/To_carry_out_anova_for_different_datasets.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

#ANOVA: Analysis of varience

In this notebook ANOVA is carried out for different datasets.

1. Crop dataset

2. Titenic dataset

In [107]:
#To import libraries

import numpy as np
import pandas as pd

from scipy.stats import f

df = pd.read_csv('/content/drive/MyDrive/JNCASR/Sem-4/Intro_to_ML/My_practice_ML/crops.csv')

In [108]:
df.head()

Unnamed: 0,crop_density,fertilizer_types,crop_yield
0,low,standard,177.228692
1,high,standard,177.550041
2,low,standard,176.408462
3,high,standard,177.703625
4,low,standard,177.125486


#ANOVA done manually

In [109]:
#To define a function to do one way anova manually:
def one_way_anova(df, group_col, value_col):
    overall_mean = df[value_col].mean()

    # Between-group sum of squares (SSB)
    ssb = df.groupby(group_col)[value_col].apply(lambda x: len(x) * (x.mean() - overall_mean)**2).sum()

    # Within-group sum of squares (SSW)
    ssw = df.groupby(group_col)[value_col].apply(lambda x: ((x - x.mean())**2).sum()).sum()

    # Degrees of freedom
    df_between = df[group_col].nunique() - 1
    df_within = len(df) - df[group_col].nunique()

    # Mean squares
    ms_between = ssb / df_between
    ms_within = ssw / df_within

    # F-statistic
    f_stat = ms_between / ms_within

    # p-value
    p_value = 1 - f.cdf(f_stat, df_between, df_within)

    return {
        'SSB': ssb,
        'SSW': ssw,
        'DF_between': df_between,
        'DF_within': df_within,
        'MS_between': ms_between,
        'MS_within': ms_within,
        'F-statistic': f_stat,
        'p-value': p_value
    }


In [110]:
# Apply One-Way ANOVA for crop_density and fertilizer_types separately
anova_density = one_way_anova(df = df,
                              group_col = 'crop_density',
                              value_col = 'crop_yield')
anova_fertilizer = one_way_anova(df = df,
                                 group_col = 'fertilizer_types',
                                 value_col = 'crop_yield')

# Compile results into a DataFrame
anova_results_one_way_manual = pd.DataFrame({
    'Source': ['Crop Density', 'Fertilizer Type'],
    'SSB': [anova_density['SSB'], anova_fertilizer['SSB']],
    'SSW': [anova_density['SSW'], anova_fertilizer['SSW']],
    'DF_between': [anova_density['DF_between'], anova_fertilizer['DF_between']],
    'DF_within': [anova_density['DF_within'], anova_fertilizer['DF_within']],
    'MS_between': [anova_density['MS_between'], anova_fertilizer['MS_between']],
    'MS_within': [anova_density['MS_within'], anova_fertilizer['MS_within']],
    'F-statistic': [anova_density['F-statistic'], anova_fertilizer['F-statistic']],
    'p-value': [anova_density['p-value'], anova_fertilizer['p-value']]
})

In [111]:
anova_results_one_way_manual

Unnamed: 0,Source,SSB,SSW,DF_between,DF_within,MS_between,MS_within,F-statistic,p-value
0,Crop Density,5.121681,36.832552,1,94,5.121681,0.391836,13.070994,0.000485
1,Fertilizer Type,6.068047,35.886186,2,93,3.034023,0.385873,7.862752,0.0007


#ANOVA done using a package

In [112]:
# Import necessary libraries for ANOVA using a package
import statsmodels.api as sm
from statsmodels.formula.api import ols

# One-Way ANOVA for Crop Density
model_density = ols('crop_yield ~ C(crop_density)', data=df).fit()
anova_density_pkg = sm.stats.anova_lm(model_density, typ=2)

# One-Way ANOVA for Fertilizer Type
model_fertilizer = ols('crop_yield ~ C(fertilizer_types)', data=df).fit()
anova_fertilizer_pkg = sm.stats.anova_lm(model_fertilizer, typ=2)

# Combine the results into a single DataFrame
anova_results_one_way_pkg = pd.DataFrame({
    'Source': ['Crop Density', 'Fertilizer Type'],
    'SSB': [anova_density_pkg['sum_sq'][0], anova_fertilizer_pkg['sum_sq'][0]],
    'SSW': [anova_density_pkg['sum_sq'][1], anova_fertilizer_pkg['sum_sq'][1]],
    'DF_between': [anova_density_pkg['df'][0], anova_fertilizer_pkg['df'][0]],
    'DF_within': [anova_density_pkg['df'][1], anova_fertilizer_pkg['df'][1]],
    'F-statistic': [anova_density_pkg['F'][0], anova_fertilizer_pkg['F'][0]],
    'p-value': [anova_density_pkg['PR(>F)'][0], anova_fertilizer_pkg['PR(>F)'][0]]
})


  'SSB': [anova_density_pkg['sum_sq'][0], anova_fertilizer_pkg['sum_sq'][0]],
  'SSW': [anova_density_pkg['sum_sq'][1], anova_fertilizer_pkg['sum_sq'][1]],
  'DF_between': [anova_density_pkg['df'][0], anova_fertilizer_pkg['df'][0]],
  'DF_within': [anova_density_pkg['df'][1], anova_fertilizer_pkg['df'][1]],
  'F-statistic': [anova_density_pkg['F'][0], anova_fertilizer_pkg['F'][0]],
  'p-value': [anova_density_pkg['PR(>F)'][0], anova_fertilizer_pkg['PR(>F)'][0]]


In [113]:
anova_results_one_way_pkg

Unnamed: 0,Source,SSB,SSW,DF_between,DF_within,F-statistic,p-value
0,Crop Density,5.121681,36.832552,1.0,94.0,13.070994,0.000485
1,Fertilizer Type,6.068047,35.886186,2.0,93.0,7.862752,0.0007


#Titenic dataset

In [114]:
#For titenic data:

import seaborn as sns
df = sns.load_dataset('titanic')

In [115]:
df.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.05,S,Third,man,True,,Southampton,no,True


In [116]:
features = ['pclass', 'sex', 'age', 'sibsp', 'parch', 'fare', 'embarked']
X = df[features]

y = df['survived']

#X.head()

In [117]:
X.isnull().sum()

Unnamed: 0,0
pclass,0
sex,0
age,177
sibsp,0
parch,0
fare,0
embarked,2


In [118]:
#To remove null values from the dataset:

columns_numerical = X.select_dtypes(include=[np.number]).columns.to_list()
columns_nonnumerical = X.select_dtypes(exclude=[np.number]).columns.to_list()
columns_numerical, columns_nonnumerical

(['pclass', 'age', 'sibsp', 'parch', 'fare'], ['sex', 'embarked'])

In [119]:
from sklearn.impute import SimpleImputer

imputer = SimpleImputer(strategy='mean')
X[columns_numerical] = imputer.fit_transform(X[columns_numerical])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X[columns_numerical] = imputer.fit_transform(X[columns_numerical])


In [120]:
X[columns_numerical].isnull().sum()

Unnamed: 0,0
pclass,0
age,0
sibsp,0
parch,0
fare,0


In [121]:
non_numerical_imputer = SimpleImputer(strategy='most_frequent')
X[columns_nonnumerical] = non_numerical_imputer.fit_transform(X[columns_nonnumerical])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X[columns_nonnumerical] = non_numerical_imputer.fit_transform(X[columns_nonnumerical])


In [122]:
X[columns_nonnumerical].isnull().sum()

Unnamed: 0,0
sex,0
embarked,0


In [123]:
X.head()

Unnamed: 0,pclass,sex,age,sibsp,parch,fare,embarked
0,3.0,male,22.0,1.0,0.0,7.25,S
1,1.0,female,38.0,1.0,0.0,71.2833,C
2,3.0,female,26.0,0.0,0.0,7.925,S
3,1.0,female,35.0,1.0,0.0,53.1,S
4,3.0,male,35.0,0.0,0.0,8.05,S


In [124]:
y.head()

Unnamed: 0,survived
0,0
1,1
2,1
3,1
4,0


In [134]:
df = pd.concat([X, y],  axis=1)


In [135]:
df.columns

Index(['pclass', 'sex', 'age', 'sibsp', 'parch', 'fare', 'embarked',
       'survived'],
      dtype='object')

###All null values are removed from the data

#ANOVA on titenic dataset

In [151]:
# Import necessary libraries for ANOVA using a package
import statsmodels.api as sm
from statsmodels.formula.api import ols

# One-Way ANOVA for pclass
model_pclass = ols('survived ~ C(pclass)', data=df).fit()
anova_pclass_pkg = sm.stats.anova_lm(model_density, typ=2)

# One-Way ANOVA for sex
model_sex = ols('survived ~ C(sex)', data=df).fit()
anova_sex_pkg = sm.stats.anova_lm(model_fertilizer, typ=2)

# One-Way ANOVA for age
model_ager = ols('survived ~ C(age)', data=df).fit()
anova_age_pkg = sm.stats.anova_lm(model_fertilizer, typ=2)

# One-Way ANOVA for sibsp
model_sibsp = ols('survived ~ C(sibsp)', data=df).fit()
anova_sibsp_pkg = sm.stats.anova_lm(model_fertilizer, typ=2)

# One-Way ANOVA for parch
model_parch = ols('survived ~ C(parch)', data=df).fit()
anova_parch_pkg = sm.stats.anova_lm(model_fertilizer, typ=2)

# One-Way ANOVA for fare
model_fare = ols('survived ~ C(fare)', data=df).fit()
anova_fare_pkg = sm.stats.anova_lm(model_fertilizer, typ=2)

# One-Way ANOVA for embarked
model_embarked = ols('survived ~ C(embarked)', data=df).fit()
anova_embarked_pkg = sm.stats.anova_lm(model_fertilizer, typ=2)

# Combine the results into a single DataFrame
anova_results_one_way_pkg = pd.DataFrame({
    'Source': ['pclass', 'sex', 'age', 'sibsp', 'parch', 'fare', 'embarked'],
    'SSB': [anova_pclass_pkg['sum_sq'][0], anova_sex_pkg['sum_sq'][0], anova_age_pkg['sum_sq'][0], anova_sibsp_pkg['sum_sq'][0],
            anova_parch_pkg['sum_sq'][0], anova_fare_pkg['sum_sq'][0], anova_embarked_pkg['sum_sq'][0]],
    'SSW': [anova_pclass_pkg['sum_sq'][1], anova_sex_pkg['sum_sq'][1], anova_age_pkg['sum_sq'][1], anova_sibsp_pkg['sum_sq'][1],
            anova_parch_pkg['sum_sq'][1], anova_fare_pkg['sum_sq'][1], anova_embarked_pkg['sum_sq'][1]],
   'DF_between': [anova_pclass_pkg['df'][0], anova_sex_pkg['df'][0], anova_age_pkg['df'][0], anova_sibsp_pkg['df'][0],
            anova_parch_pkg['df'][0], anova_fare_pkg['df'][0], anova_embarked_pkg['df'][0]],
  'DF_within':  [anova_pclass_pkg['df'][1], anova_sex_pkg['df'][1], anova_age_pkg['df'][1], anova_sibsp_pkg['df'][1],
            anova_parch_pkg['df'][1], anova_fare_pkg['df'][1], anova_embarked_pkg['df'][1]],
   'F-statistic': [anova_pclass_pkg['F'][0], anova_sex_pkg['F'][0], anova_age_pkg['F'][0], anova_sibsp_pkg['F'][0],
            anova_parch_pkg['F'][0], anova_fare_pkg['F'][0], anova_embarked_pkg['F'][0]],
  'p-value': [anova_pclass_pkg['PR(>F)'][0], anova_sex_pkg['PR(>F)'][0], anova_age_pkg['PR(>F)'][0], anova_sibsp_pkg['PR(>F)'][0],
            anova_parch_pkg['PR(>F)'][0], anova_fare_pkg['PR(>F)'][0], anova_embarked_pkg['PR(>F)'][0]]
})

  'SSB': [anova_pclass_pkg['sum_sq'][0], anova_sex_pkg['sum_sq'][0], anova_age_pkg['sum_sq'][0], anova_sibsp_pkg['sum_sq'][0],
  anova_parch_pkg['sum_sq'][0], anova_fare_pkg['sum_sq'][0], anova_embarked_pkg['sum_sq'][0]],
  'SSW': [anova_pclass_pkg['sum_sq'][1], anova_sex_pkg['sum_sq'][1], anova_age_pkg['sum_sq'][1], anova_sibsp_pkg['sum_sq'][1],
  anova_parch_pkg['sum_sq'][1], anova_fare_pkg['sum_sq'][1], anova_embarked_pkg['sum_sq'][1]],
  'DF_between': [anova_pclass_pkg['df'][0], anova_sex_pkg['df'][0], anova_age_pkg['df'][0], anova_sibsp_pkg['df'][0],
  anova_parch_pkg['df'][0], anova_fare_pkg['df'][0], anova_embarked_pkg['df'][0]],
  'DF_within':  [anova_pclass_pkg['df'][1], anova_sex_pkg['df'][1], anova_age_pkg['df'][1], anova_sibsp_pkg['df'][1],
  anova_parch_pkg['df'][1], anova_fare_pkg['df'][1], anova_embarked_pkg['df'][1]],
  'F-statistic': [anova_pclass_pkg['F'][0], anova_sex_pkg['F'][0], anova_age_pkg['F'][0], anova_sibsp_pkg['F'][0],
  anova_parch_pkg['F'][0], anova_fare_p

#Results of ANOVA

In [152]:
anova_results_one_way_pkg

Unnamed: 0,Source,SSB,SSW,DF_between,DF_within,F-statistic,p-value
0,pclass,24.333912,186.39336,2.0,888.0,57.964818,2.183247e-24
1,sex,6.140761,204.586511,2.0,888.0,13.326871,1.983239e-06
2,age,6.140761,204.586511,2.0,888.0,13.326871,1.983239e-06
3,sibsp,6.140761,204.586511,2.0,888.0,13.326871,1.983239e-06
4,parch,6.140761,204.586511,2.0,888.0,13.326871,1.983239e-06
5,fare,6.140761,204.586511,2.0,888.0,13.326871,1.983239e-06
6,embarked,6.140761,204.586511,2.0,888.0,13.326871,1.983239e-06
