In [None]:
import pandas as pd
import numpy as np
from scipy import stats
from sklearn.preprocessing import StandardScaler
from statsmodels.graphics.correlation import plot_corr
import matplotlib.pyplot as plt
import seaborn as sns

# Load data
# NOTE: `df` must exist before anything below can run. Uncomment the line
# below (and point it at the real file) when starting from a fresh kernel.
# df = pd.read_csv("data.csv")

# Preview the first 10 rows
print(df.head(10))

# Summary statistics
print(df.describe())

# Missing values per column, before any handling
print("Missing values before handling:\n", df.isnull().sum())

# Number of distinct values per column
print("Number of unique values:\n", df.nunique())

# Frequency table for every column
for col in df.columns:
    print(f'Value counts for {col}:\n', df[col].value_counts())


# Fill missing values with the column mean.
# numeric_only=True is required: on pandas >= 2.0 a plain df.mean() raises
# TypeError as soon as the frame contains a non-numeric (e.g. string) column.
# Only columns that have a mean are filled; other columns keep their NaNs.
df = df.fillna(df.mean(numeric_only=True))

# Missing values per column, after filling
print("Missing values after handling:\n", df.isnull().sum())

# Column dtypes
print("Data types:\n", df.dtypes)

# Convert categorical variables into dummy (indicator) variables.
# Calling pd.get_dummies on the whole frame encodes every object, string and
# category column in one pass (the previous per-column loop only matched the
# 'object' dtype, so category/string columns reached the scaler unencoded).
# Each indicator is prefixed with its source column name and the indicators
# are appended after the untouched columns, as before.
df = pd.get_dummies(df)

# Check the data types again
print("Data types after handling categorical variables:\n", df.dtypes)

# Standardize the data (zero mean, unit variance per column).
# fit_transform returns a bare array, so the frame is rebuilt with both the
# original column labels and the original index; without index=df.index the
# scaled frame would silently get a fresh 0..n-1 index and no longer line up
# with `df` row-for-row when the source index is not the default one.
scaler = StandardScaler()
df_scaled = pd.DataFrame(
    scaler.fit_transform(df), columns=df.columns, index=df.index)

# Correlation matrix of the standardized columns
corr = df_scaled.corr()
print("Correlation matrix:\n", corr)

# Heatmap of the correlation matrix, colour scale pinned to [-1, 1]
plt.figure(figsize=(10, 10))
sns.heatmap(corr, vmin=-1, vmax=1, annot=True, cmap='BrBG')
plt.title("Correlation Heatmap")
plt.show()

# Shape statistics: skewness and kurtosis of every column
for name in df.columns:
    series = df[name]
    print(f'Skewness of {name}: {series.skew()}')
    print(f'Kurtosis of {name}: {series.kurt()}')

# One 50-bin histogram per column, each on its own figure
for name in df.columns:
    fig, ax = plt.subplots()
    ax.hist(df[name], bins=50)
    ax.set_title(f'Histogram of {name}')
    plt.show()

# One boxplot per column, each on its own figure
for name in df.columns:
    fig, ax = plt.subplots()
    sns.boxplot(x=df[name], ax=ax)
    ax.set_title(f'Boxplot of {name}')
    plt.show()

# One scatterplot per unordered pair of columns (earlier column on the x axis)
all_columns = df.columns
for position, x_name in enumerate(all_columns):
    for y_name in all_columns[position + 1:]:
        fig, ax = plt.subplots()
        ax.scatter(df[x_name], df[y_name])
        ax.set_title(f'Scatterplot of {y_name} vs {x_name}')
        ax.set_xlabel(x_name)
        ax.set_ylabel(y_name)
        plt.show()

# One-sample t-test of every column against a population mean of 0
for name in df.columns:
    t_stat, p_val = stats.ttest_1samp(df[name], 0)
    report = f'T-test for {name}: t statistic = {t_stat:.3f}, p-value = {p_val:.3f}'
    print(report)


In [None]:
# Part 2
# Data Imputation: Replacing missing values using SimpleImputer
from statsmodels.stats.outliers_influence import variance_inflation_factor
from sklearn.decomposition import PCA
import statsmodels.api as sm
from sklearn.impute import SimpleImputer
# Replace any remaining NaN with the column mean.
# fit_transform returns a bare array, so the frame is rebuilt with the
# original column labels and the original index (index=df.index keeps
# df_imputed aligned row-for-row with `df`).
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
df_imputed = pd.DataFrame(
    imputer.fit_transform(df), columns=df.columns, index=df.index)

# One-Hot Encoding for Categorical variables
# NOTE(review): df_imputed is built from the imputer's numeric output, so this
# call presumably has nothing left to encode -- confirm it is still needed.
df_encoded = pd.get_dummies(df_imputed, drop_first=True)

# Absolute pairwise correlations between all encoded columns
print("Pairwise correlation of different categories:")
pairwise_corr = df_encoded.corr().abs()
print(pairwise_corr)

# Outlier detection using Z-score: keep rows whose every |z| is below 3.
z_scores = np.abs(stats.zscore(df_encoded))
# A zero-variance column produces NaN z-scores, and `NaN < 3` is False, which
# would discard every row. Treat NaN as "not an outlier" so that constant
# columns do not empty the frame.
within_limits = (z_scores < 3) | np.isnan(z_scores)
df_outliers_removed = df_encoded[within_limits.all(axis=1)]

# Multiple scatter plots of the outlier-free data using seaborn's pairplot
sns.pairplot(df_outliers_removed)
plt.show()

# Chi-Square Test for Categorical variables
# NOTE(review): chi2_contingency interprets its argument as a contingency
# table of observed frequencies. Here it receives the whole encoded data frame
# rather than a cross-tabulation of two categorical columns (compare the
# pd.crosstab(...) call used for the same test later in this notebook) --
# confirm this is intended before relying on the statistic or p-value.
chi2_stat, p_val, dof, ex = stats.chi2_contingency(df_encoded)
print(f'Chi-Square Statistic : {chi2_stat}\n')
print(f'p-value : {p_val}\n')

# Ordinary least squares: the last encoded column is regressed on all the
# other columns plus an intercept term.
Y = df_encoded.iloc[:, -1]
X = sm.add_constant(df_encoded.iloc[:, :-1])
model = sm.OLS(Y, X).fit()
predictions = model.predict(X)
print(model.summary())

# Project the encoded data onto its first two principal components and plot
pca = PCA(n_components=2)
principalComponents = pca.fit_transform(df_encoded)
principalDf = pd.DataFrame(principalComponents, columns=['PC1', 'PC2'])
sns.scatterplot(x='PC1', y='PC2', data=principalDf)
plt.show()

# Variance inflation factor of every design-matrix column (constant included)
# as a multicollinearity check
vif_data = pd.DataFrame({
    "feature": X.columns,
    "VIF": [variance_inflation_factor(X.values, position)
            for position in range(X.shape[1])],
})
print(vif_data)

# Plotting distribution of each variable.
# sns.distplot is deprecated in seaborn and scheduled for removal; histplot
# with kde=True and stat="density" draws the same histogram-plus-density view
# on a density-scaled y axis.
for col in df_encoded.columns:
    sns.histplot(df_encoded[col], kde=True, stat="density")
    plt.title(f'Distribution of {col}')
    plt.show()

# Group rows by the value of the first column and average the other columns
group_key = df_encoded.columns[0]
grouped_data = df_encoded.groupby(group_key).mean()
print("Grouped and aggregated data:\n", grouped_data)

# Element-wise difference for every unordered pair of columns
encoded_columns = df_encoded.columns
for position, left_name in enumerate(encoded_columns):
    for right_name in encoded_columns[position + 1:]:
        diff = df_encoded[left_name] - df_encoded[right_name]
        print(f'Difference between {left_name} and {right_name}: {diff}')

# Bivariate analysis: one joint plot for every unordered pair of columns
for position, x_name in enumerate(encoded_columns):
    for y_name in encoded_columns[position + 1:]:
        x_values = df_encoded[x_name]
        y_values = df_encoded[y_name]
        sns.jointplot(x=x_values, y=y_values)
        plt.show()


In [None]:
# Part 3
from keras.models import Sequential
from keras.layers import Dense
# `keras.utils.np_utils` is not available in current Keras releases;
# to_categorical is exposed directly from keras.utils.
from keras.utils import to_categorical
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

# Assume the last column is the target variable and is categorical
X = df_encoded.iloc[:, :-1]
y = df_encoded.iloc[:, -1]

# Map the target labels to integer codes, then one-hot encode the codes
encoder = LabelEncoder()
encoded_y = encoder.fit_transform(y)
dummy_y = to_categorical(encoded_y)

# Hold out 20% of the rows as the test set (fixed seed for repeatability)
X_train, X_test, y_train, y_test = train_test_split(
    X, dummy_y, test_size=0.2, random_state=0)

# Feed-forward classifier: two ReLU hidden layers (32 and 16 units) followed
# by a softmax layer with one unit per one-hot target column.
n_features = X_train.shape[1]
n_classes = y_train.shape[1]
model = Sequential([
    Dense(32, input_dim=n_features, activation='relu'),
    Dense(16, activation='relu'),
    Dense(n_classes, activation='softmax'),  # Multi-class classification
])

# Compile with cross-entropy loss, the Adam optimizer and accuracy tracking
model.compile(
    loss='categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

# Train for 50 epochs in batches of 10, validating on the held-out split
history = model.fit(
    X_train,
    y_train,
    validation_data=(X_test, y_test),
    epochs=50,
    batch_size=10,
)

# Evaluate on the held-out split and report the second metric as a percentage
scores = model.evaluate(X_test, y_test, verbose=0)
print(f"{model.metrics_names[1]}: {scores[1] * 100:.2f}%")

# Plot the training and validation curves, first for accuracy and then for
# loss, each on its own figure.
for metric_key, heading, y_label in (
        ('accuracy', 'Model accuracy', 'Accuracy'),
        ('loss', 'Model loss', 'Loss')):
    plt.plot(history.history[metric_key])
    plt.plot(history.history['val_' + metric_key])
    plt.title(heading)
    plt.ylabel(y_label)
    plt.xlabel('Epoch')
    plt.legend(['Train', 'Test'], loc='upper left')
    plt.show()


In [None]:
# Part 4
import seaborn as sns

# Work on a copy of df_encoded for this visualisation section
df_vis = df_encoded.copy()

# 1. Heatmap for correlation
plt.figure(figsize=(10, 8))
sns.heatmap(df_vis.corr(), cmap='coolwarm')
plt.title("Heatmap of Correlation")
plt.show()

# 2. Pairplot - plots pairwise relationships in a dataset.
# A sample keeps the grid fast to draw. The size is capped at the number of
# available rows (sample(n=100) raises ValueError on a smaller frame) and the
# draw is seeded so the figure is reproducible.
pairplot_sample = df_vis.sample(n=min(100, len(df_vis)), random_state=0)
sns.pairplot(pairplot_sample)
# pairplot builds a grid of axes; plt.title would label only the last subplot,
# so the heading is attached to the figure instead.
plt.suptitle("Pairplot of Variables", y=1.02)
plt.show()

def _plot_each_column(draw, title_template):
    """Draw, title and show one figure per column of df_vis.

    draw: callable taking a column label and drawing that column.
    title_template: str.format template that receives the column label.
    """
    for column in df_vis.columns:
        draw(column)
        plt.title(title_template.format(column))
        plt.show()


# 3. Histogram (with KDE overlay) for each column
_plot_each_column(
    lambda column: sns.histplot(data=df_vis, x=column, kde=True),
    'Histogram of {}')

# 4. Boxplot for each column
_plot_each_column(
    lambda column: sns.boxplot(x=df_vis[column]),
    'Boxplot of {}')

# 5. Violinplot for each column
_plot_each_column(
    lambda column: sns.violinplot(x=df_vis[column]),
    'Violin Plot of {}')

# The remaining plots all use the first column (and, where a second variable
# is needed, the second column) of df_vis.
first_col = df_vis.columns[0]
second_col = df_vis.columns[1]

# 6. Scatter plot for first two columns
sns.scatterplot(data=df_vis, x=first_col, y=second_col)
plt.title("Scatterplot")
plt.show()

# 7. Countplot for first column
sns.countplot(x=df_vis[first_col])
plt.title("Count Plot")
plt.show()

# 8. Barplot of the second column against the first
sns.barplot(x=first_col, y=second_col, data=df_vis)
plt.title("Bar Plot")
plt.show()

# 9. Distribution plot for first column
sns.displot(df_vis, x=first_col, kde=True)
plt.title("Distribution Plot")
plt.show()

# 10. Joint plot for first two columns.
# jointplot creates a joint axes plus two marginal axes; plt.title would label
# whichever axes is current (a marginal one), so the heading is attached to
# the figure instead.
sns.jointplot(data=df_vis, x=first_col, y=second_col)
plt.suptitle("Joint Plot", y=1.02)
plt.show()


In [None]:
import pandas as pd
from scipy import stats

# Let's assume we have two dataframes: df1 and df2

# Load your data
df1 = pd.read_csv('dataset1.csv')
df2 = pd.read_csv('dataset2.csv')

# Column to compare across the two dataframes.
# Missing values are dropped up front: a single NaN makes f_oneway return NaN
# for both the statistic and the p-value, and `NaN < 0.05` is False, so the
# interpretation below would report "no significant difference" regardless of
# the data.
col1 = df1['YourColumn'].dropna()
col2 = df2['YourColumn'].dropna()

# Perform one-way ANOVA on the two samples
f_stat, p_val = stats.f_oneway(col1, col2)

print("F-statistic:", f_stat)
print("P-value:", p_val)

# Interpretation at the 5% significance level
if p_val < 0.05:
    print("There is a significant difference between the means of the two datasets.")
else:
    print("There is no significant difference between the means of the two datasets.")


In [None]:
# Independent two-sample t-test on the two columns selected earlier
ttest_outcome = stats.ttest_ind(col1, col2)
t_stat, p_val_t = ttest_outcome

print("T-statistic:", t_stat)
print("P-value:", p_val_t)

# Interpretation at the 5% significance level
ttest_verdict = "a" if p_val_t < 0.05 else "no"
print(f"There is {ttest_verdict} significant difference between the means of the two datasets (T-test).")


In [None]:
# 'categorical_var1' (from df1) and 'categorical_var2' (from df2) are assumed
# to be categorical variables.

# Cross-tabulate the two columns, then run the chi-square test on the table
contingency_table = pd.crosstab(
    df1['categorical_var1'], df2['categorical_var2'])
chi_stat, p_val_chi, dof, ex = stats.chi2_contingency(contingency_table)

print("Chi-square statistic:", chi_stat)
print("P-value:", p_val_chi)

# Interpretation at the 5% significance level
chi_verdict = "a" if p_val_chi < 0.05 else "no"
print(f"There is {chi_verdict} significant relationship between the two categorical variables.")


In [None]:
# Perform Mann-Whitney U Test.
# The alternative is stated explicitly: older SciPy releases defaulted to a
# deprecated mode whose p-value is not the two-sided one, while newer releases
# default to 'two-sided'. Pinning it keeps the p-value, and therefore the
# interpretation below, the same across SciPy versions.
u_stat, p_val_u = stats.mannwhitneyu(col1, col2, alternative='two-sided')

print("U statistic:", u_stat)
print("P-value:", p_val_u)

# Interpretation at the 5% significance level
if p_val_u < 0.05:
    print("There is a significant difference between the distributions of the two datasets.")
else:
    print("There is no significant difference between the distributions of the two datasets.")


In [None]:
from statsmodels.multivariate.manova import MANOVA
# Assumes df1 and df2 have been combined into a single dataframe `df` whose
# 'group' column holds the group labels.
manova_formula = 'col1 + col2 + col3 ~ group'
manova = MANOVA.from_formula(manova_formula, data=df)
manova_tests = manova.mv_test()
print(manova_tests)


In [None]:
"""
Kruskal-Wallis H-test: This is the non-parametric version of ANOVA, used when the assumptions of ANOVA are not met (e.g., when the dependent variable is not normally distributed).
"""

from scipy.stats import kruskal
stat, p = kruskal(df1["column"], df2["column"])
print('Statistics=%.3f, p=%.3f' % (stat, p))


In [None]:
"""
Two-way ANOVA: You would use two-way ANOVA when you want to know how two independent variables, in combination, affect a dependent variable.
"""

from statsmodels.stats.anova import AnovaRM
# Assuming that df1 and df2 are combined into one dataframe, with a "treatment" column indicating the group
res = AnovaRM(df, 'dependent_var', 'subject_id', within=[
              'independent_var1', 'independent_var2'], aggregate_func='mean').fit()
print(res)


In [None]:
"""
Generalized Linear Models (GLM): These models extend the ordinary linear regression models to allow for response variables that have error distribution models other than a normal distribution.
"""

import statsmodels.api as sm
glm_binom = sm.GLM(y, X, family=sm.families.Binomial())
res = glm_binom.fit()
print(res.summary())


In [None]:
"""
Time Series Analysis (TSA): If you are dealing with time-series data, TSA tests can be helpful. This includes autocorrelation, partial autocorrelation, and Dickey-Fuller tests among others.
"""
from statsmodels.tsa.stattools import adfuller
result = adfuller(df['column'])
print('ADF Statistic: %f' % result[0])
print('p-value: %f' % result[1])


In [None]:
"""
Survival Analysis: This set of statistical approaches is used to analyze the time until the occurrence of an event. Libraries like lifelines in Python provide functionalities for this.
"""

from lifelines import KaplanMeierFitter
kmf = KaplanMeierFitter()
kmf.fit(T, E)
kmf.plot_survival_function()


In [None]:
import statsmodels.api as sm
from scipy.stats import ttest_ind, f_oneway, kruskal
from statsmodels.multivariate.manova import MANOVA
from statsmodels.formula.api import ols
from statsmodels.tsa.stattools import adfuller
from lifelines import KaplanMeierFitter

# T-Test
# Checks whether the means of two independent samples differ significantly.
# arr1, arr2: arrays (or lists) with the numerical data of the two groups
# Output: t_statistic and p_value as returned by ttest_ind
ttest_outcome = ttest_ind(arr1, arr2)
t_statistic, p_value = ttest_outcome
print("T-test:")
print(f"T-statistic: {t_statistic}, P-value: {p_value}\n")

# One-way ANOVA
# Checks whether the means of several independent groups differ significantly.
# arr1, arr2, arr3: arrays (or lists) with the numerical data of each group
# Output: f_statistic and p_value as returned by f_oneway
anova_outcome = f_oneway(arr1, arr2, arr3)
f_statistic, p_value = anova_outcome
print("ANOVA:")
print(f"F-statistic: {f_statistic}, P-value: {p_value}\n")

# MANOVA
# Extends ANOVA to two or more dependent variables (num1 and num2) and tests
# whether the category (cat) affects them.
# df: DataFrame containing the data
# num1, num2: column names of the numerical dependent variables
# cat: column name of the categorical independent variable
# Output: the multivariate test table produced by mv_test()
mv = MANOVA.from_formula('num1 + num2 ~ cat', data=df)
manova_report = mv.mv_test()
print("MANOVA:")
print(manova_report)
print()

# Kruskal-Wallis H-test
# Rank-based comparison of several independent groups on a continuous or
# ordinal variable.
# arr1, arr2, arr3: arrays (or lists) with the numerical data of each group
# Output: h_statistic and p_value as returned by kruskal
kruskal_outcome = kruskal(arr1, arr2, arr3)
h_statistic, p_value = kruskal_outcome
print("Kruskal-Wallis H-test:")
print(f"H-statistic: {h_statistic}, P-value: {p_value}\n")

# Two-way ANOVA
# Tests how two categorical variables (cat1 and cat2), including their
# interaction, affect a numerical dependent variable (num).
# df: DataFrame containing the data
# num: column name of the numerical dependent variable
# cat1, cat2: column names of the categorical independent variables
# Output: the type-2 ANOVA table computed from the fitted OLS model
two_way_formula = 'num ~ C(cat1) + C(cat2) + C(cat1):C(cat2)'
model = ols(two_way_formula, data=df).fit()
anova_table = sm.stats.anova_lm(model, typ=2)
print("Two-way ANOVA:")
print(anova_table)
print()

# Generalized Linear Models (GLM)
# Generalizes ordinary linear regression to responses with a non-normal
# distribution; here a Binomial family is used.
# y: array (or list) containing the dependent variable
# X: 2D array (or DataFrame) containing the independent variables
# Output: the fitted model's summary table
binomial_family = sm.families.Binomial()
model = sm.GLM(y, X, family=binomial_family)
result = model.fit()
glm_summary = result.summary()
print("GLM:")
print(glm_summary)
print()

# Time Series Analysis (TSA)
# The Augmented Dickey-Fuller (ADF) test is a unit root test used to check
# whether the time series (ts) is stationary.
# ts: array (or list) containing the time series data
# Output: the first two entries of the result, printed as the ADF statistic
# and the p-value
result = adfuller(ts)
adf_statistic, adf_p_value = result[0], result[1]
print("ADF test:")
print(f"ADF Statistic: {adf_statistic}, P-value: {adf_p_value}\n")

# Survival Analysis
# The Kaplan-Meier estimator is a non-parametric estimate of the survival
# function from lifetime data (T and E).
# T: array (or list) containing the time-to-event
# E: array (or list) containing the event indicator (1=event, 0=censor)
# Output: Kaplan-Meier survival function plot
kmf = KaplanMeierFitter()
kmf.fit(T, event_observed=E)
kmf.plot_survival_function()
