# Inferential Statistics

In [16]:
import math
import os
from collections import Counter

import matplotlib.pyplot as plt
import networkx as nx
import numpy as np
import pandas as pd
import seaborn as sns
import statsmodels.api as sm
from imblearn.over_sampling import SMOTE
from scipy.stats import chi2_contingency
from sklearn.cluster import DBSCAN
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from sklearn.preprocessing import StandardScaler
from statsmodels.stats.proportion import proportion_confint


# Location of the dataset CSV. The DATASET_PATH environment variable, when set,
# overrides the default so the notebook can run on machines other than the one
# it was written on; the default is the original author's local path.
file_path = os.environ.get(
    'DATASET_PATH',
    '/Users/rakibul/Desktop/uni koblenz/winter 202324/Research Lab/code/models/dataset.csv',
)
dataset = pd.read_csv(file_path)

In [11]:
# Split the rows into the prognosis of interest (e.g., "Lyme_disease") and all other rows
selected_prognosis = "Lyme_disease"
is_selected = dataset['prognosis'] == selected_prognosis
prognosis_data = dataset[is_selected]
non_prognosis_data = dataset[~is_selected]

In [12]:
# Symptom columns: every column label except the first and the last
# (described in the original note as the 'id' and 'prognosis' columns).
symptom_slice = slice(1, -1)
symptom_columns = dataset.columns[symptom_slice]

In [13]:
# Chi-square test of independence: one test per symptom against membership in
# the selected prognosis; only the p-value of each test is stored.
# Note: scipy's chi2_contingency applies Yates' continuity correction by default
# when the table has one degree of freedom (e.g. a 2x2 table).
in_selected_group = dataset['prognosis'] == selected_prognosis
chi_square_results = {}
for symptom in symptom_columns:
    contingency_table = pd.crosstab(dataset[symptom], in_selected_group)
    # The first two fields of the result are the test statistic and the p-value
    chi2, p = chi2_contingency(contingency_table)[:2]
    chi_square_results[symptom] = p

In [14]:
# 95% confidence interval (alpha=0.05, normal approximation) for the proportion
# of selected-prognosis rows in which each symptom is present.
confidence_intervals = {}
n = prognosis_data.shape[0]  # number of rows with the selected prognosis
for symptom in symptom_columns:
    # The column sum is used as the number of "successes"
    # (assumes symptoms are coded 0/1 — TODO confirm against the dataset)
    count = prognosis_data[symptom].sum()
    lower, upper = proportion_confint(count=count, nobs=n, alpha=0.05, method='normal')
    confidence_intervals[symptom] = (lower, upper)

In [17]:
# Logistic regression of "is the selected prognosis" on all symptom columns
X = dataset.loc[:, symptom_columns]
# Binary target: 1 where the row's prognosis equals selected_prognosis, else 0
y = dataset['prognosis'].eq(selected_prognosis).astype(int)
# add_constant supplies the intercept column for the model
design_matrix = sm.add_constant(X)
logit_model = sm.Logit(y, design_matrix)
# fit() prints the optimizer's termination status as cell output
result = logit_model.fit()

Optimization terminated successfully.
         Current function value: 0.096957
         Iterations 11


In [18]:
# Tabulate the per-symptom p-values and confidence bounds, indexed by symptom name
chi_square_results_df = pd.DataFrame.from_dict(
    chi_square_results,
    orient='index',
    columns=['p-value'],
)
confidence_intervals_df = pd.DataFrame.from_dict(
    confidence_intervals,
    orient='index',
    columns=['Lower CI', 'Upper CI'],
)
logit_summary = result.summary()

# The trailing tuple is the cell's value, so all three objects are shown together
(chi_square_results_df, confidence_intervals_df, logit_summary)

(                     p-value
 sudden_fever    3.867413e-04
 headache        8.323053e-03
 mouth_bleed     1.295252e-02
 nose_bleed      8.534152e-03
 muscle_pain     4.568344e-01
 ...                      ...
 itchiness       1.000000e+00
 ulcers          4.025885e-02
 toenail_loss    7.905138e-01
 speech_problem  3.218323e-08
 bullseye_rash   1.127832e-08
 
 [64 rows x 1 columns],
                 Lower CI  Upper CI
 sudden_fever    0.632308  0.867692
 headache        0.503734  0.765496
 mouth_bleed     0.503734  0.765496
 nose_bleed      0.545579  0.800574
 muscle_pain     0.442642  0.711204
 ...                  ...       ...
 itchiness       0.055781  0.251911
 ulcers          0.000000  0.090730
 toenail_loss    0.028549  0.202220
 speech_problem  0.070252  0.275902
 bullseye_rash   0.070252  0.275902
 
 [64 rows x 2 columns],
 <class 'statsmodels.iolib.summary.Summary'>
 """
                            Logit Regression Results                           
 Dep. Variable:           