In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import ttest_ind
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score
from statsmodels.stats.outliers_influence import variance_inflation_factor
import statsmodels.api as sm
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import RFE

In [None]:

# Load the morphological YAP features CSV into a DataFrame and display it.
# NOTE(review): this is an absolute, machine-specific path — consider deriving it
# from a configurable data directory so the notebook runs elsewhere.
YAP_FEATURES_CSV = "/Users/seanlavi/dev/Schizophrenic_Speech/data/morphological_yap_features.csv"
yap_df = pd.read_csv(YAP_FEATURES_CSV, index_col=False)
yap_df

In [None]:
# Keep only the columns used by the rest of the notebook and drop rows whose
# score list is the literal string "[]" (nothing to aggregate).
# .copy() detaches the result from the frame it was sliced from, so the column
# assignments made in later cells write to an independent DataFrame instead of
# a slice (this is what triggers pandas' SettingWithCopyWarning otherwise).
yap_df = yap_df[["attention_scores_per_sentence", "person", "label", "question"]]
yap_df = yap_df[yap_df["attention_scores_per_sentence"] != "[]"].copy()
yap_df.head()

In [None]:
# Turn the stringified score lists read from the CSV back into Python objects.
# NOTE(review): eval() executes whatever expression the cell contains; this is only
# safe if the CSV is fully trusted. Also note this cell cannot be re-run on already
# parsed data (eval on a list raises TypeError).
SCORES_COL = 'attention_scores_per_sentence'
yap_df[SCORES_COL] = yap_df[SCORES_COL].apply(eval)

# Per-row summary statistics of the parsed scores: mean, variance and minimum
for new_col, reducer in (
    ('mean_attention_score', np.mean),
    ("var_attention_score", np.var),
    ('attention_min', np.min),
):
    yap_df[new_col] = yap_df[SCORES_COL].apply(reducer)

def calculate_weighted_average(scores):
    """Return the exponentially weighted average of ``scores``.

    The element at 0-based position ``i`` receives a weight proportional to
    ``exp(i + 1)``, so later entries contribute far more than earlier ones.

    Parameters
    ----------
    scores : sequence of numbers
        Values to average. Must be non-empty; ``np.average`` raises when the
        weights sum to zero, which is the case for an empty sequence.

    Returns
    -------
    float
        ``sum(w_i * scores_i) / sum(w_i)`` with ``w_i`` proportional to ``exp(i + 1)``.
    """
    n = len(scores)
    # Shift every exponent by -n so the largest weight is exp(0) == 1.
    # np.average divides by the sum of the weights, so this common factor cancels
    # and the result is mathematically unchanged, but np.exp can no longer
    # overflow to inf (which turned the average into NaN for lists longer than
    # roughly 709 elements).
    weights = np.exp(np.arange(1, n + 1) - n)
    weighted_avg = np.average(scores, weights=weights)
    return weighted_avg


# Exponentially weighted average of each row's score list (later scores weigh more),
# stored as a new column; then preview the first ten rows.
weighted_avg_series = yap_df['attention_scores_per_sentence'].apply(calculate_weighted_average)
yap_df['weighted_avg_attention'] = weighted_avg_series
yap_df.head(10)

In [None]:
# Number of scores stored in each row's 'attention_scores_per_sentence' list
yap_df['list_size'] = yap_df['attention_scores_per_sentence'].apply(len)

# Distribution of those list lengths, drawn on an explicit figure/axes pair
size_fig, size_ax = plt.subplots(figsize=(8, 6))
size_ax.hist(yap_df['list_size'], bins=30, color='skyblue', edgecolor='black')
size_ax.set_title('Histogram of List Sizes in attention_scores_per_sentence')
size_ax.set_xlabel('Size of List')
size_ax.set_ylabel('Frequency')
size_ax.grid(axis='y', alpha=0.75)
plt.show()


In [None]:
# Columns whose distributions are plotted below.
# Defined in this cell because it runs before the later cell that also assigns
# `metrics`; without it a fresh Restart & Run All fails here with NameError.
metrics = ['mean_attention_score', 'var_attention_score', 'attention_min', 'weighted_avg_attention']

# 2x2 grid of histograms, flattened so axes[i] pairs with metrics[i]
fig, axes = plt.subplots(2, 2, figsize=(15, 10))
axes = axes.flatten()

for i, metric in enumerate(metrics):
    # Histogram with a kernel density estimate overlaid
    sns.histplot(yap_df[metric], kde=True, ax=axes[i])
    axes[i].set_title(f'Histogram of {metric}')
    axes[i].set_xlabel(metric)
    axes[i].set_ylabel('Frequency')

plt.tight_layout()
plt.show()

In [None]:

# Attention-derived columns compared across labels
metrics = [
    'mean_attention_score',
    'var_attention_score',
    'attention_min',
    'weighted_avg_attention',
]

# 2x2 grid of panels, flattened so each metric is paired with one axis
fig, axes = plt.subplots(2, 2, figsize=(15, 10))
axes = axes.flatten()

# One boxplot per metric, grouped by the 'label' column
for panel, metric in zip(axes, metrics):
    sns.boxplot(x='label', y=metric, data=yap_df, ax=panel)
    panel.set_title(f'Boxplot of {metric}')
    panel.set_xlabel('Label')
    panel.set_ylabel(metric)

plt.tight_layout()
plt.show()


In [None]:
# Split rows by label: 0 is plotted as "Regular", 1 as "Schizophrenic"
is_schizo = yap_df['label'] == 1
regular_group = yap_df[yap_df['label'] == 0]
schizo_group = yap_df[is_schizo]

# One stacked panel per metric
fig, axes = plt.subplots(len(metrics), 1, figsize=(10, 6 * len(metrics)))

# (frame, colour, legend label) for each group, drawn in this order on every panel
group_styles = (
    (regular_group, 'blue', 'Regular'),
    (schizo_group, 'red', 'Schizophrenic'),
)

for panel, metric in zip(axes, metrics):
    pretty_name = metric.replace("_", " ").title()
    for group_frame, colour, legend_label in group_styles:
        sns.histplot(group_frame[metric], color=colour, label=legend_label, kde=True, ax=panel)
    panel.set_title(f'Distribution of {pretty_name}')
    panel.set_xlabel(pretty_name)
    panel.set_ylabel('Frequency')
    panel.legend()

plt.tight_layout()
plt.show()

In [None]:

# Define a function to perform t-tests and print results for each feature
def perform_t_tests(features, schizo_group, regular_group, alpha=0.05):
    """Run a two-sample t-test per feature and print the outcome.

    Parameters
    ----------
    features : iterable of str
        Keys looked up in both groups via ``group[feature]``.
    schizo_group, regular_group : mapping of feature name to samples
        The two groups to compare (DataFrames in this notebook).
    alpha : float, default 0.05
        Significance threshold; a p-value strictly below it is reported as a
        statistically significant difference.

    Returns
    -------
    None
        Results are only printed.
    """
    for feature in features:
        # scipy.stats.ttest_ind with its default settings
        t_stat, p_value = ttest_ind(schizo_group[feature], regular_group[feature])

        print(f"\nT-Test Results for {feature}:")
        print(f"T-statistic: {t_stat:.4f}, P-value: {p_value:.4f}")

        # Interpret results against the chosen significance threshold
        if p_value < alpha:
            print(f"There is a statistically significant difference in {feature} between schizophrenic and regular persons.")
        else:
            print(f"There is no statistically significant difference in {feature} between schizophrenic and regular persons.")

# Features compared between the two groups.
# 'var_attention_score' is left out: it is not normally distributed, so the t-test is not valid for it.
features_to_test = [
    'mean_attention_score',
    'attention_min',
    'weighted_avg_attention',
]

# Run and print one t-test per listed feature
perform_t_tests(features_to_test, schizo_group, regular_group)


In [None]:
# Pairwise correlations between the attention features and the label
corr_columns = [
    'mean_attention_score',
    'var_attention_score',
    'attention_min',
    'weighted_avg_attention',
    'label',
]
corr_matrix = yap_df[corr_columns].corr()

# Annotated heatmap of the correlation matrix on an explicit axes
corr_fig, corr_ax = plt.subplots(figsize=(8, 6))
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', fmt='.2f', ax=corr_ax)
corr_ax.set_title('Correlation Matrix of Attention Score Features')
plt.show()

In [None]:
# Select features and target variable
features = ['mean_attention_score', 'var_attention_score', 'attention_min', 'weighted_avg_attention']
X = yap_df[features]
y = yap_df['label']  # target column; earlier cells treat 0 as regular and 1 as schizophrenic

# Split the data into training and testing sets.
# stratify=y keeps the class proportions the same in both splits, so the test
# metrics are not distorted by an unlucky class mix.
# NOTE(review): rows are split independently of the 'person' column. If one person
# contributes several rows, the same person can end up in both splits (leakage) —
# consider a group-aware split on 'person'.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Standardise the features using training-set statistics only (no test leakage).
# LogisticRegression applies L2 regularisation by default, which is scale-dependent,
# and the coefficients are ranked below as "importance" — both are only meaningful
# when the features share a common scale.
train_mean = X_train.mean()
train_std = X_train.std(ddof=0).replace(0, 1.0)  # constant column: avoid division by zero
X_train = (X_train - train_mean) / train_std
X_test = (X_test - train_mean) / train_std

# Initialize and train the logistic regression model
logreg = LogisticRegression()
logreg.fit(X_train, y_train)

# Predict on the test set
y_pred = logreg.predict(X_test)

# Evaluate the model
print("Classification Report:\n", classification_report(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))

# Calculate AUC-ROC from the predicted probability of the second class (column 1)
y_pred_prob = logreg.predict_proba(X_test)[:, 1]
roc_auc = roc_auc_score(y_test, y_pred_prob)
print(f"AUC-ROC: {roc_auc:.4f}")

# Feature importance (coefficients on the standardised features), sorted by magnitude
feature_importance = pd.DataFrame({'Feature': features, 'Coefficient': logreg.coef_[0]})
feature_importance['Abs_Coefficient'] = feature_importance['Coefficient'].abs()
feature_importance = feature_importance.sort_values(by='Abs_Coefficient', ascending=False)

print("\nFeature Importance based on Coefficients:")
print(feature_importance)

# Plot feature importance
plt.figure(figsize=(10, 6))
plt.bar(feature_importance['Feature'], feature_importance['Abs_Coefficient'])
plt.title('Feature Importance based on Logistic Regression Coefficients')
plt.xlabel('Feature')
plt.ylabel('Absolute Coefficient Value')
plt.show()

# Check for multicollinearity using Variance Inflation Factor (VIF).
# Computed on the full, unscaled feature frame with an added constant column;
# the row for that constant is not a feature.
X_with_constant = sm.add_constant(X)
vif_data = pd.DataFrame()
vif_data["Feature"] = X_with_constant.columns
vif_data["VIF"] = [variance_inflation_factor(X_with_constant.values, i) for i in range(X_with_constant.shape[1])]

print("\nVariance Inflation Factor (VIF) for Each Feature:")
print(vif_data)
