In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
import matplotlib.gridspec as gridspec

In [None]:
# Load the cleaned dataset from a pickle file into `df`, the frame every later cell reads.
# NOTE(review): unpickling can execute arbitrary code embedded in the file —
# only load pickles that come from a trusted source.
file_path = '../../data/adhd-beliefs-pt/adhd-beliefs-pt-cleaned.pkl'
df = pd.read_pickle(file_path)
# Last expression of the cell: display the first rows.
df.head()

In [None]:
# Maps the four open-ended text columns of `df` to the display names used in
# plot titles and legends. Insertion order determines panel/bar order in the
# plots that iterate over this dict.
column_descriptive_names = {
    'special_interest': 'Special Interest',
    'diary_entry': 'Diary Entry',
    'selfdefining_memory': 'Self-Defining Memory',
    'empty_sheet': 'Empty Sheet'
}

In [None]:
# Count non-empty entries and calculate percentages
def count_non_empty_entries(df, column):
    """Count non-missing values of ``column`` within each ADHD diagnosis group.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing an ``'adhd_diagnosis'`` column and ``column``.
    column : str
        Name of the column whose non-missing values are counted.

    Returns
    -------
    pandas.DataFrame
        One row per ``adhd_diagnosis`` value with the columns
        ``adhd_diagnosis``, ``count`` (non-missing values of ``column``),
        ``total`` (rows in the group) and ``percentage``
        (``count / total * 100`` rounded to one decimal).

    Notes
    -----
    Only missing values (NaN/None) are treated as empty; an empty string
    still counts as a response.
    """
    # A single named aggregation gives both numbers: 'count' skips missing
    # values, 'size' counts every row of the group.
    result = (
        df.groupby('adhd_diagnosis')[column]
        .agg(count='count', total='size')
        .reset_index()
    )
    result['percentage'] = (result['count'] / result['total'] * 100).round(1)
    return result
    
# One bar-chart panel per open-ended question (2x2 grid): for every ADHD
# diagnosis group, the number of non-missing answers, annotated with the
# share of the group that answered.
fig, axes = plt.subplots(2, 2, figsize=(15, 10))
fig.suptitle('Non-Empty Responses by ADHD Diagnosis Status', fontsize=18, fontweight='bold')

# Flatten the 2x2 axes array so panels can be addressed with a single index
axes = axes.flatten()

for i, column in enumerate(column_descriptive_names.keys()):
    counts = count_non_empty_entries(df, column)

    sns.barplot(
        x='adhd_diagnosis',
        y='count',
        data=counts,
        palette='rocket',
        hue='adhd_diagnosis',
        ax=axes[i]
    )

    # Annotate each bar with "count (percentage%)". The x coordinate is the
    # row position, not the DataFrame index label, so the annotations stay on
    # their bars even if `counts` does not carry a 0..n-1 index.
    # Assumes bars are drawn in the row order of `counts`.
    for j, (count, percentage) in enumerate(zip(counts['count'], counts['percentage'])):
        axes[i].text(
            j, count + 0.5,
            f"{count} ({percentage}%)",
            ha='center', va='bottom',
            fontsize=10
        )

    axes[i].set_title(f'{column_descriptive_names[column]}', fontsize=14, fontweight='bold')
    axes[i].set_xlabel('', fontsize=12)
    axes[i].set_ylabel('Count of Non-Empty Responses', fontsize=12)

    # Leave 20% headroom for the annotations; fall back to a fixed height when
    # nobody answered so the axis never collapses to (0, 0).
    max_count = counts['count'].max()
    axes[i].set_ylim(0, max_count + max_count * 0.2 if max_count > 0 else 1)

    # Rotate the existing tick labels in place instead of re-setting them via
    # set_xticklabels(get_xticklabels()), which matplotlib may warn about when
    # the axis does not use a fixed locator.
    for tick_label in axes[i].get_xticklabels():
        tick_label.set(rotation=45, ha='right')

    axes[i].grid(axis='y', linestyle='--', alpha=0.7)

# Reserve the top 5% of the figure for the suptitle
plt.tight_layout(rect=[0, 0, 1, 0.95])
plt.savefig('data/non_empty_responses_by_adhd_diagnosis.png', dpi=300, bbox_inches='tight')
plt.show()

In [None]:
# Grouped (side-by-side, not stacked) bar chart comparing the response rate
# of every open-ended question across ADHD diagnosis groups.

# Collect the per-question summaries and concatenate them once, instead of
# repeatedly concatenating onto an initially empty DataFrame.
response_frames = []
for column in column_descriptive_names.keys():
    data = count_non_empty_entries(df, column)
    data['question_type'] = column_descriptive_names[column]
    response_frames.append(data)
response_data = pd.concat(response_frames)

# Wide table: one row per diagnosis group, one percentage column per question
pivot_data = response_data.pivot(
    index='adhd_diagnosis',
    columns='question_type',
    values='percentage'
).reset_index()

# Create the axes explicitly and hand it to pandas. Calling plt.figure() and
# then DataFrame.plot() without `ax` leaves an extra, empty figure behind.
fig, ax = plt.subplots(figsize=(12, 8))
pivot_data.plot(
    x='adhd_diagnosis',
    y=[column_descriptive_names[col] for col in column_descriptive_names.keys()],
    kind='bar',
    stacked=False,
    width=0.7,
    colormap='rocket',
    ax=ax
)

# Customize the plot
ax.set_title('Response Rate by Question Type and ADHD Diagnosis', fontsize=16, fontweight='bold')
ax.set_xlabel('ADHD Diagnosis', fontsize=14)
ax.set_ylabel('Response Rate (%)', fontsize=14)
ax.grid(axis='y', linestyle='--', alpha=0.7)
ax.legend(title='Questions', fontsize=12, title_fontsize=12)
plt.xticks(rotation=45, ha='right')

# Label every bar with its percentage
for container in ax.containers:
    ax.bar_label(container, fmt='%.1f%%', fontsize=10)

# Save and show the plot
plt.tight_layout()
plt.savefig('data/response_rate_by_question_type.png', dpi=300, bbox_inches='tight')
plt.show()

In [None]:
# Display names for the four open-ended columns (also read by later cells)
variable_descriptions = {
    'special_interest': 'Special Interest',
    'diary_entry': 'Diary Entry',
    'selfdefining_memory': 'Self-Defining Memory',
    'empty_sheet': 'Empty Sheet'
}

# Missing answers per question, as an absolute count and as a share of all
# rows, both indexed by the descriptive names
missing_counts = (
    df[list(variable_descriptions)]
    .isnull()
    .sum()
    .rename(index=variable_descriptions)
)
missing_percentage = missing_counts / len(df) * 100

# One bar per question
plt.figure(figsize=(10, 6))
ax = sns.barplot(
    x=missing_counts.index,
    y=missing_counts.values,
    hue=missing_counts.index,
    palette='rocket'
)

# Write "count (percentage%)" just above each bar
for i, (value, percentage) in enumerate(zip(missing_counts, missing_percentage)):
    ax.text(
        i, value + 2,
        f'{value} ({percentage:.1f}%)',
        ha='center', va='bottom', fontsize=10, color='black'
    )

# Headroom so the annotations are not clipped
ax.set_ylim(0, missing_counts.max() + 15)

# Titles and labels
ax.set_title('Number of Missing Values per Open-Ended Question', fontsize=14, fontweight='bold')
ax.set_xlabel('', fontsize=12)
ax.set_ylabel('Number of Missing Values', fontsize=12)
plt.xticks(rotation=45, fontsize=10)
plt.tight_layout()
plt.savefig('data/missing_values.png', dpi=300, bbox_inches='tight')
plt.show()


In [None]:
from matplotlib.patches import Patch

# Heatmap of missing values: one row per observation, one column per
# open-ended question.
open_ended_columns = ['special_interest', 'diary_entry', 'selfdefining_memory', 'empty_sheet']
missing_cmap = plt.get_cmap('coolwarm')

plt.figure(figsize=(12, 8))
sns.heatmap(
    df[open_ended_columns].isnull(),
    cmap=missing_cmap,
    # Pin the colour scale to the boolean range. Without it, a frame that is
    # entirely missing (or entirely present) has vmin == vmax and every cell
    # is drawn in a single colour that may contradict the legend.
    vmin=0,
    vmax=1,
    cbar=False,
    linewidths=0.5,
    linecolor='gray'
)

# Add descriptive labels and title
plt.title('Heatmap of Missing Values for Open-Ended Questions', fontsize=16, fontweight='bold')
plt.xlabel('', fontsize=12)
plt.ylabel('Observations', fontsize=12)

# Replace variable names with descriptive text.
# `variable_descriptions` comes from the missing-values bar plot cell, and
# `descriptive_labels` is read again by the text-length boxplot cell.
descriptive_labels = [variable_descriptions.get(col, col) for col in open_ended_columns]
plt.xticks(ticks=[x + 0.5 for x in range(len(descriptive_labels))], labels=descriptive_labels, rotation=45, fontsize=10, ha='center')

# Take the legend swatches from the colormap endpoints so they match the
# colours actually drawn (plain 'blue'/'red' are not the coolwarm end colours).
legend_elements = [
    Patch(facecolor=missing_cmap(0.0), edgecolor='gray', label='Not Missing'),
    Patch(facecolor=missing_cmap(1.0), edgecolor='gray', label='Missing')
]
plt.legend(
    handles=legend_elements,
    loc='upper center',
    bbox_to_anchor=(0.5, -0.25),
    title='Legend',
    fontsize=10,
    ncol=2
)

# Adjust layout and save the figure
plt.tight_layout()
plt.savefig('data/missing_values_heatmap_pretty.png', dpi=300, bbox_inches='tight')
plt.show()

In [None]:
# Number of rows per ADHD diagnosis value, plus "<value> (<share>%)" labels
# in the same order.
adhd_counts = df['adhd_diagnosis'].value_counts()
adhd_labels = [
    f"{label} ({count / adhd_counts.sum() * 100:.1f}%)"
    for label, count in zip(adhd_counts.index, adhd_counts)
]

In [None]:
def _percentage_labels(counts):
    """Return "<category> (<share>%)" legend labels for a value-counts Series."""
    total = counts.sum()
    return [f"{label} ({count / total * 100:.1f}%)" for label, count in counts.items()]


def _draw_donut(ax, counts, labels, title):
    """Draw ``counts`` as a donut chart on ``ax`` with a legend to its right."""
    wedges = ax.pie(
        counts,
        startangle=90,
        colors=sns.color_palette('rocket', len(counts)),
        wedgeprops={'edgecolor': 'black', 'width': 0.4},  # width < 1 leaves the donut hole
    )[0]
    ax.set_title(title, fontsize=12, fontweight='bold')
    # Pass the wedges explicitly so each label is tied to its own wedge
    # instead of relying on the implicit artist order.
    ax.legend(wedges, labels, loc='upper left', bbox_to_anchor=(0.9, 0.5), fontsize=10)


# Category counts and legend labels for each demographic variable
sex_counts = df['sex'].value_counts()
sex_labels = _percentage_labels(sex_counts)
education_counts = df['education'].value_counts()
education_labels = _percentage_labels(education_counts)
occupation_counts = df['occupation'].value_counts()
occupation_labels = _percentage_labels(occupation_counts)
dialect_counts = df['dialect'].value_counts()
dialect_labels = _percentage_labels(dialect_counts)

# Create a figure with a uniform 2x2 grid layout
fig1 = plt.figure(figsize=(14, 10))
gs = gridspec.GridSpec(2, 2, figure=fig1)
fig1.suptitle('Demographic Data Distribution', fontsize=16, fontweight='bold')

# Donut chart for distribution of sex
ax1 = fig1.add_subplot(gs[0, 0])
_draw_donut(ax1, sex_counts, sex_labels, 'Distribution of Sex')

# Donut chart for distribution of education
ax2 = fig1.add_subplot(gs[0, 1])
_draw_donut(ax2, education_counts, education_labels, 'Distribution of Education')

# Donut chart for distribution of occupation
ax4 = fig1.add_subplot(gs[1, 0])
_draw_donut(ax4, occupation_counts, occupation_labels, 'Distribution of Occupation')

# Donut chart for distribution of dialect
ax5 = fig1.add_subplot(gs[1, 1])
_draw_donut(ax5, dialect_counts, dialect_labels, 'Distribution of Dialect')

# Reserve the top 5% of the figure for the suptitle
plt.tight_layout(rect=[0, 0, 1, 0.95])
plt.savefig('data/demographic_data_distribution.png', dpi=300, bbox_inches='tight')
plt.show()

In [None]:
# Print every ADHD diagnosis count next to its position in `adhd_counts`
for i, count in enumerate(adhd_counts.to_numpy()):
    print(i, count)

In [None]:
# Donut chart of the adhd_diagnosis variable
plt.figure(figsize=(10, 8))

# A wedge width below 1 leaves the hole in the middle
wedges, texts = plt.pie(
    adhd_counts,
    startangle=90,
    colors=sns.color_palette('rocket', len(adhd_counts)),
    wedgeprops=dict(edgecolor='black', width=0.4)
)

# Legend entries carry the share of each diagnosis value
legend_labels = [
    f"{label} ({count / adhd_counts.sum() * 100:.1f}%)"
    for label, count in adhd_counts.items()
]
plt.legend(
    wedges,
    legend_labels,
    loc='upper right',
    bbox_to_anchor=(1, 0.5),
    fontsize=12
)

# Title; equal aspect ratio keeps the donut circular
plt.title('Distribution of ADHD Diagnosis', fontsize=16, fontweight='bold')
plt.axis('equal')

# Save the figure
plt.tight_layout()
plt.savefig('data/adhd_diagnosis_distribution_donut.png', dpi=300, bbox_inches='tight')
plt.show()

In [None]:
# Age distribution shown twice side by side: histogram (left) and boxplot (right)
fig, axes = plt.subplots(1, 2, figsize=(16, 6))
hist_ax, box_ax = axes

# Both panels and the mean marker use the first colour of the 'rocket' palette
age_color = sns.color_palette('rocket', 1)[0]

# Left panel: histogram with a KDE overlay
sns.histplot(df['age'], bins=20, kde=True, color=age_color, ax=hist_ax)
hist_ax.set_title('Histogram', fontsize=14, fontweight='bold')
hist_ax.set_xlabel('Age', fontsize=12)
hist_ax.set_ylabel('Count', fontsize=12)
hist_ax.grid(axis='y', linestyle='--', alpha=0.7)

# Right panel: boxplot with the mean age marked by a dashed vertical line
sns.boxplot(x=df['age'], color=age_color, ax=box_ax)
mean_age = df['age'].mean()
box_ax.axvline(mean_age, color=age_color, linestyle='--', label=f'Mean: {mean_age:.1f}')
box_ax.set_title('Boxplot', fontsize=14, fontweight='bold')
box_ax.set_xlabel('Age', fontsize=12)
box_ax.grid(axis='y', linestyle='--', alpha=0.7)
box_ax.legend()

# Shared title; the top 5% of the figure is reserved for it
fig.suptitle('Age Distribution', fontsize=16, fontweight='bold')
plt.tight_layout(rect=[0, 0, 1, 0.95])
plt.savefig('data/combined_age_adhd_distribution.png', dpi=300, bbox_inches='tight')
plt.show()

In [None]:
# Horizontal boxplots, one per open-ended question, of the answer length
# (as returned by .str.len()), all drawn on a single axes
fig, ax = plt.subplots(figsize=(24, 8))
fig.suptitle('Text Length Distribution for Open-Ended Questions', fontsize=18, fontweight='bold')

# One length Series per question, in display order
text_lengths = [
    df[question].str.len()
    for question in ('special_interest', 'diary_entry', 'selfdefining_memory', 'empty_sheet')
]
sns.boxplot(
    data=text_lengths,
    palette=sns.color_palette('rocket', 4),
    orient='h',
    ax=ax
)

# Label each box with its descriptive name.
# `descriptive_labels` is defined in the missing-values heatmap cell.
ax.set_yticklabels(descriptive_labels, fontsize=12)
ax.set_xlabel('Text Length', fontsize=14)
ax.set_ylabel('', fontsize=14)
ax.grid(axis='x', linestyle='--', alpha=0.7)

# Reserve the top 5% of the figure for the suptitle, then save
plt.tight_layout(rect=[0, 0, 1, 0.95])
plt.savefig('data/text_length_distribution_combined_vertical_boxplots.png', dpi=300, bbox_inches='tight')
plt.show()

In [None]:
# Bar plot of the dataset dimensions (number of rows and number of columns).

# NOTE(review): set_theme changes the global seaborn/matplotlib style, so every
# figure drawn after this cell is affected as well.
sns.set_theme(style="whitegrid", context="talk")

values = [df.shape[0], df.shape[1]]  # Number of records and variables
categories = ['Records', 'Variables']

fig, ax = plt.subplots(figsize=(10, 6))
# Assign the x variable to `hue` as well: passing `palette` without `hue` is
# deprecated in seaborn 0.13, and the other bar plots in this notebook
# already follow this pattern.
sns.barplot(x=categories, y=values, hue=categories, palette='rocket', ax=ax)

# Add labels and title with improved font sizes and weights
ax.set_ylabel('Count', fontsize=16, fontweight='bold')
ax.set_xlabel('', fontsize=14)
ax.set_title('Number of Records and Variables', fontsize=18, fontweight='bold')

# Annotate the bars with values
for i, value in enumerate(values):
    ax.text(i, value + 2, f'{value}', ha='center', va='bottom', fontsize=14, color='black')

# Headroom so the annotations are not clipped
ax.set_ylim(0, max(values) + 15)

# Add a grid for better readability
ax.grid(axis='y', linestyle='--', alpha=0.7)

# Adjust layout, save and display the plot
plt.tight_layout()
plt.savefig('data/number_of_records_and_variables.png', dpi=300, bbox_inches='tight')
plt.show()


In [None]:
# Standardize every numeric column of `df` before running PCA
scaler = StandardScaler()
numeric_data = df.select_dtypes(include=[np.number])
df_scaled = scaler.fit_transform(numeric_data)

# Fit a PCA that keeps all components and read off the share of variance
# explained by each one
pca = PCA()
pca.fit(df_scaled)
explained_variance_ratio = pca.explained_variance_ratio_

# Scree plot: explained variance ratio against the 1-based component number
component_numbers = range(1, len(explained_variance_ratio) + 1)
plt.figure(figsize=(10, 6))
plt.plot(
    component_numbers,
    explained_variance_ratio,
    marker='o',
    linestyle='--',
    color='b'
)
plt.title('Scree Plot (Elbow Plot)', fontsize=16)
plt.xlabel('Principal Component', fontsize=14)
plt.ylabel('Explained Variance Ratio', fontsize=14)
plt.xticks(fontsize=12)
plt.yticks(fontsize=12)
plt.grid(alpha=0.5)
plt.savefig('data/scree_plot.png', dpi=300, bbox_inches='tight')
plt.show()


In [None]:
# Remove the demographic, diagnosis and open-ended text columns so that only
# the remaining columns enter the correlation analysis
columns_to_ignore = [
    'sex', 'adhd_diagnosis', 'age', 'education', 'occupation', 'dialect',
    'special_interest', 'diary_entry', 'selfdefining_memory', 'empty_sheet',
]
df_filtered = df.drop(columns_to_ignore, axis='columns')

# Maps column names of `df_filtered` to human-readable labels; used to rename
# the columns before the correlation heatmap is drawn.
# NOTE(review): the key 'forgetting_responsabilities' is spelled differently
# from its label — presumably it mirrors the dataframe column name; confirm
# before correcting it.
feature_map = {
    'forgetting_objects': 'Forgetting Objects',
    'forgetting_responsabilities': 'Forgetting Responsibilities',
    'emotion_management': 'Emotion Management',
    'emotion_reactions': 'Emotion Reactions',
    'emotion_choices': 'Emotion Choices',
    'emotion_intense_reaction': 'Intense Emotional Reactions',
    'strong_mood_swings': 'Strong Mood Swings',
    'control_mood_swings': 'Control Over Mood Swings',
    'mood_swings_outside_consequence': 'Mood Swings with Consequences',
    'justice_sense': 'Sense of Justice',
    'defend_beliefs': 'Defending Beliefs',
    'express_writing': 'Expressing Through Writing',
    'topic_change_involuntary': 'Involuntary Topic Changes',
    'related_topic': 'Related Topics',
    'topic_change_unrealized': 'Unrealized Topic Changes',
    'parallel_topic': 'Parallel Topics',
    'nonlinear_storytelling': 'Nonlinear Storytelling',
    'excessive_details': 'Excessive Details',
    'forgetting_mid_conversation': 'Forgetting Mid-Conversation',
    'talkback': 'Talkback',
    'talkback_authority': 'Talkback to Authority',
    'talkback_comfortable': 'Comfortable Talkback',
    'interrupting_involuntary': 'Involuntary Interrupting',
    'excited_opinion': 'Excited Opinions',
    'want_interrupt': 'Want to Interrupt',
    'want_interrupt_control': 'Control Over Interrupting',
    'called_out_expressing': 'Called Out for Expressing',
    'called_out_exalted': 'Called Out for Being Exalted',
    'called_out_talking_loud': 'Called Out for Talking Loudly',
    'called_out_nonlinear_talk': 'Called Out for Nonlinear Talk',
    'called_out_nonlinear_write': 'Called Out for Nonlinear Writing',
    'called_out_gestures': 'Called Out for Gestures',
    'called_out_talking': 'Called Out for Talking',
    'detailed_opinions': 'Detailed Opinions',
    'detailed_opinions_interest': 'Detailed Opinions on Interests',
    'detailed_opinions_people': 'Detailed Opinions on People',
    'detailed_opinions_misunderstood': 'Misunderstood Detailed Opinions',
    'swearing': 'Swearing',
    'swearing_anywhere': 'Swearing Anywhere',
    'swearing_casual': 'Casual Swearing',
    'talk_fast': 'Talking Fast',
    'talk_fast_unrealized': 'Unrealized Fast Talking',
    'talk_fast_uncontrolled': 'Uncontrolled Fast Talking',
    'called_out_talk_fast': 'Called Out for Talking Fast',
    'need_fast_talk_interest': 'Need for Fast Talking on Interests',
    'need_fast_talk_information': 'Need for Fast Talking on Information',
    'speaking_before_thinking': 'Speaking Before Thinking',
    'something_to_add': 'Something to Add',
    'something_to_add_timid': 'Timid Additions',
    'something_to_add_impulsive': 'Impulsive Additions'
}

# Swap the raw column names for their readable labels
df_filtered_renamed = df_filtered.rename(columns=feature_map)

# Pairwise correlations between the renamed columns
correlation_matrix = df_filtered_renamed.corr()

# Draw the matrix as a square-celled heatmap with a slightly shrunk colour bar
plt.figure(figsize=(14, 12))
heatmap = sns.heatmap(
    correlation_matrix,
    annot=False,
    cmap='coolwarm',
    square=True,
    cbar_kws={"shrink": .8}
)

# Title with extra padding above the plot
plt.title('Correlation Heatmap of Filtered Data', fontsize=20, pad=20, weight='bold', color='black')

# Tick label size on both axes
plt.xticks(fontsize=12)
plt.yticks(fontsize=12)

# Adjust layout and save the plot
plt.tight_layout()
plt.savefig('data/correlation_heatmap_pretty.png', dpi=300, bbox_inches='tight')
plt.show()

In [None]:
# Horizontal boxplots of age, one box per ADHD diagnosis value
plt.figure(figsize=(10, 6))
sns.boxplot(
    data=df,
    x='age',
    y='adhd_diagnosis',
    hue='adhd_diagnosis',
    palette='rocket'
)

# Title and axis labels; labelpad moves each label away from its axis
plt.title('Age Distribution by ADHD Diagnosis', fontsize=16, fontweight='bold')
plt.xlabel('Age', fontsize=14, labelpad=20)
plt.ylabel('', fontsize=14, labelpad=30)

# Tick label sizes and vertical guide lines
plt.xticks(fontsize=12)
plt.yticks(fontsize=12)
plt.grid(axis='x', linestyle='--', alpha=0.7)

plt.tight_layout()
plt.savefig('data/age_distribution_adhd_diagnosis.png', dpi=300, bbox_inches='tight')
plt.show()


In [None]:
# Side-by-side donut charts of ADHD diagnosis for male and female respondents.

# Diagnosis counts per sex (both Series are read again by the next cell)
male_data = df[df['sex'] == 'Masculino']['adhd_diagnosis'].value_counts()
female_data = df[df['sex'] == 'Feminino']['adhd_diagnosis'].value_counts()

# "<diagnosis> (<share>%)" legend labels, in the same order as the counts
male_labels = [f"{label} ({count / male_data.sum() * 100:.1f}%)" for label, count in male_data.items()]
female_labels = [f"{label} ({count / female_data.sum() * 100:.1f}%)" for label, count in female_data.items()]

# Give every diagnosis value one fixed colour shared by both charts.
# value_counts() orders by frequency, so colouring by position could show the
# same diagnosis in different colours for males and females.
diagnosis_categories = list(dict.fromkeys([*male_data.index, *female_data.index]))
diagnosis_colors = dict(zip(
    diagnosis_categories,
    sns.color_palette('rocket', len(diagnosis_categories))
))

# Create a figure with two subplots
fig, axes = plt.subplots(1, 2, figsize=(14, 7))

donut_specs = [
    (axes[0], male_data, male_labels, 'ADHD Diagnosis (Males)'),
    (axes[1], female_data, female_labels, 'ADHD Diagnosis (Females)'),
]
for donut_ax, sex_data, sex_legend_labels, donut_title in donut_specs:
    wedges = donut_ax.pie(
        sex_data,
        startangle=90,
        colors=[diagnosis_colors[category] for category in sex_data.index],
        wedgeprops={'edgecolor': 'black', 'width': 0.4},  # Donut effect
    )[0]
    donut_ax.set_title(donut_title, fontsize=14, fontweight='bold')
    # Explicit handles tie each label to its own wedge
    donut_ax.legend(wedges, sex_legend_labels, loc='upper left', bbox_to_anchor=(1, 0.5), fontsize=10)

# Adjust layout, save and display the plot
plt.tight_layout()
plt.savefig('data/adhd_diagnosis_by_sex.png', dpi=300, bbox_inches='tight')
plt.show()


In [None]:
# Horizontal grouped bars: share of each ADHD diagnosis value within the male
# and the female group. Reads `male_data` / `female_data` from the previous cell.

# Calculate percentages within each gender group
male_percentages = (male_data / male_data.sum() * 100).round(1)
female_percentages = (female_data / female_data.sum() * 100).round(1)

# One row per diagnosis value, one column per gender
gender_adhd_percent_df = pd.DataFrame({
    'Masculino': male_percentages,
    'Feminino': female_percentages
})

# Create the axes explicitly and pass it to pandas. Calling plt.figure() first
# and then DataFrame.plot() without `ax` opened a second figure, so the
# intended (12, 8) size was never applied and an empty figure was left behind.
fig, ax = plt.subplots(figsize=(12, 8))
gender_adhd_percent_df.plot(kind='barh', color=sns.color_palette('rocket', 2), ax=ax)

# Add title and labels
ax.set_title('ADHD Diagnosis Status by Gender (Percentage)', fontsize=16, fontweight='bold')
ax.set_ylabel('ADHD Diagnosis Status', fontsize=14)
ax.set_xlabel('Percentage (%)', fontsize=14)

# Customize grid and rotation of labels
ax.grid(axis='x', linestyle='--', alpha=0.7)
plt.yticks(rotation=0)  # No rotation needed for horizontal bars

# Add value labels on each bar
for container in ax.containers:
    ax.bar_label(container, fmt='%.1f%%', fontsize=10, padding=3)

# Place the legend outside the plot area, to the right
ax.legend(title='Gender', fontsize=12, title_fontsize=14, loc='center left', bbox_to_anchor=(1.0, 0.5))

# Adjust layout and save
plt.tight_layout()
plt.savefig('data/adhd_diagnosis_by_gender_percentage_barplot_horizontal.png', dpi=300, bbox_inches='tight')
plt.show()