In [2]:
import os
from urllib.parse import quote_plus

import matplotlib.pyplot as plt
import pandas as pd
import scipy.stats as stats
import seaborn as sns
from scipy.stats import pearsonr
from sqlalchemy import create_engine

# Database connection details.
# Every value can be overridden through an environment variable of the same
# name, so real credentials never have to be written into the notebook. The
# fallbacks are the values this notebook used before.
DB_NAME = os.environ.get("DB_NAME", "student_mental_health")
DB_USER = os.environ.get("DB_USER", "postgres")
DB_PASSWORD = os.environ.get("DB_PASSWORD", "postgres")
DB_HOST = os.environ.get("DB_HOST", "localhost")
DB_PORT = os.environ.get("DB_PORT", "5432")

# Create database engine.
# User name and password are URL-escaped: characters such as '@', ':' or '/'
# would otherwise be parsed as part of the URL structure.
engine = create_engine(
    f'postgresql://{quote_plus(DB_USER)}:{quote_plus(DB_PASSWORD)}'
    f'@{DB_HOST}:{DB_PORT}/{DB_NAME}')

# Query to load data from the table
query = "SELECT * FROM student_mental_health"

# Load data into pandas DataFrame
df = pd.read_sql(query, engine)

In [3]:
# Preview the first rows, the summary statistics and the number of missing
# values per column of the freshly loaded table.
for overview in (df.head(), df.describe(), df.isna().sum()):
    print(overview)

# Discard every row that contains at least one missing value
# (dropna defaults: axis=0, how='any').
df = df.dropna()

# Confirm that no missing values are left.
missing_after_cleaning = df.isna().sum()
print(missing_after_cleaning)

        Timestamp Choose your gender   Age What is your course?  \
0  8/7/2020 12:02             Female  18.0          Engineering   
1  8/7/2020 12:04               Male  21.0    Islamic education   
2  8/7/2020 12:05               Male  19.0                  BIT   
3  8/7/2020 12:06             Female  22.0                 Laws   
4  8/7/2020 12:13               Male  23.0         Mathemathics   

  Your current year of Study What is your CGPA? Marital status  \
0                     year 1        3.00 - 3.49             No   
1                     year 2        3.00 - 3.49             No   
2                     Year 1        3.00 - 3.49             No   
3                     year 3        3.00 - 3.49            Yes   
4                     year 4        3.00 - 3.49             No   

  Do you have Depression? Do you have Anxiety? Do you have Panic attack?  \
0                     Yes                   No                       Yes   
1                      No                  Yes  

In [4]:
# Renaming columns: map the verbose survey questions to short snake_case names.
COLUMN_RENAMES = {
    'Choose your gender': 'gender',
    'What is your course?': 'course',
    'Your current year of Study': 'year',
    'Age': 'age',
    'What is your CGPA?': 'cgpa',
    'Marital status': 'marital_status',
    'Do you have Depression?': 'depression',
    'Do you have Anxiety?': 'anxiety',
    'Do you have Panic attack?': 'panic_attack',
    'Did you seek any specialist for a treatment?': 'seek_any_specialist_for_treatment',
}

# Reassign instead of using inplace=True so the statement reads as a plain
# transformation of the frame.
df = df.rename(columns=COLUMN_RENAMES)

# DataFrame.info() writes its report itself and returns None; wrapping it in
# print() only appended a stray "None" line to the output.
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 100 entries, 0 to 100
Data columns (total 11 columns):
 #   Column                             Non-Null Count  Dtype  
---  ------                             --------------  -----  
 0   Timestamp                          100 non-null    object 
 1   gender                             100 non-null    object 
 2   age                                100 non-null    float64
 3   course                             100 non-null    object 
 4   year                               100 non-null    object 
 5   cgpa                               100 non-null    object 
 6   marital_status                     100 non-null    object 
 7   depression                         100 non-null    object 
 8   anxiety                            100 non-null    object 
 9   panic_attack                       100 non-null    object 
 10  seek_any_specialist_for_treatment  100 non-null    object 
dtypes: float64(1), object(10)
memory usage: 9.4+ KB
None


In [5]:

# Normalise the spelling of the study year so that variants such as
# 'year 1' and 'Year 1' end up as the same category ('Year 1').
year_lowercase = df['year'].str.lower()
df['year'] = year_lowercase.str.capitalize()

# Distinct category values of the gender and (normalised) year columns.
genders = df['gender'].unique()
years_of_study = df['year'].unique()


In [None]:
def _cgpa_midpoint(cgpa_range):
    """Return the midpoint of a CGPA range.

    Args:
        cgpa_range: Either a range string such as '3.00 - 3.49' or a value
            that is already a number.

    Returns:
        The mean of the lower and upper bound as a float. A numeric input is
        returned as float unchanged.

    Raises:
        ValueError: If a string does not contain two '-'-separated numbers.
    """
    # Another cell of this notebook overwrites df['cgpa'] with floats; accept
    # numbers too so this cell keeps working when it is run afterwards.
    if isinstance(cgpa_range, (int, float)):
        return float(cgpa_range)
    # Split once; float() itself ignores the blanks around each bound.
    lower, upper = cgpa_range.split('-')[:2]
    return (float(lower) + float(upper)) / 2


# Numeric CGPA (midpoint of the reported range) used for the per-gender averages
df['CGPA_gender'] = df['cgpa'].apply(_cgpa_midpoint).astype(float)

# DataFrame.info() prints its report itself and returns None, so it is not
# wrapped in print() (which would add a stray "None" line).
df.info()

In [7]:
# Boolean masks marking the students who answered 'Yes' to each question.
has_anxiety = df['anxiety'] == 'Yes'
has_depression = df['depression'] == 'Yes'

# Absolute counts: all students, each condition, and both conditions at once.
total_students = df.shape[0]
anxiety_count = int(has_anxiety.sum())
depression_count = int(has_depression.sum())
both_count = int((has_anxiety & has_depression).sum())

# The same counts as a percentage of all students.
anxiety_count_percentage = anxiety_count / total_students * 100
depression_count_percentage = depression_count / total_students * 100
both_count_percentage = both_count / total_students * 100

In [None]:
# Build one record per study year that holds the mean CGPA of every gender,
# rounded to two decimals.
avg_cgpa = []

for year in years_of_study:
    in_year = df['year'] == year
    year_data = {'year': year}

    for gender in genders:
        gender_year_df = df[in_year & (df['gender'] == gender)]

        if not gender_year_df.empty:
            avg_cgpa_value = gender_year_df['CGPA_gender'].mean()
            year_data[f'{gender.lower()}_avg'] = float(f"{avg_cgpa_value:.2f}")
            print(f"Average CGPA for {gender} students: {avg_cgpa_value:.2f}")
        else:
            # Nothing to average for this gender/year combination.
            print(f"No data for {gender} students in year {year}")

    # Make sure both keys exist in every record; None marks "no data".
    for expected_key in ('female_avg', 'male_avg'):
        year_data.setdefault(expected_key, None)

    avg_cgpa.append(year_data)

avg_cgpa

In [None]:
# Histogram of the students' ages on a 10x10 inch figure.
fig, ax = plt.subplots(figsize=(10, 10))
ax.hist(df['age'], color='b')
ax.set_title("Age distribution");

In [None]:
# Pie chart with the share of each gender among the respondents.
# Count once and reuse the result for both the wedge sizes and the labels.
gender_counts = df['gender'].value_counts()

plt.figure(figsize=(12, 6))
plt.title("gender distribution")
# One explode offset per wedge: a fixed 2-tuple makes plt.pie raise as soon as
# the column holds a different number of categories.
g = plt.pie(gender_counts,
            explode=[0.025] * len(gender_counts),
            labels=gender_counts.index,
            colors=['skyblue', 'navajowhite'], autopct='%1.1f%%', startangle=180)
plt.legend()
plt.show()

In [None]:
# Display the gender column as the cell output.
df.loc[:, 'gender']

In [None]:
# Number of students per study year, split by gender.
plt.figure(figsize=(10, 10))
# Sort the year labels so the bars run from the first to the last year rather
# than in the order the values happen to appear in the data.
sns.countplot(data=df, x='year', hue='gender', order=sorted(df['year'].unique()))
plt.title("Students studying in particular year");

In [None]:
# Number of students per anxiety answer, split by their depression answer.
plt.figure(figsize=(10, 10))
sns.countplot(data=df, x='anxiety', hue='depression')
# The previous title was copied from the per-year plot and did not describe
# this chart.
plt.title("Depression by anxiety status")
plt.show()

In [None]:
# Horizontal bars: number of students per anxiety answer, split by gender.
plt.figure(figsize=(10, 10))
sns.set_theme(style="darkgrid")
ax = sns.countplot(data=df, y="anxiety", hue="gender")
ax.set_title("Anxiety by Gender")
plt.show()

In [None]:
# Number of students per anxiety answer, split by study year.
plt.figure(figsize=(10, 10))
sns.set_theme(style="darkgrid")
# Fix the hue order so the years appear from first to last in bars and legend
# instead of in order of first appearance in the data.
ax = sns.countplot(x="anxiety", hue="year", data=df,
                   hue_order=sorted(df['year'].unique()))
plt.title("Anxiety by study year")
plt.show()


In [None]:
# Number of students per depression answer, split by study year.
plt.figure(figsize=(10, 10))
sns.set_theme(style="darkgrid")
# Fix the hue order so the years appear from first to last in bars and legend
# instead of in order of first appearance in the data.
ax = sns.countplot(x="depression", hue="year", data=df,
                   hue_order=sorted(df['year'].unique()))
plt.title("Depression by study year")
plt.show()

### P-Value Analysis
###### We wanted to compute a correlation and its p-value between two numeric columns, but the age column was the only numeric column in the data set.
###### To obtain a second numeric column, we took the 'What is your CGPA?' column, whose values are ranges such as "3.00 - 3.49", and used a lambda function to replace each range with the mean of its two bounds.
###### After that, we converted the 'What is your CGPA?' column to float.
###### In the correlation analysis we compare the age and the CGPA of each student. The result is a very high p-value, which indicates that there is no statistically significant correlation between these two variables.

In [11]:
# Replace every CGPA range string (e.g. '3.00 - 3.49') by the midpoint of its
# two bounds. The conversion is skipped when the column is already numeric, so
# re-running this cell no longer fails on values that are not strings any more.
if not pd.api.types.is_numeric_dtype(df['cgpa']):
    # Split each range once; float() ignores the blanks around the bounds.
    cgpa_bounds = df['cgpa'].str.split('-')
    df['cgpa'] = cgpa_bounds.apply(
        lambda bounds: (float(bounds[0]) + float(bounds[1])) / 2).astype(float)

# Calculate the Pearson correlation between age and CGPA and its p-value
correlation, p_value = pearsonr(df['age'], df['cgpa'])

# Print the correlation and p-value
print("Correlation:", correlation)
print("P-value:", p_value)

Correlation: 0.00561008761560407
P-value: 0.955823000733376


In [12]:
# Chi-square test of independence between the anxiety and depression answers.
# The test runs on the cross-tabulated counts of the two columns.
contingency_table = pd.crosstab(index=df['anxiety'], columns=df['depression'])
test_result = stats.chi2_contingency(contingency_table)
chi2, p, dof, expected = test_result

report = f"Chi-square test results:\nChi2: {chi2}\nP-value: {p}\nDegrees of freedom: {dof}"
print(report)

Chi-square test results:
Chi2: 6.142876731112027
P-value: 0.013194249179498846
Degrees of freedom: 1


In [14]:
# Encode the Yes/No answers as 1/0 so they can be used in a correlation.
yes_no_to_int = {'Yes': 1, 'No': 0}
df['angst_numeric'] = df['anxiety'].map(yes_no_to_int)
df['depression_numeric'] = df['depression'].map(yes_no_to_int)

# Keep only the rows with a CGPA below 3.
low_cgpa_df = df.loc[df['cgpa'] < 3]

# Pearson correlation between anxiety and CGPA within that subset.
correlation, p_value = pearsonr(low_cgpa_df['angst_numeric'], low_cgpa_df['cgpa'])

print(f"Korrelationskoeffizient (Angst vs. schwache CGPA): {correlation:.2f}")
print(f"P-Wert: {p_value:.4f}")

# Interpretation at the 5 % significance level.
is_significant = p_value < 0.05
print("Der Zusammenhang ist statistisch signifikant." if is_significant
      else "Es gibt keinen statistisch signifikanten Zusammenhang.")

Korrelationskoeffizient (Angst vs. schwache CGPA): 0.32
P-Wert: 0.4035
Es gibt keinen statistisch signifikanten Zusammenhang.


In [15]:
# Keep only the rows with a CGPA below 3.
below_three = df['cgpa'] < 3
low_cgpa_df = df[below_three]

# Pearson correlation between depression and CGPA within that subset.
correlation, p_value = pearsonr(low_cgpa_df['depression_numeric'],
                                low_cgpa_df['cgpa'])

print(f"Korrelationskoeffizient (Depression vs. schwache CGPA): {correlation:.2f}")
print(f"P-Wert: {p_value:.4f}")

# Interpretation at the 5 % significance level.
verdict = ("Der Zusammenhang ist statistisch signifikant."
           if p_value < 0.05
           else "Es gibt keinen statistisch signifikanten Zusammenhang.")
print(verdict)

Korrelationskoeffizient (Depression vs. schwache CGPA): 0.64
P-Wert: 0.0650
Es gibt keinen statistisch signifikanten Zusammenhang.


In [16]:
# Keep only the rows with a CGPA above 3.
above_three = df['cgpa'] > 3
high_cgpa_df = df[above_three]

# Pearson correlation between depression and CGPA within that subset.
correlation, p_value = pearsonr(high_cgpa_df['depression_numeric'],
                                high_cgpa_df['cgpa'])

print(f"Korrelationskoeffizient (Depression vs. starke CGPA): {correlation:.2f}")
print(f"P-Wert: {p_value:.4f}")

# Interpretation at the 5 % significance level.
verdict = ("Der Zusammenhang ist statistisch signifikant."
           if p_value < 0.05
           else "Es gibt keinen statistisch signifikanten Zusammenhang.")
print(verdict)

Korrelationskoeffizient (Depression vs. starke CGPA): -0.18
P-Wert: 0.0899
Es gibt keinen statistisch signifikanten Zusammenhang.


In [17]:
 
# Precondition: the DataFrame 'df' with the relevant columns already exists.

# Keep only the rows with a CGPA above 3.
high_cgpa_df = df.loc[df['cgpa'] > 3]

# Pearson correlation between anxiety and CGPA within that subset.
correlation, p_value = pearsonr(high_cgpa_df['angst_numeric'], high_cgpa_df['cgpa'])

print(f"Korrelationskoeffizient (Angst vs. starke CGPA): {correlation:.2f}")
print(f"P-Wert: {p_value:.4f}")

# Interpretation at the 5 % significance level.
is_significant = p_value < 0.05
print("Der Zusammenhang ist statistisch signifikant." if is_significant
      else "Es gibt keinen statistisch signifikanten Zusammenhang.")

Korrelationskoeffizient (Angst vs. starke CGPA): 0.03
P-Wert: 0.7982
Es gibt keinen statistisch signifikanten Zusammenhang.


In [18]:
# Pearson correlation between anxiety and CGPA over all students.
anxiety_cgpa_result = pearsonr(df['angst_numeric'], df['cgpa'])
correlation, p_value = anxiety_cgpa_result

print(f"Korrelationskoeffizient (Angst vs. CGPA): {correlation:.2f}")
print(f"P-Wert: {p_value:.4f}")

# Interpretation at the 5 % significance level.
verdict = ("Der Zusammenhang ist statistisch signifikant."
           if p_value < 0.05
           else "Es gibt keinen statistisch signifikanten Zusammenhang.")
print(verdict)

Korrelationskoeffizient (Angst vs. CGPA): 0.16
P-Wert: 0.1164
Es gibt keinen statistisch signifikanten Zusammenhang.


In [19]:
# Pearson correlation between depression and CGPA over all students.
depression_cgpa_result = pearsonr(df['depression_numeric'], df['cgpa'])
correlation, p_value = depression_cgpa_result

print(f"Korrelationskoeffizient (Depression vs. CGPA): {correlation:.2f}")
print(f"P-Wert: {p_value:.4f}")

# Interpretation at the 5 % significance level.
is_significant = p_value < 0.05
print("Der Zusammenhang ist statistisch signifikant." if is_significant
      else "Es gibt keinen statistisch signifikanten Zusammenhang.")

Korrelationskoeffizient (Angst vs. CGPA): 0.01
P-Wert: 0.8942
Es gibt keinen statistisch signifikanten Zusammenhang.
