In [None]:
import pandas as pd 
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt
from matplotlib.ticker import StrMethodFormatter
import seaborn as sns
import os

<b> Objective </b> : Develop at least three insights and present your findings in the simplest way possible (with the correct visualization). An example of a good insight would be showing that students have a much higher level of stress when they are in their pre-final or final year.


## Loading dataset

In [None]:
# List every file available under the Kaggle input directory.
input_files = [
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
]
for path in input_files:
    print(path)

In [None]:
# Read the student mental-health CSV from the Kaggle input directory into a DataFrame.
df = pd.read_csv('/kaggle/input/student-mental-health/Student Mental health.csv')

## Information about dataset

In [None]:
# (number of rows, number of columns) of the raw data
df.shape

In [None]:
# Summary statistics of the raw data
df.describe()

In [None]:
# Print column names, non-null counts and dtypes (info is a method, so it must be called).
df.info()

In [None]:
# Preview the first five rows of the raw data
df.head(5)

## Data Preprocessing 

In [None]:
# dropna() with its defaults removes every row that has at least one missing value
df1=df.dropna() #dropping null values

In [None]:
# Shape after dropping rows with missing values
df1.shape

In [None]:
# Rename the long question-style headers to short column names.
# rename() returns a new frame; reassigning it avoids mutating df1 with inplace=True.
df1 = df1.rename(columns={
    'Choose your gender': 'gender',
    'What is your course?': 'course',
    'Your current year of Study': 'Year',
    'What is your CGPA?': 'CGPA',
    'Do you have Depression?': 'depression',
    'Do you have Anxiety?': 'anxiety',
    'Do you have Panic attack?': 'panic attack',
    'Did you seek any specialist for a treatment?': 'specialist treatment',
})
df1.head()

In [None]:
# Remove the 'Timestamp' column; every other column is carried over unchanged.
df2 = df1.drop(columns=['Timestamp'])
df2.head()

In [None]:
# Distinct labels present in the 'Year' column
df2['Year'].unique()

In [None]:
# Distinct labels present in the 'CGPA' column
df2['CGPA'].unique()

In [None]:
# Remove whitespace at the beginning and end of each CGPA label.
# The vectorised .str accessor replaces apply+lambda and leaves missing values
# as NaN instead of raising on them.
df2['CGPA'] = df2['CGPA'].str.strip()

In [None]:
#converting float values in Age column to int
# (astype(int) would raise on missing values — assumes none remain in 'Age'; TODO confirm)
df2['Age'] = df2['Age'].astype(int) 

In [None]:
# Build df3: add a numeric 'year' column derived from the 'Year' labels,
# then drop the original text column.
def _year_number(label):
    """Return the integer found in the second space-separated token of a 'Year' label."""
    return int(label.split(' ')[1])


df3 = (
    df2.assign(year=df2['Year'].map(_year_number))  # new column is appended last
       .drop(columns=['Year'])                      # old text column is no longer needed
)
df3.head()

In [None]:
# Distinct ages across all students
df3['Age'].unique()

In [None]:
# Subset of rows whose gender is exactly 'Male'.
is_male = df3['gender'].eq('Male')
df3_male = df3.loc[is_male]
df3_male.head()

In [None]:
# Distinct ages among the male students
df3_male['Age'].unique()

In [None]:
# Subset of rows whose gender is exactly 'Female'.
is_female = df3['gender'].eq('Female')
df3_female = df3.loc[is_female]
df3_female.head()

In [None]:
# Distinct ages among the female students (counterpart of the male check).
df3_female['Age'].unique()

## Data Visualisation and Insights 

In [None]:
# Pie chart showing the percentage of male and female students.
gender_share = df3['gender'].value_counts()

plt.figure(figsize=(12, 6))
plt.title("Visual distribution of gender in pie chart")
g = plt.pie(gender_share, labels=gender_share.index, autopct='%1.2f%%')
plt.legend()
plt.show()

# Absolute head-count per gender, largest group first.
gender_totals = df3.groupby('gender')['gender'].count().sort_values(ascending=False)
print('Number of male and female: \n', gender_totals)

From the pie chart we can infer that the ratio of female students to male students is roughly 3:1.

In [None]:
# Count plots showing the number of students of each age, split by
# gender / depression / anxiety / panic attack (one panel per split).
age_panels = [
    ('gender', 'Distribuition of students agewise'),
    ('depression', 'Distribuition of depression among students agewise'),
    ('anxiety', 'Distribuition of anxiety among students agewise'),
    ('panic attack', 'Distribuition of panic attack among students agewise'),
]

fig, axes = plt.subplots(2, 2, figsize=(15, 15))
# axes.flat walks the grid row by row: top-left, top-right, bottom-left, bottom-right.
for ax, (hue_column, panel_title) in zip(axes.flat, age_panels):
    sns.countplot(data=df3, x='Age', hue=hue_column, ax=ax)
    ax.set_title(panel_title)
# Render the figure explicitly so the cell does not echo a Text(...) repr.
plt.show()

From these graphs we can observe that:
- The largest number of students with depression are 18 years old, followed by 19, 24, 23, 20 and 22 year olds.
- The largest number of students with anxiety are 18 years old, followed by 24, 19, 20, 23 and 21 year olds.
- The largest number of students with panic attacks are 18 years old, followed by 24, 19, 23, 20, 21 and 22 year olds.

<br>We can conclude that 18-year-olds are more prone to depression, anxiety and panic attacks, and 21-year-olds are the least prone.

In [None]:
# Count plots showing the number of students in each year of study, split by
# gender / depression / anxiety / panic attack (one panel per split).
year_panels = [
    ('gender', 'Distribuition of students yearwise'),
    ('depression', 'Distribuition of depression among students yearwise'),
    ('anxiety', 'Distribuition of anxiety among students yearwise'),
    ('panic attack', 'Distribuition of panic attack among students yearwise'),
]

fig, axes = plt.subplots(2, 2, figsize=(15, 15))
# axes.flat walks the grid row by row: top-left, top-right, bottom-left, bottom-right.
for ax, (hue_column, panel_title) in zip(axes.flat, year_panels):
    sns.countplot(data=df3, x='year', hue=hue_column, ax=ax)
    ax.set_title(panel_title)
# Render the figure explicitly so the cell does not echo a Text(...) repr.
plt.show()

From these graphs we can observe that - 
- 1st year students are most depressed, followed by 2nd, 3rd, and 4th.
- 1st year students are most anxious, followed by 2nd, 3rd, and 4th.
- 1st year students are more prone to panic attacks, followed by 3rd, 2nd, and 4th.

We can conclude that the largest number of students are from 1st year and the fewest from 4th year. 
1st year students are more prone to depression, anxiety and panic attacks, whereas 4th year students are less prone.

In [None]:
# Count plots showing the number of students of each gender, split by
# gender / depression / anxiety / panic attack (one panel per split).
gender_panels = [
    ('gender', 'Distribuition of students genderwise'),
    ('depression', 'Distribuition of depression among students genderwise'),
    ('anxiety', 'Distribuition of anxiety among students genderwise'),
    ('panic attack', 'Distribuition of panic attack among students genderwise'),
]

fig, axes = plt.subplots(2, 2, figsize=(15, 15))
# axes.flat walks the grid row by row: top-left, top-right, bottom-left, bottom-right.
for ax, (hue_column, panel_title) in zip(axes.flat, gender_panels):
    sns.countplot(data=df3, x='gender', hue=hue_column, ax=ax)
    ax.set_title(panel_title)
# Render the figure explicitly so the cell does not echo a Text(...) repr.
plt.show()

From these graphs we can observe that - 
- Number of female students is greater than that of male students
- More than half of the female students have depression, but less than half of the male students have depression.
- More than half of the male students have anxiety, but less than half of the female students have anxiety.
- Male and female students are equally prone to panic attacks.

We can conclude that depression is more common among female students, and anxiety is more common among male students.

In [None]:
# Count plots showing the number of students in each CGPA band, split by
# gender / depression / anxiety / panic attack (one panel per split).
cgpa_panels = [
    ('gender', 'Distribuition of students according to CGPA'),
    ('depression', 'Distribuition of depression among students according to CGPA'),
    ('anxiety', 'Distribuition of anxiety among students according to CGPA'),
    ('panic attack', 'Distribuition of panic attack among students according to CGPA'),
]

fig, axes = plt.subplots(2, 2, figsize=(15, 15))
# axes.flat walks the grid row by row: top-left, top-right, bottom-left, bottom-right.
for ax, (hue_column, panel_title) in zip(axes.flat, cgpa_panels):
    sns.countplot(data=df3, x='CGPA', hue=hue_column, ax=ax)
    ax.set_title(panel_title)
# Render the figure explicitly so the cell does not echo a Text(...) repr.
plt.show()

From these graphs we can observe that - 
- more than half the students having CGPA of 3.00-3.49 have depression.
- maximum students facing anxiety have a CGPA of 3.50-4.00, followed by 3.00-3.49, and 2.50-2.99
- more than half of the students having CGPA of 3.50-4.00 have panic attacks.

We can conclude that students with a higher CGPA experience depression, anxiety, and panic attacks more than those with a lower CGPA.

In [None]:
def _answer_counts(column):
    """Count the rows of df3 per distinct value of `column`, largest count first."""
    return df3.groupby(column)[column].count().sort_values(ascending=False)


# Tallies for each condition and for specialist treatment.
depression_stats = _answer_counts('depression')
anxiety_stats = _answer_counts('anxiety')
panic_att_stats = _answer_counts('panic attack')
treatment_stats = _answer_counts('specialist treatment')

# The separator reproduces the spacing of printing '\n\n' between the tables.
print(depression_stats, anxiety_stats, panic_att_stats, treatment_stats, sep=' \n\n ')

In [None]:
# One figure with a 2x2 grid of pie charts: the share of each answer for
# depression, anxiety, panic attack and specialist treatment.
# Drawing all four on the axes of a single figure gives every pie its own grid
# cell and a single plt.show() renders them together.
pie_panels = [
    ('depression', 'Visual representation of students having depression'),
    ('anxiety', 'Visual representation of students having anxiety'),
    ('panic attack', 'Visual representation of students having panic attack'),
    ('specialist treatment', 'Visual representation of students taking specialist treatment'),
]

fig, axes = plt.subplots(2, 2, figsize=(14, 12))
for ax, (column, panel_title) in zip(axes.flat, pie_panels):
    answer_counts = df3[column].value_counts()
    ax.pie(answer_counts, autopct='%1.2f%%', labels=answer_counts.index)
    ax.set_title(panel_title)
    ax.legend()
fig.tight_layout()  # keep the long titles from overlapping neighbouring panels
plt.show()

These pie charts show the percentage of students having depression, anxiety, panic attacks and those who have taken specialist treatment.

## Label Encoding 

In [None]:
# Label encoding: in a fresh copy of df3, every cell equal to 'Yes' becomes 1
# and every cell equal to 'No' becomes 0 (applies to all columns).
df4 = df3.replace({'Yes': 1, 'No': 0})
df4.head()

In [None]:
# Pie chart showing the share of students per course before dimensionality reduction.
course_share = df4['course'].value_counts()
plt.pie(course_share, labels=course_share.index, autopct='%1.2f%%')
plt.title('Visual representation of students different courses')
plt.show()

In [None]:
# Number of students per course, largest course first.
students_per_course = df4.groupby('course')['course'].count()
course_stats = students_per_course.sort_values(ascending=False)
course_stats

In [None]:
# Courses that will form the 'other' category: those with at most 4 students.
# NOTE: despite the variable name, the cut-off is inclusive (<= 4).
course_stats_less_than_4 = course_stats.loc[course_stats.le(4)]
course_stats_less_than_4

In [None]:
# Relabel every rare course as 'other'. The rare course names are the index
# labels of course_stats_less_than_4, so membership is tested against that index.
is_rare_course = df4['course'].isin(course_stats_less_than_4.index)
df4['course'] = df4['course'].mask(is_rare_course, 'other')

In [None]:
# Preview df4 after the rare courses were relabelled
df4.head()

In [None]:
# One-hot encoding: build one indicator (dummy) column per distinct course value.
dummies = pd.get_dummies(df4['course'])
dummies.head()

In [None]:
# Replace the 'course' column with its indicator columns, appended on the right.
# The 'other' indicator is left out of the final frame.
course_indicators = dummies.drop(columns='other')
df5 = pd.concat([df4.drop(columns='course'), course_indicators], axis=1)
df5.head()

In [None]:
# Plot the percentage of students in each (grouped) course.
# Summing each indicator column gives the student count for that course, and the
# resulting index carries the matching course names, so every wedge is paired
# with its own label. (DataFrame.value_counts() counts unique rows sorted by
# frequency, which does not line up with the column order used for labels.)
course_counts = dummies.sum().sort_values(ascending=False)
plt.pie(course_counts, autopct='%1.2f%%', labels=course_counts.index)
plt.title('Visual representation of different courses of students')
plt.show()

## Conclusion 

- 18-year-olds are more prone to depression, anxiety and panic attacks, and 21-year-olds are the least prone.
- 1st year students are more prone to depression, anxiety and panic attacks, whereas 4th year students are less prone.
- Depression is more common among female students, and anxiety is more common among male students.
- Students with a higher CGPA experience depression, anxiety, and panic attacks more often than those with a lower CGPA.
- Caveat: the age and year comparisons are read from count plots, so they partly reflect how many students fall into each group rather than the rate within that group.