## Library Import

In [None]:
#Importing required libraries
#Data Exploration
import numpy as np
import pandas as pd

#Data Visualisation
import matplotlib.pyplot as plt
import seaborn as sns

#Ignore warnings
import warnings
warnings.filterwarnings('ignore')

## Import Project File (Colab only)

In [None]:
## If you are using Colab as your notebook use the following section for file access
# from google.colab import files
# uploaded = files.upload()

## Data Exploration

### Read File using Pandas

In [None]:
# Read the student performance CSV file and store it as a DataFrame.
# NOTE(review): the relative '../input/...' path presumably follows a Kaggle-style
# input directory; adjust it when running in another environment.
stud_df = pd.read_csv('../input/students-performance-in-exams/StudentsPerformance.csv')

In [None]:
# Print each column's dtype and non-null count, plus the number of rows and
# columns, to check for missing values.
stud_df.info()

In [None]:
# Rename the columns whose names contain special characters ('/' or ' ')
# to snake_case identifiers. 'gender' and 'lunch' are already clean, so they
# are simply left out of the mapping.
stud_df = stud_df.rename(
    columns={
        'race/ethnicity': 'social_group',
        'parental level of education': 'parent_education',
        'test preparation course': 'test_prep_course',
        'math score': 'math_score',
        'reading score': 'read_score',
        'writing score': 'write_score',
    }
)

In [None]:
# Display the column labels to confirm the rename
stud_df.columns

#### Observation
1. A total of **1000** student observations are collected against **8 features**: **5** object variables and **3** integer variables.
2. The dataset has **no missing values**.





In [None]:
# Preview the first five rows of the renamed frame
stud_df.head(5)

### Explore Categorical Variables

In [None]:
# Summarise every categorical (object-dtype) column with its distinct values.
# The rows are collected in a list and the frame is built once, instead of
# growing a DataFrame row by row with .loc inside the loop.
# NOTE: the result uses a default 0..n-1 index rather than the position of each
# column inside stud_df.
unique_rows = []
for col in stud_df.select_dtypes(include='object').columns:
    # Drop NaN and cast to str so ", ".join never receives a non-string value.
    distinct_values = stud_df[col].dropna().astype(str).unique()
    unique_rows.append({'VariableName': col,
                        'UniqueValues': ", ".join(distinct_values)})
# Passing `columns` keeps both headers even when there are no object columns.
unique_df = pd.DataFrame(unique_rows, columns=['VariableName','UniqueValues'])

In [None]:
# Show the distinct values of each categorical variable; widen the column
# display limit to 100 characters so the joined value lists are not cut short.
pd.set_option('display.max_colwidth', 100)
unique_df

#### Observation
It is observed that unique value for variables,

|VariableName|UniqueValues|
|--------|---------|
|gender	|female, male|
|social_group|group B, group C, group A, group D, group E|
|parent_education|bachelor's degree, some college, master's degree, associate's degree, high school, some high school|
|lunch|standard, free/reduced|
|test_prep_course|none, completed|


### Explore Numerical Variables

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns
stud_df.describe()

#### Observation

|VariableName|Min|Max|Mean|
|:-----|:----:|:----:|:----:|
|math score|0|100|66.09|
|reading score|17|100|69.17|
|writing score|10|100|68.05|

## Data Visualisation

### Common Functions for Visualisation

In [None]:
def _annotated_boxplot(cat_variable, variable, subset, ax, title):
    """Draw a boxplot of ``variable`` per category of ``cat_variable`` on ``ax``
    and append the number of rows of each category to its y tick label."""
    boxplt = sns.boxplot(x=variable, y=cat_variable, data=subset, ax=ax)
    boxplt.set(title=title)
    # Count the rows per category once; compare as strings because the tick
    # labels are read back as text.
    group_sizes = subset[cat_variable].astype(str).value_counts()
    new_labels = [
        tick.get_text() + '(' + str(group_sizes.get(tick.get_text(), 0)) + ')'
        for tick in ax.get_yticklabels(which='major')
    ]
    boxplt.set_yticklabels(labels=new_labels, fontsize=10)


def cat_num_plot(cat_variable,variable,df,sp_value=None):
    """Plot a numerical column against a categorical column.

    The first panel overlays one distribution curve per category (with a
    legend identifying the categories). The remaining panel(s) are boxplots
    whose y tick labels carry the number of rows in each category.

    Parameters
    ----------
    cat_variable : str
        Name of the categorical column in ``df``.
    variable : str
        Name of the numerical column in ``df``.
    df : pandas.DataFrame
        Frame holding both columns.
    sp_value : number, optional
        Setpoint. When omitted, a single boxplot over all rows is drawn
        (2 panels). When given, two boxplots are drawn (3 panels): rows with
        ``variable >= sp_value`` and rows with ``variable <= sp_value``. Both
        comparisons are inclusive, so rows equal to the setpoint appear in both.

    Returns
    -------
    None
        The figure is displayed with ``plt.show()``.
    """
    split_at_setpoint = sp_value is not None
    _, axes = plt.subplots(figsize=(20,6), ncols=3 if split_at_setpoint else 2, nrows=1)

    # General comparison: one distribution per category on the first panel.
    # NOTE(review): sns.distplot is deprecated in recent seaborn releases; it is
    # kept here so the rendered output does not change.
    for uniq in df[cat_variable].unique():
        sns.distplot(df[df[cat_variable]==uniq][variable], label=uniq, ax=axes[0])
    # Title and legend are set on the axes directly so they do not depend on
    # the loop above having run at least once.
    axes[0].set(title=variable+' vs '+cat_variable)
    axes[0].legend(title=cat_variable)

    if not split_at_setpoint:
        _annotated_boxplot(cat_variable, variable, df, axes[1],
                           variable+' vs '+cat_variable)
    else:
        # Greater than or equal to setpoint value comparison
        _annotated_boxplot(cat_variable, variable, df[df[variable] >= sp_value], axes[1],
                           variable+' >='+str(sp_value)+' vs '+cat_variable)
        # Less than or equal to setpoint value comparison
        _annotated_boxplot(cat_variable, variable, df[df[variable] <= sp_value], axes[2],
                           variable+' <='+str(sp_value)+' vs '+cat_variable)
    plt.subplots_adjust(left=None, bottom=0.1, right=None, top=0.9, wspace=0.4, hspace=0.4)
    plt.show()

### Univariate Analysis Categorical Variable

In [None]:
# Split the column labels by dtype: int64 columns vs. object (string) columns
numerical_cols = stud_df.select_dtypes(include='int64').columns
categorical_cols = stud_df.select_dtypes(include='object').columns

In [None]:
# One count plot per categorical column, stacked vertically on a single figure.
plt.figure(figsize=(20,20))
n_panels = len(categorical_cols)
for panel_no, cat_col in enumerate(categorical_cols, start=1):
    plt.subplot(n_panels, 1, panel_no)
    plots = sns.countplot(x=cat_col, data=stud_df)
    for bar in plots.patches:
        # Write the count inside each bar with Matplotlib's annotate:
        #   x anchor: horizontal centre of the bar
        #   y anchor: half of the bar height
        #   xytext=(0, 8): nudge the text 8 points upward from the anchor
        #   ha / va: horizontal and vertical alignment of the text
        count_text = format(bar.get_height(), '.0f')
        anchor = (bar.get_x() + bar.get_width() / 2, bar.get_height() / 2)
        plots.annotate(count_text, anchor,
                       ha='center', va='center',
                       size=14, xytext=(0, 8),
                       textcoords='offset points')
    plt.title('Count Plot for '+cat_col)
# The spacing is identical for every panel, so it is applied once after the loop.
plt.subplots_adjust(left=None, bottom=0.1, right=None, top=0.9,
                    wspace=0.4, hspace=0.4)
plt.show()

#### Observation
1. Female students strength is more in the dataset, **518** students are *female* and **482** students are *male*
2. *Group c* type students are **less** in ethnicity and *Group b* is more.
3. Very **few** parents hold a *Master's Degree* (about **6%**), while **most** parents have *some college* or an *associate's degree* (about **22%** per category)
4. Most of the students opted for *Standard* lunch methodology, around **64.5%** students
5. Very less had completed *Test Preparation Course*, it is around **35.8%** and remaining **64.2%** haven't taken the preparation.

In [None]:
# Boxplot visualisation of the numeric columns on one 8x8 inch axes
fig, ax = plt.subplots(figsize=(8,8))
sns.boxplot(data=stud_df, ax=ax)

### Bivariate Analysis

#### Categorical and Numerical Variable

For bivariate analysis, all numerical variables (**Math score, Write score and Read score**) are compared against the categorical variables (**Gender, Social Group, Test Preparation Course and Parent Education**). In the following bivariate visualisations, each score is split into two groups around a setpoint value. The setpoint is **50**: every score variable has a maximum of **100**, so half of that maximum is used for the initial classification.

In [None]:
# Renamed score columns iterated over by the bivariate plots below
score = ['math_score','read_score','write_score']

In [None]:
# Gender vs. each score variable, split around a setpoint of 50
for score_col in score:
    cat_num_plot(cat_variable='gender', variable=score_col, df=stud_df, sp_value=50)

In [None]:
# Race/ethnicity (social_group) vs. each score variable, split around a setpoint of 50
for score_col in score:
    cat_num_plot(cat_variable='social_group', variable=score_col, df=stud_df, sp_value=50)

In [None]:
# Test preparation course vs. each score variable, split around a setpoint of 50
for score_col in score:
    cat_num_plot(cat_variable='test_prep_course', variable=score_col, df=stud_df, sp_value=50)

In [None]:
# Parental education vs. each score variable, split around a setpoint of 50
for score_col in score:
    cat_num_plot(cat_variable='parent_education', variable=score_col, df=stud_df, sp_value=50)

#### Observation
**Gender/Race/Testpreparation and Score Variables**:

|score_name|setpoint_value|Remarks|
|:-----|:-----:|-----|
|math score|Greater than/equal to 50|Male students performed well in math exam equally as female students.<br /><br />Maximum male students scored around **80** marks. **4 male** and **3 female** scored *100 marks* in math score.<br /><br />**Group A** and **Group E** had performed well. 
|math score|Less than/equal to 50|Maximum Students whose mark is less than 50 are mostly in range of **35-50** score, below that range more female students are there (**22**) and few male students (**4**).<br /><br />**Group C** and **Group B** showed *bad* performance
|read score|Greater than/equal to 50|Female students performed well in reading compared to male.<br /><br />Score above **80** marks if we visualise, female students are **double** in count of male students.<br /><br />**14 female** and **3 male** scored *100 marks* in reading score.<br /><br />**Group E** and **Group D** had performed well
|read score|Less than/equal to 50|Maximum Students whose mark is less than 50 are mostly in range of **40-50** score.<br /><br />Minimum score is **17** marks, nobody receive zero mark.<br /><br />**Group B** showed *bad* performance compared to other group.
|write score|Greater than/equal to 50|Female students performed well in writing compared to male.<br /><br />Score above **80** marks if we visualise, female students are **triple** in count of male students.<br /><br />**13 female** and **only 1 male** scored *100 marks* in writing score. 
|write score|Less than/equal to 50|Maximum Students whose mark is less than 50 are mostly in range of **40-50** score.<br /><br />Minimum score is **10** marks, nobody receive zero mark.

#### Categorical vs Categorical Variable

In [None]:
# Count of each remaining categorical variable, with one panel per gender.
# sns.catplot is a figure-level function that creates its own figure on every
# call, so no separate plt.figure() is opened here (it would only render as an
# extra blank figure).
for col in categorical_cols:
    if col == 'gender':
        continue  # gender is the facet variable, not a plotted category
    grid = sns.catplot(x=col,col='gender',kind='count',data=stud_df,palette='tab10')
    grid.set_xticklabels(rotation=90)
    grid.set_xlabels(visible=False)
plt.show()

### Multivariate Analysis

In [None]:
# Pairwise relationships between the numeric columns, coloured by gender
sns.pairplot(stud_df,hue='gender')

## New Variables (Feature Engineering)

In [None]:
## Feature Engineering
# Overall score: mean of the three subject scores, used to evaluate overall performance
stud_df['overall_avg_score'] = stud_df[['math_score','read_score','write_score']].mean(axis=1)

## Overall performance category
# Right-closed bins: [0, 40] Poor, (40, 50] Below Average,
# (50, 80] Above Average, (80, 100] Excellent
score_bins = [0,40,50,80,100]
performance_bins=['Poor','Below Average','Above Average','Excellent']

# include_lowest=True closes the first bin on the left; without it an average
# of exactly 0 falls outside (0, 40] and is labelled NaN.
stud_df['overall_performance'] = pd.cut(stud_df["overall_avg_score"],bins = score_bins, labels = performance_bins, right=True, include_lowest=True)

In [None]:
# Count plot of the overall performance categories, with each bar's count
# written just above its top edge.
plots = sns.countplot(x='overall_performance', data=stud_df)
for bar in plots.patches:
    bar_top_centre = (bar.get_x() + bar.get_width() / 2, bar.get_height())
    plots.annotate(
        format(bar.get_height(), '.0f'),
        bar_top_centre,
        ha='center', va='center',
        size=12,
        xytext=(2, 4), textcoords='offset points',
    )
plt.title('Count Plot for Overall Student performace')
plt.show()

In [None]:
# Overall average score by gender; no setpoint is passed, so cat_num_plot draws
# the distribution panel and a single boxplot over all rows
cat_num_plot('gender','overall_avg_score',stud_df)

In [None]:
# Number of students in each overall performance category, one panel per gender
sns.catplot(x='overall_performance',col='gender',kind='count',data=stud_df,palette='tab10')

## Observed Key Points:
1. Overall pass performance is very good: almost **70%** of both male and female students scored well in the exams.
2. Students who completed the **Test preparation course** achieved the highest pass results in all exams, and those among them who failed still scored close to the pass mark, in the range of 40 to 50. Most students who scored **below 50** had **not completed** the test preparation course.
3. **Female** students performed very well in the Read and Write exams, but in **Math a few students require training**. **Boys** are nearly competitive with the girls but need **more attention towards their Read and Write scores**.
4. Students who scored below the pass range could improve with a little push or extra coaching and by working through the test preparation course, so that performance may **increase** in the upcoming year.

## Suggestions
From the above analysis at various levels, students are categorised into four performance levels:
*  Excellent (>80)
*  Above Average (50-80)
*  Below Average (40-50)
*  Poor (<40)

#### Following are the suggestions observed from the dataset:

1. Keep promoting the importance of the **Test Preparation Course** to students; it helps to identify their level and evaluate their skills.
2. Create competition groups such as **On The Spot/Quiz activities** with team members from **various ethnicity groups**, because this helps students interact with different groups and share their thoughts through different competitions.
3. Collect **feedback** from students who scored around **40-50**, observe their small gaps, and help them improve their performance based on their answers.
4. For those who scored **below 50 marks**, suggest extra coaching or insist that they work on the test preparatory course, because students who enrolled in and completed the test preparatory course performed very well.
5. Distribute gifts/rewards to students who scored **100**; it encourages their performance at the next level.

## ------------------------Keep in touch and share your comments--------------
## -----Will further provide updates related to Model Building and Prediction-----