In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Collect the full path of every file below the Kaggle input directory, then list them.
input_files = [
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
]
for path in input_files:
    print(path)

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Load the raw student-performance records and preview the first rows.
df = pd.read_csv(
    "/kaggle/input/students-performance-in-exams/StudentsPerformance.csv"
)
df.head()

**Data Assessment**

In [None]:
df.shape # (number of rows, number of columns) of the raw dataset

In [None]:
df.info() # summary of every column in the raw dataset (dtype and non-null count)

In [None]:
df.isnull().sum() # number of null values in each column

**Data Cleaning and add new features**

Cleaning steps:
* Renaming race/ethnicity as race
* Renaming parental level of education as parent_education
* Renaming test preparation course as test_preparation_course
* Renaming math score ,reading score ,writing score as maths_score,reading_score and writing_score respectively

New Columns to be added in the dataset
*  Total Score of each student
*  Percentage of each student
*  Pass/Fail status of each student

In [None]:
# Create a copy of the original dataset; the cleaning steps operate on `df_clean`
# so that the raw frame `df` is left as loaded.
df_clean=df.copy() 
df_clean.head()

In [None]:
# Map the raw CSV headers to shorter snake_case column names.
column_renames = {
    "race/ethnicity": "race",
    "parental level of education": "parent_education",
    "test preparation course": "test_preparation_course",
    "math score": "maths_score",
    "reading score": "reading_score",
    "writing score": "writing_score",
}
df_clean = df_clean.rename(columns=column_renames)
df_clean.head()


In [None]:
# Total score of each student: the sum of the three subject marks.
df_clean['Total_score'] = (
    df_clean['maths_score']
    .add(df_clean['writing_score'])
    .add(df_clean['reading_score'])
)
df_clean.info()

In [None]:
# Percentage of each student: Total_score divided by the three subjects.
df_clean['percentage'] = df_clean['Total_score'].div(3)
df_clean.info()

In [None]:
#pass/fail result of each student
def result(percentage, pass_mark=33):
    """Classify a student as passed or failed from their overall percentage.

    Parameters
    ----------
    percentage : float
        Overall percentage of the student.
    pass_mark : float, default 33
        Minimum percentage (inclusive) required to pass. The default keeps
        the threshold used throughout this notebook.

    Returns
    -------
    str
        "PASS" if ``percentage >= pass_mark``, otherwise "FAIL". A NaN
        percentage compares false and is therefore reported as "FAIL".
    """
    return "PASS" if percentage >= pass_mark else "FAIL"
    
# Label every student PASS/FAIL from their overall percentage.
df_clean['result'] = df_clean['percentage'].apply(result)
# DataFrame.info() prints its report itself and returns None, so wrapping it in
# print() would emit a stray "None" line after the summary.
df_clean.info()

In [None]:
df_clean.head()  # preview the cleaned frame

*  What is the structure of your dataset?

    This dataset contains the marks scored by students in the United States in three subjects: maths, reading and writing. Other features are also included, such as the parental level of education, whether or not the student completed the test preparation course, and the race/ethnicity and gender of the student. New features were also added to support the analysis: the total score, the percentage and the final result of the student.
      

*  What is/are the main feature(s) of interest in your dataset?

    The features that can help to carry out this investigation are gender, race, parent_education, the marks in all the subjects and the final result of the student.

Various questions that can be addressed in the analysis:
*  Which gender performs better in each of the tests?
*  Does parental education play an important role in determining the scores of students?
*  Which class or group of society performs better in the exams?
*  Which other factors, such as lunch or completion of the test preparation course, affect the students' scores?
*  What are the average marks in all subjects and how many students have actually passed or failed?

**Data Visualization**

 **Average marks in all three subjects which are maths,reading and writing**

In [None]:
def hist_box(subject):
    """Print the mean of a score column and draw its histogram beside its box plot.

    `subject` is the name of a column in the global `df_clean` frame.
    """
    marks = df_clean[subject]
    axis_label = "{sub}".format(sub=subject)
    heading = "Distribution of {sub}".format(sub=subject)

    print("Average in {sub} is".format(sub=subject), str(marks.mean()))

    _, (hist_ax, box_ax) = plt.subplots(1, 2, figsize=(10, 4), dpi=100)

    # Left panel: how often each range of marks occurs.
    hist_ax.hist(marks)
    hist_ax.set_xlabel(axis_label)
    hist_ax.set_ylabel("Frequency of marks")
    hist_ax.set_title(heading)

    # Right panel: spread of the same marks as a filled box plot.
    box_ax.boxplot(marks, patch_artist=True)
    box_ax.set_xlabel(axis_label)
    box_ax.set_ylabel("Marks")
    box_ax.set_title(heading)

    plt.show()
    
# Histogram and box plot of the marks in each of the three subjects.
for subject in ("maths_score", "reading_score", "writing_score"):
    hist_box(subject)

**Analysis of other categorical variables**

In [None]:
# Function to plot a horizontal bar chart of the category frequencies of a variable.
def categorical_plot(variable):
    """Print and plot how many students fall into each category of `variable`.

    Parameters
    ----------
    variable : str
        Name of a categorical column of the global `df_clean` frame.
    """
    plt.figure(figsize=(6,4), dpi=80)
    count1 = df_clean.groupby(variable).size().sort_values()
    print(count1)
    # Plot from a tidy frame holding only the aggregated counts. Passing the
    # aggregated vectors alongside data=df_clean mixes inputs of different
    # lengths, which seaborn >= 0.13 rejects with a ValueError.
    frequencies = count1.reset_index(name="frequency")
    sns.barplot(y=variable, x="frequency", data=frequencies)
    plt.xlabel("Frequency")
    plt.ylabel("{vari}".format(vari=variable))
    plt.title(variable)
    plt.show()

# Frequency plots for parent level of education, race/ethnicity, and whether
# or not the student practiced the test preparation course.
for column in ("parent_education", 'race', 'test_preparation_course'):
    categorical_plot(column)

**Analysis of Results of Students**

In [None]:
# Share of students who passed and who failed, printed and drawn as a pie chart.
plt.figure(figsize=(8,4), dpi=80)

total_students = df_clean.shape[0]
pass_student = df_clean['result'].eq('PASS').sum()*100/total_students
fail_student = df_clean['result'].eq('FAIL').sum()*100/total_students

print("Percentage of students who have passed", pass_student)
print("Percentage of students who have failed", fail_student)

plt.pie(
    [pass_student, fail_student],
    labels=['Pass','Fail'],
    autopct='%1.1f%%',
    shadow=True,
    explode=(0.1,0.2),  # pull both wedges out of the centre
);
plt.title("Pass/Fail Percentage");

The main features of interest in this dataset are gender, race, parental level of education and whether or not the student completed the test preparation course. These are the important factors in deciding how the students perform in their examinations. In the univariate exploration there was no need to apply any transformations. Also, we can observe that the average marks in each subject lie around 65-70 and that the majority of students have passed, i.e. reached an overall percentage of at least 33.

**Analysis of marks scored by each gender in each subject**

In [None]:
    # Box plots of the marks in each subject split by gender, one panel per
    # subject; the per-gender mean of each subject is printed as well.
    panels = [
        ('maths_score', "Maths Score Average based on Gender", "Maths Score according to gender"),
        ('reading_score', "Reading Score Average based on Gender", "Reading Score according to gender"),
        ('writing_score', "Writing Score Average based on Gender", "writing score according to gender"),
    ]

    plt.figure(figsize=(14,5), dpi=100)
    for position, (column, heading, title) in enumerate(panels, start=1):
        plt.subplot(1, 3, position)
        print(heading)
        print(df_clean.groupby('gender')[column].mean())
        print()
        sns.boxplot(x='gender', y=column, data=df_clean)
        plt.title(title)

    plt.show()

As we can see from the above plots, male students on average score higher in maths than female students, whereas the average marks obtained by female students are higher than those obtained by male students in both reading and writing.

**Correlation between Scores in different Subjects**

In [None]:
# Scatter plot with a fitted regression line for every pair of subjects.
score_pairs = [
    ('maths_score', 'reading_score', "Maths Score", "Reading Score"),
    ('maths_score', 'writing_score', "Maths Score", "Writing Score"),
    ('reading_score', 'writing_score', "Reading Score", "Writing Score"),
]

plt.figure(figsize=(14,6), dpi=100)
for position, (x_col, y_col, x_label, y_label) in enumerate(score_pairs, start=1):
    plt.subplot(1, 3, position)
    sns.regplot(x=x_col, y=y_col, data=df_clean)
    plt.xlabel(x_label)
    plt.ylabel(y_label)
    plt.title(x_label + " V/S " + y_label)
plt.show()

It can be concluded from the above plots that the scores in all subjects are positively correlated with each other: students with higher marks in one subject tend to have higher marks in the other subjects as well.

**Distribution of Scores Based on Race/Ethnicity**

In [None]:
    # Mean score per race/ethnicity group, printed and drawn as one point plot
    # per subject (groups ordered by ascending mean).
    plt.figure(figsize=(14,5), dpi=100)
    color=sns.color_palette()[0]

    panels = [
        ('maths_score', "Maths Score Average based on Race/Ethnicity", "Maths Score according to race"),
        ('reading_score', "Reading Score Average based on Race/Ethnicity", "Reading Score according to race"),
        ('writing_score', "Writing Score Average based on Race/Ethnicity", "Writing score according to race"),
    ]
    for position, (column, heading, title) in enumerate(panels, start=1):
        plt.subplot(1, 3, position)
        count = df_clean.groupby('race')[column].mean().sort_values()
        print(heading)
        print(count)
        print()
        # Plot from a tidy frame holding only the aggregated means. Passing the
        # aggregated vectors alongside data=df_clean mixes inputs of different
        # lengths, which seaborn >= 0.13 rejects with a ValueError.
        sns.pointplot(x='race', y=column, data=count.reset_index(), color=color)
        plt.title(title)

    plt.show()

We can see from the plots that students belonging to group E tend to perform best in all subjects, followed by group D and then groups C, B and A. There is a noticeable difference between the marks of students of group E and group A.

**Distribution of scores in all subjects based on test-preparation course**

In [None]:
    # Mean score by test preparation course status, printed and drawn as one
    # bar plot per subject.
    plt.figure(figsize=(14,5), dpi=100)
    color=sns.color_palette()[0]

    panels = [
        ('maths_score', "Maths Score V/s test_preparation_course", "Maths Score V/s test_preparation_course"),
        ('reading_score', "Reading Score Average V/s test_preparation_course", "Reading Score V/s test_preparation_course"),
        ('writing_score', "Writing Score V/s test_preparation_course", "Writing score V/s test_preparation_course"),
    ]
    for position, (column, heading, title) in enumerate(panels, start=1):
        plt.subplot(1, 3, position)
        count = df_clean.groupby('test_preparation_course')[column].mean().sort_values()
        print(heading)
        print(count)
        print()
        # Plot from a tidy frame holding only the aggregated means. Passing the
        # aggregated vectors alongside data=df_clean mixes inputs of different
        # lengths, which seaborn >= 0.13 rejects with a ValueError.
        sns.barplot(x='test_preparation_course', y=column, data=count.reset_index(), color=color)
        plt.title(title)

    plt.show()
    

The conclusion that can be drawn from above plot is that students who have taken the test_preparation course tend to perform better than those students who have not taken the test preparation course.

In this section of the exploration various interesting observations can be drawn out. Firstly, female students tend to perform better in reading and writing whereas male students tend to perform better in maths. Secondly, scores in all subjects tend to have a positive correlation with each other. We can also see a gap in scores between students belonging to each group/race, and scores are also impacted by whether or not the student has taken the test preparation course.

**Average scores in each subject based on gender and race**

In [None]:
    # Mean score for every gender/race combination, printed and drawn as one
    # point plot per subject with one line per race group.
    plt.figure(figsize=(14,5), dpi=100)
    color=sns.color_palette()[0]  # NOTE: not used by the plots in this cell

    panels = [
        ('maths_score', "Maths Score by gender and race", "Maths Score by gender and race"),
        ('reading_score', "Reading Score Average by gender and race", "Reading Score by gender and race"),
        ('writing_score', "Writing Score by gender and race", "Writing score by gender and race"),
    ]
    for position, (column, heading, title) in enumerate(panels, start=1):
        plt.subplot(1, 3, position)
        count = (
            df_clean
            .groupby(['gender','race'], as_index=False)[column]
            .mean()
            .sort_values(by=column)
        )
        print(heading)
        print(count)
        print()
        sns.pointplot(x='gender', y=column, hue='race', data=count)
        plt.title(title)
        plt.legend(loc=2)

    plt.show()
    

The conclusion that can be drawn from the above plots is that females belonging to group E tend to perform better in reading and writing as compared to males belonging to group E. Males from all groups perform better in maths as compared to females of the same group. The order of scores obtained can be summarized as follows: Group E > Group D > Group C > Group B > Group A

**Average Scores by test preparation course and Pass/Fail Result**

In [None]:
# Score distribution by pass/fail result, split by test preparation course:
# one violin plot per subject, with the group means printed alongside.
plt.figure(figsize=(14,5), dpi=100)
color=sns.color_palette()[0]  # NOTE: not used by the plots in this cell

panels = [
    ('maths_score', "Maths Score by test course and result", "Maths Score by test course and result"),
    ('reading_score', "Reading Score Average by test course and result", "Reading Score by test course and result"),
    ('writing_score', "Writing Score by test course and result", "Writing score by test course and result"),
]
for position, (column, heading, title) in enumerate(panels, start=1):
    plt.subplot(1, 3, position)
    count = (
        df_clean
        .groupby(['test_preparation_course','result'], as_index=False)[column]
        .mean()
        .sort_values(by=column)
    )
    print(heading)
    print(count)
    print()
    sns.violinplot(x='result', y=column, hue='test_preparation_course',
                   data=df_clean, inner='quartile', split=True)
    plt.title(title)
    plt.legend(loc=2)

plt.show()
    

As can be seen from the above plots, no student who completed the test preparation course has failed. Only students who didn't take up the test preparation course have failed in their examinations. The test preparation course seems to be a deciding factor in determining the result of the student. It can also be noted that students who have taken the test preparation course tend to have scored higher marks in all subjects.

**Total Score and percentage by result and parental education**

In [None]:
# Mean total score (left) and mean percentage (right) for every combination of
# parent education and pass/fail result.
plt.figure(figsize=(14,5), dpi=100)
group_keys = ['parent_education','result']

plt.subplot(1,2,1)
count = (
    df_clean
    .groupby(group_keys, as_index=False)['Total_score']
    .mean()
    .sort_values(by='Total_score')
)
print("Results by total score and parent education", count, "", sep="\n")
sns.barplot(x='result', y='Total_score', hue='parent_education', data=count);
plt.title("Results by total score and parent education");

plt.subplot(1,2,2)
count = (
    df_clean
    .groupby(group_keys, as_index=False)['percentage']
    .mean()
    .sort_values(by='percentage')
)
print("", "Results by Percentage and parent education", count, sep="\n")
sns.barplot(x='result', y='percentage', hue='parent_education', data=count);
plt.title("Results by Percentage and parent education");

It can be seen from the above plots that students whose parents have completed a master's or bachelor's degree have not failed in the examinations, and the highest total score is obtained by students whose parents have completed a master's degree, followed by students whose parents have completed a bachelor's degree. The lowest total score and percentage have been recorded by students whose parents have gone to some high school or just completed high school.

**Conclusion**

From the above analysis carried out on Student performance data it can be concluded that:

* Male gender tends to perform better in subject maths and female gender tends to perform better in other subjects.  

* Group E students tend to perform better in all subjects as compared to other groups and Group A students              performance is the lowest among all other groups. 

* Out of 1000 students only 15 students have failed making the fail percentage as 1.5% and pass percentage 98.5%.

* Test preparation course also plays a major role in deciding the result of the student. Every student who has completed the test preparation course has passed; all the students who failed had not taken the test preparation course.

* Parent education can also be considered an important criterion in determining the results of the students. Students whose parents have completed a master's or bachelor's degree have not failed. The lowest pass percentage is recorded by students whose parents have just gone to some high school.