### Content: 

1. [Load and Check Data](#1)
2. [Variable Description](#2)
3. [Missing Values](#3)
4. [Outlier Detection](#4)
5. [What I Wonder](#5)
    * [Score difference between male and female students](#5.4)
    * [The relationship between parental education level and student scores](#5.5)
    * [Male student and female student parental education level](#5.1)
    * [Whether the race/ethnicity go to the test preparation course](#5.2)
6. [Conclusion](#6) 

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns
import matplotlib.pyplot as plt
from collections import Counter
%matplotlib inline
# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

from plotly.offline import init_notebook_mode, iplot, plot
import plotly as py
init_notebook_mode(connected=True)
import plotly.graph_objs as go

from wordcloud import WordCloud


import warnings
warnings.filterwarnings('ignore') 
import os
# Print the full path of every file found under the Kaggle input directory.
for folder, _subfolders, file_names in os.walk('/kaggle/input'):
    for file_name in file_names:
        print(os.path.join(folder, file_name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

<a id='1'></a>
## Load and Check Data

In [None]:
# Load the students-performance dataset.
data = pd.read_csv('/kaggle/input/students-performance-in-exams/StudentsPerformance.csv')

# Iterating a DataFrame yields its column labels, so the word cloud is built
# from the column names rather than from the cell values.
column_text = ' '.join(data.columns)

plt.subplots(figsize=(8, 8))
wordcloud = WordCloud(background_color='white', width=512, height=384).generate(column_text)

# Render the cloud without axes, save it to disk, then display it.
plt.imshow(wordcloud)
plt.axis('off')
plt.savefig('graph.png')
plt.show()


In [None]:
# Print a concise summary of the DataFrame: columns, non-null counts and dtypes.
data.info()

In [None]:
# Show the column labels of the dataset.
data.columns

In [None]:
# Show descriptive statistics (count, mean, std, quartiles, ...) of the dataset.
data.describe()

In [None]:
# Preview the first rows of the dataset.
data.head()

In [None]:
# List the distinct values in the 'race/ethnicity' column.
data['race/ethnicity'].unique()

In [None]:
# List the distinct values in the 'lunch' column.
data['lunch'].unique()

In [None]:
# List the distinct values in the 'parental level of education' column.
data['parental level of education'].unique()

In [None]:
# List the distinct values in the 'test preparation course' column.
data['test preparation course'].unique()

In [None]:
# Heatmap of the pairwise correlations between the numeric columns.
plt.figure(figsize=(10, 6))
plt.title('Correlation')

# Correlate only the numeric columns: the dataset also contains text columns
# (gender, lunch, ...), and pandas >= 2.0 raises when DataFrame.corr() is
# called on non-numeric data. select_dtypes works on older pandas as well.
numeric_data = data.select_dtypes(include='number')
sns.heatmap(numeric_data.corr(), annot=True, cmap='viridis', linecolor='white')

plt.xticks(rotation=90)
plt.yticks(rotation=90)
plt.show()

All scores are positively correlated: students who score well in one subject tend to score well in the others.
* +1 == perfect positive linear relationship
* -1 == perfect negative linear relationship
* 0 == no linear relationship

<a id='2'></a>
## Variable Description

* gender: gender of the student
* parental level of education: education level of the student's parents
* lunch: price level of students' lunches.
* test preparation course : whether students attend exam preparation courses
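* math score: the student's score on the math exam
* reading score: the student's score on the reading exam
* writing score: the student's score on the writing exam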
* race/ethnicity: student's race / ethnicity class

<a id='3'></a>
## Missing Values

In [None]:
# Count the missing (null) values in each column.
data.isnull().sum()

#### We don't have any missing values.

<a id='4'></a>
## Outlier Detection

In [None]:
def outlier_detection(df,columns):
    """Return the index labels of rows that are outliers in more than one column.

    A value counts as an outlier for a column when it lies below
    ``Q1 - 1.5 * IQR`` or above ``Q3 + 1.5 * IQR`` of that column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the data to inspect.
    columns : iterable
        Labels of the numeric columns to check.

    Returns
    -------
    list
        Index labels flagged as outliers in at least two of ``columns``.
    """
    outlier_indices = []
    for x in columns:
        Q1 = np.percentile(df[x], 25)
        Q3 = np.percentile(df[x], 75)
        IQR = Q3 - Q1
        # Outlier step
        outlier_step = IQR * 1.5
        # Both sides of the test must be boolean masks over the same column.
        is_outlier = (df[x] < Q1 - outlier_step) | (df[x] > Q3 + outlier_step)
        outlier_indices.extend(df[is_outlier].index)

    # Count across ALL inspected columns, not just the last one, so that rows
    # flagged in several columns can be recognised.
    outlier_counts = Counter(outlier_indices)

    multiple_outliers = [index for index, count in outlier_counts.items() if count > 1]

    return multiple_outliers


In [None]:
# Select the rows whose index labels outlier_detection returns for the three score columns.
data.loc[outlier_detection(data,['math score','reading score','writing score'])]

#### Good for us we don't have any outliers.

<a id='5'></a>
## What I Wonder


<a id='5.4'></a>
### Score difference between male and female students

In [None]:
# Three views of the math score split by gender.

# Bar chart of math score per gender.
plt.figure(figsize=(10, 6))
sns.barplot(data=data, x='gender', y='math score')
plt.xticks(rotation=90)
plt.xlabel('Gender')
plt.ylabel('Math Score')
plt.title('Score difference between male and female students')
plt.show()

# Violin plot of the math score distribution per gender.
plt.figure(figsize=(10, 6))
sns.violinplot(data=data, x='gender', y='math score', split=True)
sns.despine(left=True)
plt.show()

# Swarm plot showing the individual math scores per gender.
sns.swarmplot(data=data, x='gender', y='math score')
plt.show()

In [None]:
# Three views of the reading score split by gender.

# Bar chart of reading score per gender.
plt.figure(figsize=(10, 6))
sns.barplot(data=data, x='gender', y='reading score')
plt.xticks(rotation=90)
plt.xlabel('Gender')
plt.ylabel('Reading Score')
plt.title('Score difference between male and female students')
plt.show()

# Violin plot of the reading score distribution per gender.
plt.figure(figsize=(10, 6))
sns.violinplot(data=data, x='gender', y='reading score', split=True)
sns.despine(left=True)
plt.show()

# Swarm plot showing the individual reading scores per gender.
sns.swarmplot(data=data, x='gender', y='reading score')
plt.show()

In [None]:
# Three views of the writing score split by gender.

# Bar chart of writing score per gender.
plt.figure(figsize=(10, 6))
sns.barplot(data=data, x='gender', y='writing score')
plt.xticks(rotation=90)
plt.xlabel('Gender')
plt.ylabel('Writing Score')
plt.title('Score difference between male and female students')
plt.show()

# Violin plot of the writing score distribution per gender.
plt.figure(figsize=(10, 6))
sns.violinplot(data=data, x='gender', y='writing score')
plt.show()

# Swarm plot showing the individual writing scores per gender.
sns.swarmplot(data=data, x='gender', y='writing score')
plt.show()

In [None]:
# (column, legend label, marker color) for each box trace, in display order.
box_specs = [
    ('math score', 'Math Score', 'rgb(12,12,140)'),
    ('writing score', 'Writing Score', 'rgb(12, 128, 128)'),
    ('reading score', 'Reading Score', 'rgb(12, 105, 130)'),
]

# Build one box trace per score column and render them side by side.
data2 = [
    go.Box(y=data[column], name=label, marker={'color': color})
    for column, label, color in box_specs
]
trace0, trace1, trace2 = data2
iplot(data2)

On average, female students score higher than male students in every subject except mathematics.

<a id='5.5'></a>
### The relationship between parental education level and student scores


In [None]:
# Mean math score for each parental education level. sort=False keeps the
# levels in order of first appearance, matching the order unique() would give.
mean_math_by_education = (
    data.groupby('parental level of education', sort=False)['math score'].mean()
)
parental_education_list = list(mean_math_by_education.index)
math_scores = mean_math_by_education.tolist()

data2 = pd.DataFrame({'parental education list': parental_education_list, 'math_score': math_scores})
# Order the levels from lowest to highest mean score for plotting.
sorted_data2 = data2.sort_values('math_score', ascending=True)

plt.figure(figsize=(15, 10))
sns.barplot(x=sorted_data2['parental education list'], y=sorted_data2['math_score'])

plt.xlabel('Parental Level of Education')
plt.ylabel('Math Scores')
plt.title("The relationship between parental education level and student scores")
plt.show()

Students' average math scores tend to increase with their parents' level of education.

<a id='5.1'></a>
### Male student and female student parental education level

In [None]:
# Reload the dataset so this cell works even if `data` was rebound elsewhere.
data = pd.read_csv('/kaggle/input/students-performance-in-exams/StudentsPerformance.csv')
data_female = data[data['gender'] == 'female']
data_male = data[data['gender'] == 'male']

# Count students per parental education level for each gender. Reindexing both
# counts on the same list of levels keeps the grouped bars aligned and fills
# levels missing for one gender with zero.
education_levels = data['parental level of education'].unique()
female_counts = (
    data_female['parental level of education']
    .value_counts()
    .reindex(education_levels, fill_value=0)
)
male_counts = (
    data_male['parental level of education']
    .value_counts()
    .reindex(education_levels, fill_value=0)
)

trace1 = go.Bar(
    x=female_counts.index.tolist(),
    y=female_counts.tolist(),
    name='Female',
    marker=dict(color='rgba(255,174,255,0.5)', line=dict(color='rgb(0,0,0)', width=0.01)),
)

trace2 = go.Bar(
    x=male_counts.index.tolist(),
    y=male_counts.tolist(),
    name='Male',
    # Three-component color, so it is spelled rgb(...) rather than rgba(...).
    marker=dict(color='rgba(255,255,128,0.5)', line=dict(color='rgb(0,0,0)', width=0.01)),
)

# Keep the traces in their own name so the `data` DataFrame is not overwritten.
traces = [trace1, trace2]
layout = go.Layout(barmode='group')
fig = go.Figure(data=traces, layout=layout)
iplot(fig)



<a id='5.2'></a>
### Whether the race/ethnicity go to the test preparation course

In [None]:
# Reload the dataset so this cell works even if `data` was rebound elsewhere.
data = pd.read_csv('/kaggle/input/students-performance-in-exams/StudentsPerformance.csv')

# Number of students per race/ethnicity group among those whose
# 'test preparation course' value is 'none'. Values and labels come from the
# same counts, so every slice is paired with the right group.
none_counts = (
    data.loc[data['test preparation course'] == 'none', 'race/ethnicity']
    .value_counts()
    .sort_index()
)

fig = {
    'data': [
        {
            'values': none_counts.tolist(),
            'labels': none_counts.index.tolist(),
            'domain': {'x': [0, .5]},
            'hoverinfo': 'label+percent',
            'hole': 0.3,
            'type': 'pie'
        },
    ],
    'layout': {
        'title': 'Nones',
        'annotations': [
            {
                'font': {'size': 20},
                'showarrow': False,
                'text': 'Pie',
                'x': 0.5,
                'y': 1
            }
        ]
    }
}
iplot(fig)

In [None]:
# Reload the dataset so this cell works even if `data` was rebound elsewhere.
data = pd.read_csv('/kaggle/input/students-performance-in-exams/StudentsPerformance.csv')

# Number of students per race/ethnicity group among those whose
# 'test preparation course' value is 'completed'. Values and labels come from
# the same counts, so every slice is paired with the right group.
completed_counts = (
    data.loc[data['test preparation course'] == 'completed', 'race/ethnicity']
    .value_counts()
    .sort_index()
)

fig = {
    'data': [
        {
            'values': completed_counts.tolist(),
            'labels': completed_counts.index.tolist(),
            'domain': {'x': [0, .5]},
            'hoverinfo': 'label+percent',
            'hole': 0.3,
            'type': 'pie'
        },
    ],
    'layout': {
        'title': 'Completeds',
        'annotations': [
            {
                'font': {'size': 20},
                'showarrow': False,
                'text': 'Pie',
                'x': 0.5,
                'y': 1
            }
        ]
    }
}
iplot(fig)

<a id='6'></a>
## Conclusion
Please upvote if you liked this notebook.