# STUDENTS PERFORMANCE ANALYSIS

### Context
Marks secured by the students
### Content
This data set consists of the marks secured by the students in various subjects.
### Acknowledgements
http://roycekimmons.com/tools/generated_data/exams
### Inspiration
To understand the influence of the parents' background, test preparation, etc. on students' performance.

![](https://blog.amerlux.com/wp-content/uploads/2020/11/amlx-tw-education-blog-1024x683.jpg)

In [None]:
import pandas as pd
import numpy as np
import pandas_profiling as pp
from colorama import Fore, Back, Style


import seaborn as sb
import plotly
import plotly.express as px
import plotly.graph_objects as go
from plotly.offline import init_notebook_mode, iplot
import matplotlib.pyplot as plt

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import *

In [None]:
# Switch Plotly into offline notebook mode before any figure is drawn.
init_notebook_mode(connected=True)

# Read the exam results. `data` keeps an independent deep copy of the raw
# frame, while `df` is the working frame that later cells modify.
csv_path = '../input/students-performance-in-exams/StudentsPerformance.csv'
df = pd.read_csv(csv_path)
data = df.copy(deep=True)

In [None]:
# Preview the first rows of the frame, rendered as white text on black.
preview_css = {'background-color': 'black', 'color': 'white'}
df.head().style.set_properties(**preview_css)

In [None]:
# Print a summary of the frame (columns, dtypes, non-null counts).
df.info()

In [None]:
# Build an automated profiling report of the frame; as the last expression of
# the cell it is what the notebook displays.
# NOTE(review): `pandas_profiling` has been renamed `ydata-profiling` upstream —
# confirm the installed environment still provides this import.
pp.ProfileReport(df)

In [None]:
# Average of the three subject scores, rounded to two decimals.
subject_total = df['math score'] + df['reading score'] + df['writing score']
df['score'] = (subject_total / 3).round(2)

In [None]:
# Scatter of reading score against writing score. Colour encodes the parental
# level of education, marker size encodes the math score, and gender is added
# to the hover tooltip.
fig = px.scatter(
    df,
    x="reading score",
    y="writing score",
    color="parental level of education",
    color_discrete_sequence=px.colors.qualitative.Plotly,
    size='math score',
    hover_data=['gender'],
)
# Title spelling corrected ("readign" -> "reading").
fig.update_layout(title='reading score vs writing score')
iplot(fig)

In [None]:
# 2-D density of writing score (x) against reading score (y), with a box plot
# on the x margin and a violin plot on the y margin.
heatmap_title = 'Density HeatMap between Writing Score and Reading Score of the students'
fig = px.density_heatmap(
    df,
    x="writing score",
    y="reading score",
    marginal_x="box",
    marginal_y="violin",
)
fig.update_layout(title=heatmap_title)
iplot(fig)

In [None]:
# Number of students in each race/ethnicity group.
df['race/ethnicity'].value_counts()

In [None]:
# Box plot of the math scores, one box per race/ethnicity group.
group_col = 'race/ethnicity'

# Sorted, distinct group labels; they become the box names.
x_data = np.unique(df[group_col])

# One list of math scores per group, in the same order as `x_data`.
# A groupby replaces the former row-by-row loop, which hard-coded the five
# group names, filed every unrecognised label under "group A", and depended on
# label-based `[i]` lookups lining up with row positions.
math_by_group = df.groupby(group_col)['math score']
y_data = [math_by_group.get_group(label).tolist() for label in x_data]

colors = ['rgba(93, 164, 214, 0.5)', 'rgba(255, 144, 14, 0.5)', 'rgba(44, 160, 101, 0.5)',
          'rgba(255, 65, 54, 0.5)', 'rgba(207, 114, 255, 0.5)', 'rgba(127, 96, 0, 0.5)']

fig = go.Figure()

# zip() stops at the shortest input, so at most len(colors) groups are drawn.
for group_name, group_scores, fill in zip(x_data, y_data, colors):
    fig.add_trace(go.Box(
        y=group_scores,
        name=group_name,
        boxpoints='all',      # draw every individual observation next to the box
        jitter=0.5,
        notched=True,
        whiskerwidth=0.2,
        fillcolor=fill,
        marker_size=2,
        line_width=1)
    )

fig.update_layout(
    title='math Score of the students based on race/ethnicity ',
    yaxis=dict(
        autorange=True,
        showgrid=True,
        zeroline=True,
        dtick=5,
        gridcolor='rgb(255, 255, 255)',
        gridwidth=1,
        zerolinecolor='rgb(255, 255, 255)',
        zerolinewidth=2,
    ),
    margin=dict(
        l=40,
        r=30,
        b=80,
        t=100,
    ),
    paper_bgcolor='rgb(243, 243, 243)',
    plot_bgcolor='rgb(243, 243, 243)',
    showlegend=False
)

iplot(fig)

In [None]:
# Print a black-on-red banner, then draw the pairwise scatter grid with the
# points coloured by gender.
banner = Fore.BLACK + Back.RED + 'pairwise scatter plot for continuous features'
print(banner)
sb.pairplot(df, kind='scatter', hue='gender')
plt.show()

In [None]:
# Show the distinct values of every object-typed column.
object_columns = [name for name in df.columns if df[name].dtype == 'object']
for name in object_columns:
    print(np.unique(df[name]))

In [None]:
# Replace every object-typed column with the integer codes returned by
# pd.factorize; the array of unique values it also returns is discarded.
object_columns = [name for name in df.columns if df[name].dtype == 'object']
for name in object_columns:
    codes, _uniques = pd.factorize(df[name])
    df[name] = codes

In [None]:
# Summary statistics with one row per column: data bars on the mean and colour
# gradients on the standard deviation and the median.
summary = df.describe().T
styled_summary = summary.style.bar(subset=['mean'], color='#205ff2')
styled_summary = styled_summary.background_gradient(subset=['std'], cmap='viridis')
styled_summary.background_gradient(subset=['50%'], cmap='mako')

In [None]:
# Annotated heatmap of the pairwise correlations between the frame's columns.
corr_matrix = df.corr()

plt.figure(dpi=150)
plt.title('correlation between attributes')
sb.heatmap(corr_matrix, annot=True, lw=1, linecolor='white', cmap='viridis_r')
plt.yticks(rotation=0)
plt.xticks(rotation=90)
plt.show()

In [None]:
# Turn the numeric average score into one of four grade labels:
#   score >= 80 -> 'excellent'
#   score >= 70 -> 'good'
#   score >= 60 -> 'satisfactory'
#   score <  60 -> 'need support'
#
# A single vectorised pd.cut replaces the former element-by-element chained
# assignment (`df.score[i] = ...`). That pattern writes to a temporary under
# pandas Copy-on-Write and only ran quietly because the chained-assignment
# warning had been switched off globally; the warning is no longer suppressed.
grade_edges = [-np.inf, 60, 70, 80, np.inf]
grade_names = ['need support', 'satisfactory', 'good', 'excellent']

# right=False makes every bin closed on the left, e.g. [60, 70), so a score of
# exactly 60/70/80 falls into the higher grade, as in the original thresholds.
# astype(object) keeps the column as plain strings rather than a Categorical.
df['score'] = pd.cut(
    df['score'],
    bins=grade_edges,
    labels=grade_names,
    right=False,
).astype(object)

# The untouched copy of the raw data receives the label as a new column.
data['grade'] = df['score']

In [None]:
# Histogram over the grade labels with math score on the y axis, coloured by
# gender, one facet column per race/ethnicity group, and a marginal box plot.
fig = px.histogram(
    data,
    x="grade",
    y="math score",
    color="gender",
    facet_col="race/ethnicity",
    marginal="box",
)
iplot(fig)

In [None]:
# Split the frame into the model inputs `x` and the target `y`.
cL = list(df.columns)

# Every column except the target, in the frame's own column order. The former
# `list(set(cL) - set(['score']))` produced an arbitrary order that can change
# between interpreter runs (string hashing is randomised per process), which
# made the model input, and so the fitted forest, non-reproducible.
# NOTE(review): the three raw subject scores stay in `features` although the
# target label is derived from their average, so the target is almost fully
# determined by the inputs — confirm this is intended.
features = [col for col in cL if col != 'score']

x = df[features]
# Selecting with a list keeps `y` a one-column DataFrame rather than a Series.
y = df.loc[:, ['score']]

In [None]:
# Hold out 25% of the rows for evaluation; the fixed seed makes the split
# repeatable.
train_x, test_x, train_y, test_y = train_test_split(
    x,
    y,
    test_size=0.25,
    random_state=3,
)

In [None]:
# Fit a random forest on the training rows and predict the held-out rows.
# The forest is now seeded with the same value as the train/test split so the
# reported accuracy is reproducible from run to run; previously only the split
# was seeded.
ran = RandomForestClassifier(random_state=3)

# np.ravel flattens the one-column target frame into the 1-D array that
# `fit` expects.
ran.fit(train_x, np.ravel(train_y, order='C'))
predictions = ran.predict(test_x)

In [None]:
# Report the share of held-out rows that were classified correctly, with the
# number highlighted on a yellow background.
model_accuracy = accuracy_score(test_y, predictions)
print('Accuracy of the model is : ', Back.YELLOW + str(model_accuracy))

In [None]:
# Per-class precision / recall / F1 table, printed black-on-red.
report_text = classification_report(test_y, predictions)
print(Fore.BLACK + Back.RED + report_text)

# <font color='blue'>Conclusion</font>
* ##  Boys are good at math and Girls are good at reading and writing.
* ## Group A students have the lowest scores compared to the other race/ethnicity groups.
* ## Reading, Writing and Math scores have a high positive correlation.


# <font color='red'>The Random forest classifier will classify the students based on the given attributes with more than 90% accuracy.</font>


## If you like, an upvote would be deeply appreciated. Thanks! :)