**An attempt to identify the important factors that decide a student's grade.**

Import the packages needed for data manipulation, visualisation, model building and performance measurement.

In [None]:

import numpy as np 
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

import os
print(os.listdir("../input"))
import matplotlib.pyplot as plt#for data visualisation
import seaborn as sns#for data visualisation
%matplotlib inline
from sklearn.model_selection import train_test_split#for dividing the data set into test set and training set
from sklearn.ensemble import RandomForestClassifier#For building Random Forest model
from sklearn import metrics                     #for measuring performance of the model
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report#To see how well the predictions were classified correctly
from sklearn.preprocessing import StandardScaler


Load the dataset using pandas into a dataframe

In [None]:
# Read the students' performance CSV from the input directory into a DataFrame.
data = pd.read_csv('../input/StudentsPerformance.csv')

Inspect the data 

In [None]:
# Preview the first rows of the DataFrame.
data.head()

In [None]:
# Summarise the columns: names, non-null counts and dtypes.
data.info()


In [None]:
# Store the descriptive text columns as pandas categoricals.
# 'Result' and 'Grade' are converted later, once those columns have been created.
categorical_columns = (
    'gender',
    'race/ethnicity',
    'parental level of education',
    'lunch',
    'test preparation course',
)
for column_name in categorical_columns:
    data[column_name] = data[column_name].astype('category')

In [None]:
# Count the missing values in each column.
data.isnull().sum()

In [None]:
# Show how many students fall into each level of every categorical column.
# A notebook cell only displays its final bare expression, so every earlier
# count is printed explicitly; otherwise its output would be silently dropped.
print(data['parental level of education'].value_counts())
print(data.lunch.value_counts())
print(data['test preparation course'].value_counts())
print(data['race/ethnicity'].value_counts())
data.gender.value_counts()


In [None]:
# Print the distinct values of each categorical column, one column per line.
for column_name in ('lunch', 'parental level of education',
                    'race/ethnicity', 'test preparation course'):
    print(data[column_name].unique())



In [None]:
# Summary statistics for the numeric columns.
data.describe()

Visually explore the data and compare the factors.

In [None]:

# Number of students per gender. The variable is passed by keyword: recent
# seaborn releases interpret a positional first argument as `data`, not `x`.
sns.countplot(x=data.gender)

In [None]:
# Number of students in each race/ethnicity group.
sns.countplot(data=data, x='race/ethnicity')

In [None]:
# Bar chart of the math score aggregated per gender.
sns.catplot(data=data, x='gender', y='math score', kind='bar')

In [None]:
# Math score distribution per parental education level, split by gender.
fig, ax = plt.subplots(figsize=(12, 3))
sns.boxplot(data=data, x='parental level of education', y='math score',
            hue='gender', ax=ax)
ax.set_title('math score for different parental education and gender')
plt.show()

In [None]:
# Reading score distribution per parental education level, split by gender.
# The title now names the reading score that is actually plotted; it was
# previously labelled as the math score.
fig, ax = plt.subplots(figsize=(12, 3))
sns.boxplot(x='parental level of education', y='reading score', hue='gender', data=data)
ax.set_title('reading score for different parental education and gender')
plt.show()

In [None]:
# Writing score distribution per parental education level, split by gender.
# The title now names the writing score that is actually plotted; it was
# previously labelled as the math score.
fig, ax = plt.subplots(figsize=(12, 3))
sns.boxplot(x='parental level of education', y='writing score', hue='gender', data=data)
ax.set_title('writing score for different parental education and gender')
plt.show()

In [None]:

# Student counts per parental education level, coloured by race/ethnicity group.
sns.catplot(
    data=data,
    y="parental level of education",
    hue="race/ethnicity",
    kind="count",
    palette="pastel",
    edgecolor=".6",
)

In [None]:
# Student counts per race/ethnicity group, coloured by lunch type.
sns.catplot(
    data=data,
    y="race/ethnicity",
    hue="lunch",
    kind="count",
    palette="pastel",
    edgecolor=".6",
)

In [None]:
# Math score distribution with and without the test preparation course, by gender.
fig, ax = plt.subplots(figsize=(12, 3))
sns.boxplot(data=data, x='test preparation course', y='math score',
            hue='gender', ax=ax)
ax.set_title('math score based on test preparation')
plt.show()

In [None]:
# Reading score distribution with and without the test preparation course, by gender.
fig, ax = plt.subplots(figsize=(12, 3))
sns.boxplot(data=data, x='test preparation course', y='reading score',
            hue='gender', ax=ax)
ax.set_title('reading score based on test preparation')
plt.show()

In [None]:
# Writing score distribution with and without the test preparation course, by gender.
fig, ax = plt.subplots(figsize=(12, 3))
sns.boxplot(data=data, x='test preparation course', y='writing score',
            hue='gender', ax=ax)
ax.set_title('writing score based on test preparation')
plt.show()

In [None]:
# Math score distribution per lunch type, split by gender.
fig, ax = plt.subplots(figsize=(12, 3))
sns.boxplot(data=data, x='lunch', y='math score', hue='gender', ax=ax)
ax.set_title('math score based on lunch type')
plt.show()

In [None]:
# Reading score distribution per lunch type, split by gender.
fig, ax = plt.subplots(figsize=(12, 3))
sns.boxplot(data=data, x='lunch', y='reading score', hue='gender', ax=ax)
ax.set_title('reading score based on lunch type')
plt.show()

In [None]:
# Writing score distribution per lunch type, split by gender.
fig, ax = plt.subplots(figsize=(12, 3))
sns.boxplot(data=data, x='lunch', y='writing score', hue='gender', ax=ax)
ax.set_title('writing score based on lunch type')
plt.show()

Derive a pass/fail result from the scores in the three subjects: a student fails if any score is below 40.


In [None]:
# A student fails when any one of the three subject scores is below 40;
# otherwise the student passes.
score_columns = ['math score', 'reading score', 'writing score']
below_pass_mark = data[score_columns].lt(40).any(axis=1)
data['Result'] = np.where(below_pass_mark, 'Fail', 'Pass')
data.head()

Assign a grade based on the average score across the three subjects.

In [None]:
def grade(s1, s2, s3, res):
    """Map three subject scores and a pass/fail result to a letter grade.

    Returns 'F' whenever ``res`` equals 'Fail'. Otherwise the mean of the
    three scores decides the grade: 'A' for a mean of at least 80, 'B' for at
    least 60, 'C' for at least 40, and 'F' for anything lower.
    """
    if res == 'Fail':
        return 'F'

    average = sum((s1, s2, s3)) / 3
    # Thresholds are checked from highest to lowest, so the first match wins.
    for letter, minimum in (('A', 80), ('B', 60), ('C', 40)):
        if average >= minimum:
            return letter
    return 'F'

In [None]:
# Grade every student from the three subject scores and the pass/fail result.
data['Grade'] = [
    grade(math_score, reading_score, writing_score, result)
    for math_score, reading_score, writing_score, result in zip(
        data['math score'], data['reading score'],
        data['writing score'], data['Result'])
]
data.head()

In [None]:
# Number of students per grade, shown from best to worst. The variable is
# passed by keyword: recent seaborn releases interpret a positional first
# argument as `data`, not `x`.
sns.countplot(x=data['Grade'], order=['A', 'B', 'C', 'F'])

In [None]:
# Pass/fail counts for each parental education level.
sns.catplot(
    data=data,
    y="parental level of education",
    hue="Result",
    kind="count",
    palette="pastel",
    edgecolor=".6",
)

In [None]:
# Pass/fail counts split by whether the test preparation course was completed.
# (The stray `data.columns` expression was removed: it was not the last
# expression of the cell, so it never displayed anything.)
sns.catplot(y="Result", hue="test preparation course", kind="count",
            palette="pastel", edgecolor=".6",
            data=data)


In [None]:
# Store the two derived label columns as pandas categoricals.
for derived_column in ('Result', 'Grade'):
    data[derived_column] = data[derived_column].astype('category')

Model building using a Random Forest classifier.

1. Create the feature matrix and the target vector.
2. Convert the categorical variables to numerical ones.




In [None]:
# Feature matrix: every column except the target 'Grade'.
# NOTE(review): 'Result' and the three scores stay in X although 'Grade' is
# computed directly from them, so they are expected to dominate the feature
# importances. Confirm this is intended.
X = data.drop('Grade', axis=1)

# Standardise the three score columns. The columns are replaced outright
# rather than written through .loc, so the result does not depend on pandas
# silently upcasting the existing column dtype (deprecated if the scores are
# stored as integers). Naming the columns explicitly also avoids relying on
# their order in the CSV.
score_columns = ['math score', 'reading score', 'writing score']
scale = StandardScaler()
X[score_columns] = scale.fit_transform(X[score_columns])

# One-hot encode the categorical columns, dropping the first level of each.
X = pd.get_dummies(X, drop_first=True)
X.head()



In [None]:
# Target vector: the letter grade assigned to each student.
y=data['Grade']


3. Train-test split.

In [None]:
# Hold out 30% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

4. Create an instance of the Random Forest classifier, fit it on the training data and use it to predict on the test data.

In [None]:
# Fit a 100-tree random forest on the training rows and report its accuracy
# on the held-out test rows. The seed is fixed so that the accuracy, the
# feature importances and the reports that follow are the same on every run.
rf_clf = RandomForestClassifier(n_estimators=100, random_state=42)
rf_clf.fit(X_train, y_train)
y_pred = rf_clf.predict(X_test)
metrics.accuracy_score(y_test, y_pred)


Create a series of feature importances, sorted from the highest value down.

In [None]:
# Label the fitted forest's importance scores with the feature names, then
# rank them from most to least important.
importance_by_feature = pd.Series(rf_clf.feature_importances_, index=X_train.columns)
feature_imp = importance_by_feature.sort_values(ascending=False)
feature_imp

Visualising the feature importance

In [None]:
# Horizontal bar chart of the ranked feature importances.
importance_ax = sns.barplot(x=feature_imp, y=feature_imp.index)
importance_ax.set_title('Feature Importance')
importance_ax.set_xlabel('score')
importance_ax.set_ylabel('Features')



To see how well the results were classified.

In [None]:
# Confusion matrix of the test-set predictions against the true grades.
print(confusion_matrix(y_test,y_pred))


In [None]:

# Per-grade precision, recall and F1 for the test-set predictions.
class_rep = classification_report(y_true=y_test, y_pred=y_pred)
print(class_rep)