# Dataset

In [88]:
import pandas as pd
# Load the Kaggle "Students Performance in Exams" dataset.
# Source: https://www.kaggle.com/spscientist/students-performance-in-exams
data1 = pd.read_csv(
    '../input/students-performance-in-exams/StudentsPerformance.csv'
)
# Preview the column names and the first five rows.
data1.head()

In [89]:
data=data1.copy() # Work on a copy; the raw frame `data1` is still used by the plots further down.
data.head()  # Preview the first five rows of the working copy.

In [90]:
data.isna().sum() # Number of missing values in each column.

In [91]:
# Convert the categorical strings to integer codes.
# Each mapping is applied to the whole frame, one after another, in this order.
encodings = [
    # gender
    {'male':1,'female':0},
    # race/ethnicity group
    {'group A':0, 'group B':1, 'group C':2, 'group D':3, 'group E':4},
    # lunch
    {'free/reduced':0, 'standard':1},
    # test preparation course
    {'none':0, 'completed':1},
    # parents' education (0 = master's degree ... 5 = some high school)
    {'some college':3, "associate's degree":2, 'high school':4,
     'some high school':5, "bachelor's degree":1, "master's degree":0},
]
for mapping in encodings:
    data = data.replace(mapping)

In [92]:
# Inspect the frame after the string-to-integer encoding above.
data

In [93]:
# Drop the 'lunch' column; it is not used as a feature by the models below.
# NOTE: this rebinds `data`, so running the cell a second time raises KeyError
# because the column is already gone.
data=data.drop(['lunch'], axis=1)

In [94]:
# Summary statistics of the encoded columns.
data.describe()

# Visualizing the data

In [95]:
%matplotlib inline
import matplotlib.pyplot as plt

import seaborn as sns
# Number of male and female students in the raw dataset.
# The column is passed by keyword: recent seaborn releases reserve the first
# positional argument for `data`, so a bare Series is no longer read as `x`.
sns.countplot(x='gender', data=data1)

In [96]:
# Number of students in each race/ethnicity group of the raw dataset.
# The column is passed by keyword: recent seaborn releases reserve the first
# positional argument for `data`, so a bare Series is no longer read as `x`.
sns.countplot(x='race/ethnicity', data=data1)

In [97]:
# Number of students who completed / did not take the test preparation course.
# The column is passed by keyword: recent seaborn releases reserve the first
# positional argument for `data`, so a bare Series is no longer read as `x`.
sns.countplot(x='test preparation course', data=data1)

In [98]:
# Pairwise plots of the columns of the raw frame, coloured by gender.
sns.pairplot(data1,hue='gender')

# Pre-Processing the Data

In [99]:
# Number of rows per gender code (1 = male, 0 = female, per the encoding above).
data['gender'].value_counts()

In [100]:
# Average of the three exam scores; `//` is floor division, so the fractional
# part of the mean is discarded.
data['average_score']=(data['math score']
                +data['reading score']+data['writing score'])//3
# NOTE: the result of this call is not displayed; only the last expression of
# a cell is rendered.
data.average_score.value_counts()
data.head()

# Assigning X and Y variable

In [101]:
# y is the dependent variable (target); X holds the independent variables (features).
y=data['average_score'] # dependent variable (target)
# Independent variables: every column except the target.
# NOTE(review): X still contains 'math score', 'reading score' and
# 'writing score', from which 'average_score' is computed above, so the target
# is directly derivable from the features.
X=data[data.columns.difference(['average_score'])]
X.head()

# labelling y variable

In [102]:
labels=['Fail','Pass']
bins=[0,75,100]
# Bins are right-closed: 'Fail' covers [0, 75] and 'Pass' covers (75, 100].
# include_lowest=True keeps an average of exactly 0 inside 'Fail'; without it
# the left edge is excluded and such a row would be labelled NaN.
y=pd.cut(y,bins,labels=labels,include_lowest=True)
y.head()

# Scaling the X variable

In [103]:
#Feature Scaling
#Normalization and Standardization of Data
from sklearn.preprocessing import MinMaxScaler
scaler=MinMaxScaler()
cols=X.columns
# NOTE(review): the scaler is fitted on all rows, i.e. before the train/test
# split performed later in the notebook.
minmax_scale=scaler.fit(X[cols])
# Build a new frame instead of assigning into X column-by-column: X was
# selected out of `data`, so in-place assignment triggers pandas'
# SettingWithCopyWarning. Reusing X's index keeps every scaled row aligned
# with its original row even if the index is not 0..n-1.
X=pd.DataFrame(minmax_scale.transform(X[cols]),columns=cols,index=X.index)
X.head()

# Train-Test Splitting

In [104]:
#Splitting the dataset into test-train model.

from sklearn.model_selection import train_test_split
# Hold out 30% of the rows for testing; the fixed seed makes the split repeatable.
split = train_test_split(X, y, test_size=0.3, random_state=42)
X_train, X_test, y_train, y_test = split
for caption, part in (("Length of training Dataset: ", y_train),
                      ("Length of testing Dataset: ", y_test)):
    print(caption, len(part))

In [105]:
# First five target labels of the training and of the test split.
y_train.head(),y_test.head()

# Decision Tree Classifier

In [106]:
import numpy as np
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.model_selection import cross_val_score
# Fit a decision tree on the training split and evaluate it on the test split.
model=DecisionTreeClassifier(max_leaf_nodes=None,random_state=42)
model.fit(X_train,y_train) # Training the model
score=model.score(X_test,y_test) # Mean accuracy on the test split
print('Accuracy score of Decision Tree Classifier: %f'% (score*100))
predict=model.predict(X_test)
# 10-fold cross-validation over the full X, y.
scores=cross_val_score(model,X,y,cv=10)
print("\nDecision Tree Classifier: "+"cross_val_score: "+
      str(np.mean(scores)*100))

# Summary metrics on the test split. They were previously computed (precision
# twice) but never displayed. With average='micro' on a single-label problem,
# precision, recall and F1 all equal the accuracy; the per-class figures are in
# the classification report below.
accuracy=accuracy_score(y_test,predict)
precision=precision_score(y_test,predict,average='micro')
recall=recall_score(y_test,predict,average='micro')
f1=f1_score(y_test,predict,average='micro')
print('\nAccuracy: %f' % accuracy)
print('Precision (micro): %f' % precision)
print('Recall (micro): %f' % recall)
print('F1 (micro): %f' % f1)

cm=confusion_matrix(y_test,predict)
print('\nConfusion matrix\n',(cm))

print('\nClassification Report:')
print(classification_report(y_test,predict))

# Classification report

In [107]:
#To predict if the student fail or pass 
import numpy as np
# Select the row by label with a list so the result stays a one-row DataFrame:
# the model then receives the same column names it was fitted with, instead of
# a bare NumPy array without feature names.
# NOTE: label 522 must be present in X_test (this depends on the split seed);
# otherwise .loc raises KeyError.
pred=X_test.loc[[522]]
prediction=model.predict(pred)[0]
print("You %s "% prediction)

In [108]:
# Print the number of rows and the (rows, columns) shape of the encoded frame.
print ("Dataset Length: ", len(data))
print("Dataset Shape",data.shape)

# Logistic Regression


In [109]:
import pandas as pd
import numpy as np
from sklearn import metrics 
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
# Fit a logistic-regression classifier on the same train/test split.
# NOTE: this rebinds `model`, which previously held the decision tree.
model=LogisticRegression(solver='liblinear', C=1.0,random_state=0)
model.fit(X_train,y_train)
# Predictions on the test split.
y_pred=model.predict(X_test)
# Mean accuracy on the test split.
predictor=model.score(X_test,y_test)
print("Accuracy of the Logistic Regression model:  %f"%(predictor*100))

# `c1` is reused by the confusion-matrix image drawn in a later cell.
c1 =confusion_matrix(y_test, y_pred)
print("\nConfusion Matrix: \n",c1)

print("\nclassification_report:\n")
print(classification_report(y_test, y_pred))

# Linear Regression 

In [110]:
# Reset X and y for regression: y is the numeric target again (not Pass/Fail)
# and X is taken from `data` again, i.e. without the min-max scaling.
y=data['average_score'] # dependent variable (target)
# Independent variables: every column except the target.
# NOTE(review): X contains the three exam scores that 'average_score' is
# computed from, so the regression target is directly derivable from X.
X=data[data.columns.difference(['average_score'])]

In [111]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
# New 70/30 split (seed 2) for the regression models, then an ordinary
# least-squares fit on the training part.
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.3,random_state=2)
reg=LinearRegression().fit(X_train,y_train)

In [112]:
# Score of the linear model on the training split.
reg.score(X_train,y_train)

In [113]:
# Score of the linear model on the held-out test split.
reg.score(X_test,y_test)

In [114]:
 #Testing Underfitting and Overfitting
from sklearn import linear_model
# Lasso regression with a large penalty (alpha=50), at most 100 iterations and
# a loose convergence tolerance (tol=0.1), fitted on the training split.
lasso_reg= linear_model.Lasso(alpha=50, 
                    max_iter=100, tol=0.1)
lasso_reg.fit(X_train,y_train)

In [115]:
# Score of the Lasso model on the test split.
lasso_reg.score(X_test,y_test)

In [116]:
# Score of the Lasso model on the training split, for comparison with the test score.
lasso_reg.score(X_train,y_train)

In [117]:
from sklearn.linear_model import Ridge
# Ridge regression with the same penalty strength and convergence settings as
# the Lasso model above, fitted on the training split.
ridge_reg= Ridge(alpha=50, max_iter=100, tol=0.1)
ridge_reg.fit(X_train,y_train)

In [118]:
# Score of the Ridge model on the test split.
ridge_reg.score(X_test,y_test)

In [119]:
# Score of the Ridge model on the training split, for comparison with the test score.
ridge_reg.score(X_train,y_train)

In [120]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
# Newer Matplotlib releases renamed the bundled seaborn style sheets to
# 'seaborn-v0_8*'. Prefer the new name when it is available and fall back to
# the old one on older installations.
plt.style.use('seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'seaborn')
import random
def f(m, X, c):
    """Evaluate the straight line ``y = m * x + c`` at every point of ``X``.

    Parameters:
        m: slope of the line.
        X: iterable of x values.
        c: intercept of the line.

    Returns:
        A list with one ``m * x + c`` value per element of ``X``, in order.
    """
    y_values = []
    for x in X:
        y_values.append(m * x + c)
    return y_values
# Synthetic illustration of a straight-line fit: noisy points around y = x
# (red dots) and the line y = 1 * x + 0 evaluated by f().
# Dedicated names are used so the notebook-level X / y (the student data) are
# not overwritten by plain lists, and the generator is seeded so the figure is
# reproducible.
random.seed(42)
x_demo = list(range(50))
y_demo = [x + random.random() for x in x_demo]
slope, intercept = 1, 0
y_hat = f(slope, x_demo, intercept)
plt.plot(x_demo, y_demo, '.', c='r')
plt.plot(x_demo, y_hat)
plt.show()

In [121]:
# Draw the 2x2 confusion matrix stored in `c1` as an image with the count
# written in every cell.
fig, ax = plt.subplots(figsize=(8, 8))
ax.imshow(c1)
ax.grid(False)
label_style = dict(fontsize=10, color='black')
ax.set_xlabel('Predicted outputs', **label_style)
ax.set_ylabel('Actual outputs', **label_style)
ax.xaxis.set(ticks=(0,1),ticklabels=('Predicted 0s', 'Predicted 1s'))
ax.yaxis.set(ticks=(0,1),ticklabels=('Actual 0s', 'Actual 1s'))
# Inverted limits put row 0 at the top.
ax.set_ylim(1.5, -0.5)
for row in (0, 1):
    for col in (0, 1):
        ax.text(col, row, c1[row, col], ha='center', va='center', color='red')
plt.show()

# Correlations

In [122]:
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

In [123]:
# Pairwise correlation of the columns of `data`, shown as an annotated heat map.
corr_matrix = data.corr()
top_corr = corr_matrix.index
plt.figure(figsize=(10,10))
# `top_corr` lists every column that took part in the correlation, so the
# matrix computed above is drawn directly.
g = sns.heatmap(corr_matrix, annot=True)

# Random Forest Classification

In [124]:
#Importing Data 
import pandas as pd
# Reload the raw CSV and repeat the preprocessing used at the top of the
# notebook: integer-encode the strings, drop 'lunch', add the average score.
data = pd.read_csv('/kaggle/input/students-performance-in-exams/StudentsPerformance.csv')
for mapping in (
    {'male':1,'female':0},                                              # gender
    {'group A':0, 'group B':1, 'group C':2, 'group D':3, 'group E':4},  # race/ethnicity group
    {'free/reduced':0, 'standard':1},                                   # lunch
    {'none':0, 'completed':1},                                          # test preparation course
    {'some college':3, "associate's degree":2, 'high school':4,
     'some high school':5, "bachelor's degree":1, "master's degree":0}, # parents' education
):
    data = data.replace(mapping)
data = data.drop(['lunch'], axis=1)
# Floor-divided mean of the three exam scores.
score_total = data['math score'] + data['reading score'] + data['writing score']
data['average_score'] = score_total // 3


In [125]:
y=data['average_score'] # dependent variable (target)
# Independent variables: every column except the target.
X=data[data.columns.difference(['average_score'])]
labels=['Fail','Pass']
bins=[0,75,100] # 'Fail' covers [0, 75], 'Pass' covers (75, 100]
# include_lowest=True keeps an average of exactly 0 inside 'Fail'; without it
# the left edge is excluded and such a row would be labelled NaN.
y=pd.cut(y,bins,labels=labels,include_lowest=True)

In [126]:
from sklearn.preprocessing import MinMaxScaler
scaler=MinMaxScaler()
cols=X.columns
# NOTE(review): the scaler is fitted on all rows, i.e. before the train/test split.
minmax_scale=scaler.fit(X[cols])
# Build a new frame instead of assigning into X column-by-column: X was
# selected out of `data`, so in-place assignment triggers pandas'
# SettingWithCopyWarning. Reusing X's index keeps every scaled row aligned
# with its original row even if the index is not 0..n-1.
X=pd.DataFrame(minmax_scale.transform(X[cols]),
                columns=cols,index=X.index)
X.head()

In [127]:
# y is a categorical Series produced by pd.cut. Relabel its two categories as
# integers with rename_categories; calling replace() on a categorical is
# deprecated in recent pandas releases.
y=y.cat.rename_categories({'Fail':0, 'Pass':1})

In [128]:
from sklearn.model_selection import train_test_split
# implementing train-test-split
# Hold out 30% of the rows for testing (fixed seed).
X_train, X_test, y_train, y_test = train_test_split(X,
                                y, test_size=0.30, random_state=66)
from sklearn.ensemble import RandomForestClassifier
from sklearn import model_selection
# Random forest model creation. The forest is randomised, so it is seeded to
# make the predictions and the reported metrics repeatable between runs.
rfc = RandomForestClassifier(random_state=66)
rfc.fit(X_train,y_train)
# predictions on the test split
rfc_predict = rfc.predict(X_test)

In [129]:
from sklearn.model_selection import cross_val_score
from sklearn.metrics import classification_report, confusion_matrix
# 10-fold cross-validated ROC AUC over the full X, y.
rfc_cv_score = cross_val_score(rfc, X, y, cv=10, scoring='roc_auc')
# Each section prints its title, its content and a blank separator.
report_sections = (
    ("=== Confusion Matrix ===", confusion_matrix(y_test, rfc_predict)),
    ("=== Classification Report ===", classification_report(y_test, rfc_predict)),
    ("=== All AUC Scores ===", rfc_cv_score),
)
for title, content in report_sections:
    print(title)
    print(content)
    print('\n')
print("=== Mean AUC Score ===")
print("Mean AUC Score - Random Forest: ", rfc_cv_score.mean())

# Visualizing Random forest

In [130]:
import pandas as pd
# Reload the raw CSV and integer-encode the categorical strings again
# ('lunch' is kept in this copy).
data = pd.read_csv('/kaggle/input/students-performance-in-exams/StudentsPerformance.csv')
for mapping in (
    {'male':1,'female':0},                                              # gender
    {'group A':0, 'group B':1, 'group C':2, 'group D':3, 'group E':4},  # race/ethnicity group
    {'free/reduced':0, 'standard':1},                                   # lunch
    {'none':0, 'completed':1},                                          # test preparation course
    {'some college':3, "associate's degree":2, 'high school':4,
     'some high school':5, "bachelor's degree":1, "master's degree":0}, # parents' education
):
    data = data.replace(mapping)

In [131]:
# Use numpy to convert to arrays
import numpy as np
# Labels: the encoded 'lunch' column as a NumPy array.
labels = np.array(data['lunch'])
# Remove the label column from the features
# (axis 1 refers to the columns).
data= data.drop('lunch', axis = 1)
# Remaining column names, in file order, saved for later use.
feature_list = list(data.columns)
# NOTE: from here on `data` is a NumPy array, no longer a DataFrame.
data = np.array(data)

In [132]:
from sklearn.model_selection import train_test_split
# Split the data into training and testing sets
# The frame that is actually split (and later fitted) is X, whose columns come
# from `data.columns.difference(...)` and are therefore sorted by name. The
# `feature_list` saved earlier holds the columns of `data` in file order, so
# zipping it with the fitted importances would attach the wrong names. Take
# the names from X so that every importance is reported under its own feature.
feature_list = list(X.columns)
train_features, test_features, train_labels, test_labels = train_test_split(
    X, y, test_size = 0.30, random_state = 42)
print('Training Features Shape:', train_features.shape)
print('Training Labels Shape:', train_labels.shape)
print('Testing Features Shape:', test_features.shape)
print('Testing Labels Shape:', test_labels.shape)

In [133]:
# Import the model we are using
from sklearn.ensemble import RandomForestRegressor
# Random forest regressor with 1000 trees; the seed fixes the result.
rf = RandomForestRegressor(n_estimators = 1000, random_state = 42)
# Train the model on the training data.
rf.fit(train_features, train_labels);
# Numerical feature importances as a plain list.
importances = list(rf.feature_importances_)
# (name, importance rounded to 2 decimals) pairs, most important first.
feature_importances = [(name, round(weight, 2))
                       for name, weight in zip(feature_list, importances)]
feature_importances.sort(key = lambda pair: pair[1], reverse = True)
# Print one line per feature.
for pair in feature_importances:
    print('Variable: {:20} Importance: {}'.format(*pair))

# Unsupervised Learning Algorithm

In [134]:
# Import matplotlib for plotting and use magic command for Jupyter Notebooks
import matplotlib.pyplot as plt
%matplotlib inline
# Bar chart of the feature importances, one bar per entry of `feature_list`.
plt.style.use('fivethirtyeight')
# x positions of the bars
x_values = list(range(len(importances)))
plt.bar(x_values, importances, orientation = 'vertical')
# Feature names as vertical tick labels
plt.xticks(x_values, feature_list, rotation='vertical')
# Axis labels and title
plt.ylabel('Importance')
plt.xlabel('Variable')
plt.title('Variable Importances');

# K-means Clustering

In [135]:
#import mglearn
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()
from sklearn.cluster import KMeans

In [136]:
# Reload the raw CSV for the clustering section (strings are left unencoded).
data=pd.read_csv('/kaggle/input/students-performance-in-exams/StudentsPerformance.csv')

In [137]:
# Floor-divided mean of the three exam scores, appended as a new last column.
data['average_score']=(data['math score']+data['reading score']
                       +data['writing score'])//3

In [138]:
# Take the columns at positions 5..8 as clustering input.
# Presumably the three exam scores plus 'average_score' - this relies on the
# column order of the CSV; TODO confirm.
x=data.iloc[:,5:9]

In [139]:
# K-means with 4 clusters on the score columns. K-means starts from random
# centres, so the seed is fixed to make the cluster assignment repeatable;
# n_init is spelled out so the number of restarts does not depend on the
# installed scikit-learn version.
kmeans=KMeans(n_clusters=4, n_init=10, random_state=42)
kmeans.fit(x)

In [140]:
# Display the numeric columns that were passed to K-means.
x

In [141]:
pip install mglearn

In [142]:
#Based on the above table plot on the axis displaying subject scores and average score
import mglearn
# X: the two columns at positions 5 and 6; y: the column at position 8.
X=data.iloc[:,5:7].values
y=data.iloc[:,8].values
# Points coloured by the labels of the K-means model fitted on `x`.
mglearn.discrete_scatter(X[:, 0], X[:, 1],
                kmeans.labels_, markers='o')
# Cluster centres (triangles). `x` starts at the same column position as X, so
# the first two centre coordinates refer to the two plotted columns.
mglearn.discrete_scatter(kmeans.cluster_centers_[:, 0], 
            kmeans.cluster_centers_[:, 1], [0, 1, 2,3],
markers='^', markeredgewidth=2)

In [143]:
import mglearn
%matplotlib inline
import matplotlib.pyplot as plt
fig, axes = plt.subplots(1, 2, figsize=(10, 5))
# Compare a 2-cluster and a 5-cluster solution side by side. Each model is
# seeded so both panels are repeatable between runs. After the loop `kmeans`
# and `assignments` hold the 5-cluster model and its labels.
for n_clusters, panel in zip((2, 5), axes):
    kmeans = KMeans(n_clusters=n_clusters, n_init=10, random_state=0)
    kmeans.fit(X)
    assignments = kmeans.labels_
    mglearn.discrete_scatter(X[:, 0], X[:, 1], assignments, ax=panel)

In [144]:
from sklearn import preprocessing
# Standardise the columns of `x` and display the result.
x_scaled=preprocessing.scale(x)
x_scaled

In [145]:
# Elbow method: within-cluster sum of squares (inertia) for k = 1..14 on the
# standardised scores. Every model is seeded so the curve is repeatable.
# NOTE: the loop leaves `kmeans` bound to the last (14-cluster) model.
wcss=[]
cluster_range=range(1,15)
for i in cluster_range:
    kmeans = KMeans(n_clusters=i, n_init=10, random_state=42)
    kmeans.fit(x_scaled)
    wcss.append(kmeans.inertia_)


plt.plot(cluster_range,wcss,"-o")
plt.xlabel("Number of Clusters")
plt.ylabel("WCSS Value")
plt.show()

# Dendrogram

In [146]:
# X: the two columns at positions 5 and 6; y: the column at position 8.
X=data.iloc[:,5:7].values
y=data.iloc[:,8].values

In [147]:
from scipy.cluster.hierarchy import dendrogram, linkage
# Ward-linkage hierarchy of X, drawn as a dendrogram with two dashed cut lines.
Z = linkage(X, 'ward')
dendrogram(Z)
ax = plt.gca()
bounds = ax.get_xbound()
# (height of the cut, caption written at the right edge)
for height, caption in ((525, ' two clusters'), (325, ' three clusters')):
    ax.plot(bounds, [height, height], '--', c='k')
    ax.text(bounds[1], height, caption, va='center', fontdict={'size': 15})
plt.xlabel("Scores")
plt.ylabel("Distance")

In [148]:
# Build the model explicitly instead of reusing whatever `kmeans` happens to
# hold: at this point it was last rebound by the elbow loop (k up to 14), so
# the number of clusters depended on cell execution history. Four clusters
# matches the first model fitted on `x`.
kmeans=KMeans(n_clusters=4, n_init=10, random_state=42)
identified_cluster=kmeans.fit_predict(x)

In [149]:
# Copy of the clustering input with the predicted cluster added as a column.
student_cluster=x.copy()
student_cluster['clusters_pred']=identified_cluster
student_cluster

In [150]:
# Math score (x) against reading score (y), coloured by predicted cluster.
# The third positional argument of plt.scatter is the marker size, so the
# writing score is passed as `s=` explicitly: the picture is unchanged, but the
# bubble-size encoding is now visible in the code.
plt.scatter(student_cluster['math score'],student_cluster['reading score'],
            s=student_cluster['writing score'],
            c=student_cluster['clusters_pred'],cmap='rainbow')
# Label the axes with the quantities that are actually plotted.
plt.xlabel('math score')
plt.ylabel('reading score')

In [151]:
# Same scatter with the mglearn colour map. The third positional argument of
# plt.scatter is the marker size, so the writing score is passed as `s=`
# explicitly (the picture is unchanged).
plt.scatter(student_cluster['math score'],student_cluster['reading score'],
    s=student_cluster['writing score'],c=identified_cluster, cmap=mglearn.cm3)
# Label the axes with the quantities that are actually plotted.
plt.xlabel('math score')
plt.ylabel('reading score')

# Agglomerative Clustering

In [152]:
from sklearn.cluster import AgglomerativeClustering
# Agglomerative clustering (3 clusters) on the two columns at positions 5 and 6.
X=data.iloc[:,5:7].values
y=data.iloc[:,8].values
agg = AgglomerativeClustering(n_clusters=3)
assignment = agg.fit_predict(X)
mglearn.discrete_scatter(X[:, 0], X[:, 1], assignment)
# Label each axis with the name of the column that is plotted on it.
plt.xlabel(data.columns[5])
plt.ylabel(data.columns[6])

# DBSCAN

In [153]:
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import DBSCAN
# Standardise X before DBSCAN. A fresh StandardScaler is created here: the
# cell imports it, but previously reused the notebook-level `scaler`, which
# was still the MinMaxScaler created for the random-forest section.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
dbscan = DBSCAN()
clusters = dbscan.fit_predict(X_scaled)
# plot the cluster assignments
plt.scatter(X_scaled[:, 0], X_scaled[:, 1],
            c=clusters, cmap=mglearn.cm2, s=60)
plt.xlabel("Feature 0")
plt.ylabel("Feature 1")

# Exploring clusters and selecting the number of clusters

In [154]:
from sklearn.metrics.cluster import adjusted_rand_score
from sklearn.cluster import DBSCAN
X=data.iloc[:,5:7].values
y=data.iloc[:,8].values
# rescale the data to zero mean and unit variance
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
fig, axes = plt.subplots(1, 4, figsize=(15, 3),
                         subplot_kw={'xticks': (), 'yticks': ()})
# Algorithms to compare. K-means is seeded so its panel and its score are
# repeatable between runs.
algorithms = [KMeans(n_clusters=3, n_init=10, random_state=0),
              AgglomerativeClustering(n_clusters=3), DBSCAN()]
# Random assignment to two groups, used as a reference.
random_state = np.random.RandomState(seed=0)
random_clusters = random_state.randint(low=0, high=2, size=len(X))
# NOTE(review): the adjusted Rand index below compares each clustering with
# `y`, the column at position 8, not with a known grouping - confirm that this
# is the intended reference.
axes[0].scatter(X_scaled[:, 0], X_scaled[:, 1], c=random_clusters,
                cmap=mglearn.cm3, s=60)
axes[0].set_title("Random assignment - ARI: {:.2f}".format(
    adjusted_rand_score(y, random_clusters)))
for ax, algorithm in zip(axes[1:], algorithms):
    # plot the cluster assignments of this algorithm
    clusters = algorithm.fit_predict(X_scaled)
    ax.scatter(X_scaled[:, 0], X_scaled[:, 1], c=clusters,
               cmap=mglearn.cm3, s=60)
    ax.set_title("{} - ARI: {:.2f}".format(
        algorithm.__class__.__name__, adjusted_rand_score(y, clusters)))

In [155]:
# Display the two-column array used by the clustering cells above.
X

# Cluster

In [156]:
# Three clusters on the standardised scores. The model is fitted once:
# fit_predict() already fits it, so the separate fit() call only repeated the
# work, and without a seed the two fits were not guaranteed to agree.
kmeans_new=KMeans(n_clusters=3, n_init=10, random_state=42)
clusters_new=x.copy()
clusters_new['clusters_pred']=kmeans_new.fit_predict(x_scaled)

In [157]:
# Math score (x) against reading score (y), coloured by the 3-cluster solution.
# The third positional argument of plt.scatter is the marker size, so the
# writing score is passed as `s=` explicitly (the picture is unchanged).
plt.scatter(clusters_new['math score'],clusters_new['reading score'],
            s=clusters_new['writing score'],
            c=clusters_new['clusters_pred'],cmap='rainbow')
# Label the axes with the quantities that are actually plotted.
plt.xlabel('math score')
plt.ylabel('reading score')

# Dimensionality Reduction

In [158]:
import sys
%matplotlib inline
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import scipy as sp
import sklearn
from sklearn.decomposition import PCA

In [159]:
# Reload the raw CSV for the dimensionality-reduction section.
data=pd.read_csv('/kaggle/input/students-performance-in-exams/StudentsPerformance.csv')

In [160]:
# Floor-divided mean of the three exam scores.
data['average_score']=(data['math score']+data['reading score']
                       +data['writing score'])//3
# NOTE: the result of this call is not displayed; only the last expression of
# a cell is rendered.
data.average_score.value_counts()
data.head()

In [161]:
# Convert the categorical strings to integer codes.
# Each mapping is applied to the whole frame, one after another, in this order.
for mapping in (
    {'male':1,'female':0},                                              # gender
    {'group A':0, 'group B':1, 'group C':2, 'group D':3, 'group E':4},  # race/ethnicity group
    {'free/reduced':0, 'standard':1},                                   # lunch
    {'none':0, 'completed':1},                                          # test preparation course
    {'some college':3, "associate's degree":2, 'high school':4,
     'some high school':5, "bachelor's degree":1, "master's degree":0}, # parents' education
):
    data = data.replace(mapping)

In [162]:
y=data['average_score'] # dependent variable (target)
X=data[data.columns.difference(['average_score'])] # independent variables (features), columns sorted by name
X.head()

In [163]:
labels=['Fail','Pass']
bins=[0,75,100] # 'Fail' covers [0, 75], 'Pass' covers (75, 100]
# include_lowest=True keeps an average of exactly 0 inside 'Fail'; without it
# the left edge is excluded and such a row would be labelled NaN.
y=pd.cut(y,bins,labels=labels,include_lowest=True)

In [164]:
from sklearn.preprocessing import StandardScaler
# Standardise the features before PCA.
scaler=StandardScaler()
scaler.fit(X)
X_scaled = scaler.transform(X)

# Principal component analysis of the standardised features.
from sklearn.decomposition import PCA
# keep the first two principal components of the data
pca = PCA(n_components=2)
# fit the PCA model to the standardised student features
pca.fit(X_scaled)
# transform data onto the first two principal components
X_pca = pca.transform(X_scaled)
print("Original shape: {}".format(str(X_scaled.shape)))
print("Reduced shape: {}".format(str(X_pca.shape)))


In [165]:
# Display the standardised feature array that PCA was fitted on.
X_scaled

In [166]:
# Reset y to the numeric average score (it held the Pass/Fail labels) and
# rebuild X from `data`.
y=data['average_score'] # dependent variable (target)
X=data[data.columns.difference(['average_score'])] # independent variables (features)


In [167]:
from sklearn.model_selection import train_test_split
# Shuffled 80/20 split with a fixed seed.
X_train,X_test,y_train,y_test=train_test_split(X,
                            y,test_size=0.2,shuffle=True,random_state=1)

In [168]:
from pandas.plotting import scatter_matrix# to input color
# Scatter matrix of the three exam scores in the training split, coloured by
# the training target.
score_columns = ['math score','writing score','reading score']
student_dataframe = pd.DataFrame(X_train, columns=score_columns)
matrix_options = dict(
    c=y_train,
    figsize=(15, 15),
    marker='o',
    hist_kwds={'bins': 20},
    s=60,
    alpha=.8,
    cmap=mglearn.cm3,
)
grr = scatter_matrix(student_dataframe, **matrix_options)

In [169]:
# NOTE: rebinds `c1` (previously the logistic-regression confusion matrix) to
# the encoded gender column; no later cell shown here reads it.
c1=data["gender"]

In [170]:
import seaborn as sns
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
# Reload only the three score columns and give them short names.
df = pd.read_csv('/kaggle/input/students-performance-in-exams/StudentsPerformance.csv').loc[:,
            ['math score','reading score','writing score']]
df.columns = ['Math', 'Reading', 'Writing']

# plot the original data as a 3-D scatter, one axis per subject
fig = plt.figure()
ax = fig.add_subplot(111, projection='3d')
xs = df['Math']
ys = df['Reading']
zs = df['Writing']
# Colour value per student: the sum of the three scores divided by 300.
color_control = (df['Math'] + df['Reading'] + df['Writing']) / 300
ax.scatter(xs, ys, zs, marker='o', c=color_control, cmap='inferno', edgecolors='black')
ax.set_xlabel('Math')
ax.set_ylabel('Reading')
ax.set_zlabel('Writing')
ax.set_title('scores in math, reading and writing')
plt.tight_layout()
# Writes the figure to a file named '3d' in the working directory.
plt.savefig('3d')
plt.show()

In [171]:
# Shape of the fitted component matrix (the PCA model was created with n_components=2).
print("PCA component shape: {}".format(pca.components_.shape))

In [172]:
# Component matrix of the fitted PCA model.
print("PCA components:\n{}".format(pca.components_))

In [173]:
# Drop the target column. errors='ignore' makes the cell safe to re-run once
# the column is already gone.
data=data.drop(columns=['average_score'], errors='ignore')
# PCA was fitted on data[data.columns.difference(['average_score'])], whose
# columns are sorted by name. The tick labels must follow that same order;
# using `data.columns` (file order) put the wrong feature name under most of
# the columns of the component matrix.
feature_names=data.columns.difference(['average_score'])
plt.matshow(pca.components_, cmap='viridis')
plt.yticks([0, 1], ["First component", "Second component"])
plt.colorbar()
plt.xticks(range(len(feature_names)),
           feature_names, rotation=60, ha='left')
plt.xlabel("Feature")
plt.ylabel("Principal components")