In [None]:
import numpy as np
import pandas as pd 
import seaborn as sns
import plotly.express as px
import matplotlib.pyplot as plt


import os
# Print the full path of every file found under the Kaggle input directory,
# so the dataset location used below can be checked by eye.
for folder, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))


In [None]:
# Read the students-performance CSV into `df` and preview the first rows.
csv_path = "/kaggle/input/students-performance-in-exams/StudentsPerformance.csv"
df = pd.read_csv(csv_path)
df.head()

## Data Analysis and Visualizations

In [None]:
# Summary of the raw frame: column names, non-null counts and dtypes.
df.info()

In [None]:
# Number of missing values in each column.
df.isnull().sum()

In [None]:
# Descriptive statistics, transposed so each column of `df` becomes a row.
df.describe().T

In [None]:
# Descriptive statistics restricted to the object (string) columns.
df.describe(include="object")

In [None]:
# Pairwise plots of the columns of `df`, coloured by gender.
sns.pairplot(df, hue="gender")
plt.show()

In [None]:
# Names of the categorical columns; the same list is reused later for label encoding.
df_categorical=["gender", "race/ethnicity", "parental level of education","lunch", "test preparation course"]

# Print a header naming the column, then its value counts. The counts are
# printed on their own line instead of being interpolated into the sentence.
for col in df_categorical:
    print("-------------------------------")
    print("Value counts of {}:".format(col))
    print(df[col].value_counts())

In [None]:
# Row-wise mean of the three exam scores, stored as a new "average" column.
score_columns = ["math score", "reading score", "writing score"]
df["average"] = df[score_columns].mean(axis=1)
df.head()

##### Gender and Race/Ethnicity

In [None]:
# Average score per race/ethnicity group, split by gender, drawn three ways
# (bar, violin, point) on stacked axes.
fig, axis=plt.subplots(3, figsize=(16,12))
sns.set_palette("pastel")
fig.suptitle("Average score by race/ethnicity and gender")

# Column names are resolved against `data`, so the Series do not need to be
# passed again.
for draw, ax in zip((sns.barplot, sns.violinplot, sns.pointplot), axis):
    draw(data=df, x="race/ethnicity", y="average", hue="gender", ax=ax)
plt.show()

##### Gender and Lunch

In [None]:
# Average score per lunch type, split by gender, drawn three ways
# (bar, violin, point) on stacked axes.
fig, axis=plt.subplots(3, figsize=(16,12))
sns.set_palette("pastel")
fig.suptitle("Average score by lunch and gender")

# Column names are resolved against `data`, so the Series do not need to be
# passed again.
for draw, ax in zip((sns.barplot, sns.violinplot, sns.pointplot), axis):
    draw(data=df, x="lunch", y="average", hue="gender", ax=ax)
plt.show()

##### Gender and Parental Level of Education

In [None]:
# Average score per parental level of education, split by gender, drawn three
# ways (bar, violin, point) on stacked axes.
fig, axis=plt.subplots(3, figsize=(16,12))
sns.set_palette("pastel")
fig.suptitle("Average score by parental level of education and gender")

# Column names are resolved against `data`, so the Series do not need to be
# passed again.
for draw, ax in zip((sns.barplot, sns.violinplot, sns.pointplot), axis):
    draw(data=df, x="parental level of education", y="average", hue="gender", ax=ax)
plt.show()

##### Gender and Test Preparation Course

In [None]:
# Average score per test-preparation status, split by gender, drawn three ways
# (bar, violin, point) on stacked axes.
fig, axis=plt.subplots(3, figsize=(16,12))
sns.set_palette("pastel")
fig.suptitle("Average score by test preparation course and gender")

# Column names are resolved against `data`, so the Series do not need to be
# passed again.
for draw, ax in zip((sns.barplot, sns.violinplot, sns.pointplot), axis):
    draw(data=df, x="test preparation course", y="average", hue="gender", ax=ax)
plt.show()

**What we can understand from these graphs:**

1. Parental level of education and race/ethnicity have a low impact on the average grade.
2. Females are much more successful than males in every situation.

##### We need a new column for our prediction model

In [None]:
def NewGrade(AverageScore):
    """Map an average score to a letter grade.

    Bands, checked from the top down: >=80 "A", >=70 "B", >=60 "C",
    >=50 "D", >=40 "E"; any value that meets none of them yields "F".
    """
    grade_floors = ((80, "A"), (70, "B"), (60, "C"), (50, "D"), (40, "E"))
    for floor, letter in grade_floors:
        if AverageScore >= floor:
            return letter
    return "F"
# Grade each student from the "average" column. Mapping over the single Series
# avoids building a row object per record, which DataFrame.apply(axis=1) does.
df["grade"]=df["average"].map(NewGrade)

##### Analysis of the new column

In [None]:
# Number of students per grade, split by gender. The grade categories are
# sorted so the bars do not appear in order of first occurrence.
plt.figure(figsize=(20,10))
sns.set_palette("pastel")
sns.countplot(data=df, x="grade", hue="gender", order=sorted(df["grade"].unique()))
plt.title("Students per grade by gender")
plt.show()

In [None]:
# Interactive histogram of grades coloured by gender. Passing the frame plus
# column names lets plotly label the x axis and legend from those names.
fig=px.histogram(df, x="grade", color="gender")
fig.show()

In [None]:
# Share of students in each grade, drawn as a pie chart.
grade_counts = df["grade"].value_counts()

plt.figure(figsize=(7,7))
plt.pie(
    grade_counts.values,
    labels=grade_counts.index,
    autopct="%1.1f%%",
    shadow=True,
)
plt.title("Grade Pie Chart", color="Black",fontsize=20)
plt.show()

In [None]:
# Bar chart of the average score per gender, split by parental level of education.
sns.catplot(
    data=df,
    x="gender",
    y="average",
    hue="parental level of education",
    kind="bar",
    height=5,
)

In [None]:
# For each numeric score column, plot how often each distinct value occurs.
df_numerical=["math score", "reading score", "writing score", "average"]
for col in df_numerical:
    # Name the index and the counts explicitly. The column names produced by
    # value_counts().reset_index() changed in pandas 2.0, so renaming
    # "index"/col afterwards yields two "count" columns on newer versions.
    counts=df[col].value_counts().rename_axis(col).reset_index(name="count")
    fig=px.bar(counts, x=col, y="count", color="count")
    fig.show()

In [None]:
# Sum of the three exam scores per student, stored as "total_score".
math_and_reading = df["math score"] + df["reading score"]
df["total_score"] = math_and_reading + df["writing score"]
df.head()

In [None]:
# Distribution of the total score. sns.distplot is deprecated; histplot with a
# KDE overlay on a density scale draws the same histogram-plus-curve figure.
plt.figure(figsize=(20,12))
sns.histplot(df["total_score"], kde=True, stat="density")
plt.show()

In [None]:
# Distribution of the average score. sns.distplot is deprecated; histplot with
# a KDE overlay on a density scale draws the same histogram-plus-curve figure.
plt.figure(figsize=(20,12))
sns.histplot(df["average"], kde=True, stat="density")
plt.show()

## Model Preparation

##### Label Encoding

In [None]:
from sklearn.preprocessing import LabelEncoder
# `le` stays defined because the next cell uses it to encode the grade column.
le=LabelEncoder()

# Replace each categorical column with integer codes. One fitted encoder is
# kept per column so the codes can be mapped back to the original labels with
# label_encoders[col].inverse_transform(...). Refitting a single shared
# encoder would keep only the last column's classes.
label_encoders={}
for col in df_categorical:
    label_encoders[col]=LabelEncoder()
    df[col]=label_encoders[col].fit_transform(df[col])
    

In [None]:
# Replace the letter grades with integer codes; `le` is refit on the grade
# column here, so afterwards it holds the grade classes.
df["grade"]=le.fit_transform(df["grade"])

In [None]:
# Preview the frame after label encoding.
df.head()

In [None]:
# Annotated heatmap of the pairwise correlations between all columns of `df`.
corr_matrix = df.corr()

plt.figure(figsize=(12,12))
sns.heatmap(
    corr_matrix,
    annot=True,
    linewidths=0.5,
    fmt=".2f",
    cmap="YlGnBu",
)

In [None]:
# Remove the helper column. Reassigning (rather than dropping in place) and
# ignoring a missing column means re-running this cell does not raise KeyError.
df=df.drop(columns="total_score", errors="ignore")

In [None]:
# Correlation of every remaining column with the encoded grade, as a bar chart.
# reset_index() leaves the column names in "index" and the correlations in 0.
feature_frame = df.drop(columns="grade")
grade_corr = feature_frame.corrwith(df["grade"]).reset_index()
fig = px.bar(grade_corr, x="index", y=0)
fig.show()

In [None]:
# Column names, non-null counts and dtypes after encoding and column removal.
df.info()

In [None]:
# Preview the frame that the modelling cells below are built from.
df.head()

## Model for Student Performance Prediction

In [None]:
# Features and target for the classifiers.
# `grade` was computed directly from `average`, so leaving `average` among the
# features leaks the target into the models; it is dropped with the target.
# NOTE(review): the three individual scores still determine the average, so
# accuracies stay optimistic unless they are excluded too. Confirm the
# intended feature set.
x=df.drop(columns=["grade", "average"])
y=df["grade"]
print(x.shape)
print(y.shape)

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 33% of the rows for testing; the fixed random_state keeps the split
# the same on every run.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.33,
    random_state=25,
)

In [None]:
# Shapes of the four pieces of the split, in the order
# x_train, x_test, y_train, y_test.
for part in (x_train, x_test, y_train, y_test):
    print(part.shape)

#### Standard Scaler

In [None]:
from sklearn.preprocessing import StandardScaler

# Fit the scaler on the training features only, then apply that same fitted
# transform to both the training and the test features.
ss = StandardScaler()
ss.fit(x_train)
x_train = ss.transform(x_train)
x_test = ss.transform(x_test)

#### Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression

# Fit a logistic-regression classifier with default settings, predict the test
# split and report accuracy (as a percentage) on both splits.
lr = LogisticRegression()
lr.fit(x_train, y_train)
y_pred = lr.predict(x_test)

train_accuracy = lr.score(x_train, y_train) * 100
test_accuracy = lr.score(x_test, y_test) * 100
print("Training Accuracy :", train_accuracy)
print("Testing Accuracy :", test_accuracy)

In [None]:
# confusion_matrix is imported here and reused by the model cells that follow.
from sklearn.metrics import confusion_matrix
# Confusion matrix of the test labels against the current y_pred
# (the logistic-regression predictions when the cells are run in order).
cm=confusion_matrix(y_test, y_pred)

print(cm)

#### KNN

In [None]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn import metrics

# Fit a 1-nearest-neighbour classifier (Minkowski metric), predict the test
# split, then print the confusion matrix and the accuracy on both splits.
knn = KNeighborsClassifier(n_neighbors=1, metric="minkowski")
knn.fit(x_train, y_train)
y_pred = knn.predict(x_test)

cm = confusion_matrix(y_test, y_pred)
print(cm)

train_accuracy = knn.score(x_train, y_train) * 100
test_accuracy = knn.score(x_test, y_test) * 100
print("Training Accuracy :", train_accuracy)
print("Testing Accuracy :", test_accuracy)

#### SVC

In [None]:
from sklearn.svm import SVC

# Fit a support-vector classifier with a polynomial kernel, predict the test
# split, then print the confusion matrix and the accuracy on both splits.
svc = SVC(kernel="poly")
svc.fit(x_train, y_train)
y_pred = svc.predict(x_test)

cm = confusion_matrix(y_test, y_pred)
print(cm)

train_accuracy = svc.score(x_train, y_train) * 100
test_accuracy = svc.score(x_test, y_test) * 100
print("Training Accuracy :", train_accuracy)
print("Testing Accuracy :", test_accuracy)

#### Gaussian Naive Bayes

In [None]:
from sklearn.naive_bayes import GaussianNB

# Fit a Gaussian naive Bayes classifier, predict the test split, then print
# the confusion matrix and the accuracy on both splits.
gnb = GaussianNB()
gnb.fit(x_train, y_train)

# Prediction
y_pred = gnb.predict(x_test)

cm = confusion_matrix(y_test, y_pred)
print(cm)

train_accuracy = gnb.score(x_train, y_train) * 100
test_accuracy = gnb.score(x_test, y_test) * 100
print("Training Accuracy :", train_accuracy)
print("Testing Accuracy :", test_accuracy)

#### Decision Tree

In [None]:
from sklearn.tree import DecisionTreeClassifier

# Fit an entropy-criterion decision tree, predict the test split, then print
# the confusion matrix and the accuracy on both splits.
# random_state is fixed (same seed as the train/test split) so the reported
# numbers are the same on every run.
dtc=DecisionTreeClassifier(criterion="entropy", random_state=25)
dtc.fit(x_train,y_train)
y_pred=dtc.predict(x_test)
cm=confusion_matrix(y_test,y_pred)
print(cm)
print("Training Accuracy :", dtc.score(x_train, y_train)*100)
print("Testing Accuracy :", dtc.score(x_test, y_test)*100)

#### Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Fit a 10-tree entropy-criterion random forest, predict the test split, then
# print the confusion matrix and the accuracy on both splits.
# random_state is fixed (same seed as the train/test split) so the reported
# numbers are the same on every run.
rfc=RandomForestClassifier(n_estimators=10,criterion="entropy", random_state=25)
rfc.fit(x_train,y_train)
y_pred=rfc.predict(x_test)
cm=confusion_matrix(y_test,y_pred)
print(cm)
print("Training Accuracy :", rfc.score(x_train, y_train)*100)
print("Testing Accuracy :", rfc.score(x_test, y_test)*100)