In [None]:
import numpy as np 
import pandas as pd
import seaborn as sns 
import matplotlib.pyplot as plt 
import plotly.express as px
import pandas_profiling as pp
from sklearn.linear_model import LinearRegression 
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor 
from sklearn.neighbors import KNeighborsRegressor 
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error, explained_variance_score

import os
# Print the full path of every file found under the Kaggle input directory.
input_paths = (
    os.path.join(root, fname)
    for root, _subdirs, files in os.walk('/kaggle/input')
    for fname in files
)
for path in input_paths:
    print(path)

In [None]:
# Read the dataset from the Kaggle input directory and preview the first rows.
csv_path = "/kaggle/input/students-performance-in-exams/StudentsPerformance.csv"
df = pd.read_csv(csv_path)
df.head()

In [None]:
# Show the rows whose math score is exactly zero.
zero_math_mask = df["math score"].eq(0)
df.loc[zero_math_mask]

<h1>Exploratory Data Analysis</h1>

In [None]:
# Build the automated profiling report; leaving it as the last expression renders it inline.
profile_report = pp.ProfileReport(df)
profile_report

In [None]:
# Pie chart of how the rows split across the values of the "gender" column.
gender_pie = px.pie(df, names="gender")
gender_pie

In [None]:
# Re-display the first five rows as a reminder of the column names used below.
df.head(n=5)

In [None]:
# Overlay the math-score distributions of female and male students.
# A single call with `hue` puts both groups on shared bins and adds a legend,
# so the two colours are identifiable and the bars are directly comparable.
ax = sns.histplot(
    data=df,
    x="math score",
    hue="gender",
    hue_order=["female", "male"],
    palette={"female": "red", "male": "black"},
)
ax.set_title("Math score distribution by gender");

In [None]:
# Count how many rows fall into each parental education level.
parental_education = df["parental level of education"]
parental_education.value_counts()

In [None]:
# Apply the seaborn theme before the figure is created so the figure itself picks it up.
sns.set_theme()
fig, ax = plt.subplots(figsize=(20, 6))
# Split violins: one half per gender within each parental education level.
sns.violinplot(data=df, x="parental level of education", y="math score", hue="gender", palette="muted", split=True, ax=ax)
# Pin the y-axis to a fixed 0-120 range.
ax.set_ylim(0, 120)
ax.set_title("Math score by parental level of education and gender");

In [None]:
# Apply the seaborn theme before the figure is created so the figure itself picks it up.
sns.set_theme()
fig, ax = plt.subplots(figsize=(20, 6))
# Split violins: one half per gender within each parental education level.
sns.violinplot(data=df, x="parental level of education", y="reading score", hue="gender", palette="muted", split=True, ax=ax)
# Pin the y-axis to a fixed 0-120 range.
ax.set_ylim(0, 120)
ax.set_title("Reading score by parental level of education and gender");

In [None]:
# Apply the seaborn theme before the figure is created so the figure itself picks it up.
sns.set_theme()
fig, ax = plt.subplots(figsize=(20, 6))
# Split violins: one half per gender within each parental education level.
sns.violinplot(data=df, x="parental level of education", y="writing score", hue="gender", palette="muted", split=True, ax=ax)
# Pin the y-axis to a fixed 0-120 range.
ax.set_ylim(0, 120)
ax.set_title("Writing score by parental level of education and gender");

In [None]:
# Look at the raw columns once more before they are encoded for modelling.
df.head(n=5)

<h1>Feature Engineering</h1>

In [None]:
# One-hot encode the categorical columns; drop_first=True omits the first
# level of each column, leaving it as the implicit baseline.
categorical_columns = [
    "gender",
    "race/ethnicity",
    "parental level of education",
    "lunch",
    "test preparation course",
]
model_data = pd.get_dummies(df, columns=categorical_columns, drop_first=True)

In [None]:
# Inspect the encoded frame that the models are trained on.
model_data.head(n=5)

In [None]:
# The target is the math score; all three score columns are removed from the
# inputs, so only the encoded categorical features remain as predictors.
score_columns = ["math score", "reading score", "writing score"]
X = model_data.drop(columns=score_columns)
y = model_data["math score"]

# Hold out 30% of the rows for testing, with a fixed seed for a repeatable split.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=101
)

# Fit the scaler on the training split only, then apply the same
# transformation to both splits.
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

<h1>Machine Learning Models</h1>

In [None]:
# Linear Regression
# Fit on the scaled training features, then predict the held-out test rows.
model = LinearRegression()
model.fit(X_train, y_train)
predictions = model.predict(X_test)

# Put actual scores, predictions and their difference (actual - predicted) side by side.
results = y_test.to_frame().assign(predicted_math_score=predictions)
results["residuals"] = results["math score"] - results["predicted_math_score"]
results.head()

In [None]:
# Distribution of the residuals (actual - predicted) currently stored in `results`.
# The title identifies the model; the trailing semicolon hides the Axes repr.
ax = sns.histplot(data=results, x="residuals")
ax.set_title("Linear Regression: residual distribution");

In [None]:
# Test-set error metrics for the current `predictions`.
# Labels name what is actually computed: MAE, and the square root of the MSE (RMSE).
mae = mean_absolute_error(y_test, predictions)
rmse = mean_squared_error(y_test, predictions) ** 0.5
explained_variance = explained_variance_score(y_test, predictions)
print(f'Mean Absolute Error: {mae}')
print(f'Root Mean Squared Error: {rmse}')
print(f'Explained Variance Score: {explained_variance}')

In [None]:
# Decision Tree Model
# A fixed random_state (same value as the train/test split) makes the fitted
# tree, and therefore every number reported below, repeatable across runs.
model = DecisionTreeRegressor(random_state=101).fit(X_train, y_train)
predictions = model.predict(X_test)

# Put actual scores, predictions and their difference (actual - predicted) side by side.
results = pd.DataFrame(y_test)
results["predicted_math_score"] = predictions
results["residuals"] = results["math score"] - results["predicted_math_score"]
results.head()

In [None]:
# Distribution of the residuals (actual - predicted) currently stored in `results`.
# The title identifies the model; the trailing semicolon hides the Axes repr.
ax = sns.histplot(data=results, x="residuals")
ax.set_title("Decision Tree: residual distribution");

In [None]:
# Test-set error metrics for the current `predictions`.
# Labels name what is actually computed: MAE, and the square root of the MSE (RMSE).
mae = mean_absolute_error(y_test, predictions)
rmse = mean_squared_error(y_test, predictions) ** 0.5
explained_variance = explained_variance_score(y_test, predictions)
print(f'Mean Absolute Error: {mae}')
print(f'Root Mean Squared Error: {rmse}')
print(f'Explained Variance Score: {explained_variance}')

In [None]:
# RANDOM FOREST MODEL
# Sweep the number of trees (150, 200, ..., 450) and print test-set metrics for each.
for i in range(150, 500, 50):
    # A fixed random_state makes each forest repeatable, so differences between
    # iterations come from n_estimators rather than from random variation.
    model = RandomForestRegressor(n_estimators=i, random_state=101).fit(X_train, y_train)
    predictions = model.predict(X_test)
    # Kept so `results` always reflects the most recently fitted model.
    results = pd.DataFrame(y_test)
    results["predicted_math_score"] = predictions
    # Labels name what is actually computed: MAE, and the square root of the MSE (RMSE).
    print(f'Mean Absolute Error at i = {i}: {mean_absolute_error(y_test, predictions)}')
    print(f'Root Mean Squared Error at i = {i}: {(mean_squared_error(y_test, predictions))**0.5}')
    print(f'Explained Variance Score at i = {i}: {explained_variance_score(y_test, predictions)}')
    print("\n")


In [None]:
# K Nearest Neighbors
# Sweep the neighbour count from 2 to 10 and print test-set metrics for each.
for i in range(2, 11):
    model = KNeighborsRegressor(n_neighbors=i).fit(X_train, y_train)
    predictions = model.predict(X_test)
    # Kept so `results` always reflects the most recently fitted model.
    results = pd.DataFrame(y_test)
    results["predicted_math_score"] = predictions
    # Labels name what is actually computed: MAE, and the square root of the MSE (RMSE).
    print(f'Mean Absolute Error at i = {i}: {mean_absolute_error(y_test, predictions)}')
    print(f'Root Mean Squared Error at i = {i}: {(mean_squared_error(y_test, predictions))**0.5}')
    print(f'Explained Variance Score at i = {i}: {explained_variance_score(y_test, predictions)}')
    print("\n")

<h1>Deep Learning Model</h1>

In [None]:
# Preview the scaled training features. The feature names are re-attached from
# `X` so the columns are readable, and only the first rows are displayed.
pd.DataFrame(X_train, columns=X.columns).head()

In [None]:
import tensorflow as tf
from tensorflow.keras import Sequential
from tensorflow.keras import layers

In [None]:
# Fully connected regression network: three hidden Dense layers (6, 50, 180 units),
# each followed by 30% dropout, and a single output unit for the predicted score.
# The hidden layers use ReLU: with "linear" activations the stacked Dense layers
# compose into a single affine map, so the extra layers could not model any
# non-linear relationship.
model = Sequential()

model.add(layers.Dense(6, activation="relu"))
model.add(layers.Dropout(rate=0.3))
model.add(layers.Dense(50, activation="relu"))
model.add(layers.Dropout(rate=0.3))
model.add(layers.Dense(180, activation="relu"))
model.add(layers.Dropout(rate=0.3))

# Output layer keeps the default (linear) activation, as appropriate for regression.
model.add(layers.Dense(1))

In [None]:
# Train with the Adam optimizer on mean squared error.
model.compile(optimizer="adam", loss="mse")

# Keras accepts NumPy arrays directly; tf.stack is meant for joining a list of
# tensors along a new axis, so it is not needed as a conversion step here.
# NOTE(review): the test split doubles as validation data, so the reported
# val_loss is the test loss rather than an independent validation estimate.
# The returned History object is kept so the per-epoch losses can be inspected later.
history = model.fit(
    X_train,
    y_train.to_numpy(),
    epochs=50,
    validation_data=(X_test, y_test.to_numpy()),
    batch_size=20,
)